author    Linus Torvalds <torvalds@linux-foundation.org>    2015-09-09 11:33:31 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-09-09 11:33:31 -0400
commit    26d2177e977c912863ac04f6c1a967e793ca3a56 (patch)
tree      48da04fb0b947cfa404747690d7081b657e33221
parent    a794b4f3292160bb3fd0f1f90ec8df454e3b17b3 (diff)
parent    d1178cbcdcf91900ccf10a177350d7945703c151 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma

Pull infiniband/rdma updates from Doug Ledford:
 "This is a fairly sizeable set of changes.  I've put them through a
  decent amount of testing prior to sending the pull request due to
  that.  There are still a few fixups that I know are coming, but I
  wanted to go ahead and get the big, sizable chunk into your hands
  sooner rather than waiting for those last few fixups.

  Of note is the fact that this creates what is intended to be a
  temporary area in the drivers/staging tree specifically for some
  cleanups and additions that are coming for the RDMA stack.  We
  deprecated two drivers (ipath and amso1100) and are waiting to hear
  back if we can deprecate another one (ehca).  We also put Intel's new
  hfi1 driver into this area because it needs to be refactored and a
  transfer library created out of the factored out code, and then it
  and the qib driver and the soft-roce driver should all be modified to
  use that library.

  I expect drivers/staging/rdma to be around for three or four kernel
  releases and then to go away as all of the work is completed and
  final deletions of deprecated drivers are done.

  Summary of changes for 4.3:

   - Create drivers/staging/rdma

   - Move amso1100 driver to staging/rdma and schedule for deletion

   - Move ipath driver to staging/rdma and schedule for deletion

   - Add hfi1 driver to staging/rdma and set TODO for move to regular
     tree

   - Initial support for namespaces to be used on RDMA devices

   - Add RoCE GID table handling to the RDMA core caching code

   - Infrastructure to support handling of devices with differing read
     and write scatter gather capabilities

   - Various iSER updates

   - Kill off unsafe usage of global mr registrations

   - Update SRP driver

   - Misc mlx4 driver updates

   - Support for the mr_alloc verb

   - Support for a netlink interface between kernel and user space
     cache daemon to speed path record queries and route resolution

   - Initial support for safe hot removal of verbs devices"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (136 commits)
  IB/ipoib: Suppress warning for send only join failures
  IB/ipoib: Clean up send-only multicast joins
  IB/srp: Fix possible protection fault
  IB/core: Move SM class defines from ib_mad.h to ib_smi.h
  IB/core: Remove unnecessary defines from ib_mad.h
  IB/hfi1: Add PSM2 user space header to header_install
  IB/hfi1: Add CSRs for CONFIG_SDMA_VERBOSITY
  mlx5: Fix incorrect wc pkey_index assignment for GSI messages
  IB/mlx5: avoid destroying a NULL mr in reg_user_mr error flow
  IB/uverbs: reject invalid or unknown opcodes
  IB/cxgb4: Fix if statement in pick_local_ip6adddrs
  IB/sa: Fix rdma netlink message flags
  IB/ucma: HW Device hot-removal support
  IB/mlx4_ib: Disassociate support
  IB/uverbs: Enable device removal when there are active user space applications
  IB/uverbs: Explicitly pass ib_dev to uverbs commands
  IB/uverbs: Fix race between ib_uverbs_open and remove_one
  IB/uverbs: Fix reference counting usage of event files
  IB/core: Make ib_dealloc_pd return void
  IB/srp: Create an insecure all physical rkey only if needed
  ...
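
The RoCE GID table work called out above reworks the core GID cache (drivers/infiniband/core/cache.c in the diff below) while keeping the existing lookup entry points. As a rough illustration only - the ib_device pointer is assumed to come from an ib_client add() callback and the port/index values are placeholders - a consumer of the cached GID helpers might look like this:

#include <linux/printk.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

/* Illustrative sketch: 'device' is assumed to come from an ib_client
 * add() callback; the port and index values are placeholders.
 */
static int example_gid_lookup(struct ib_device *device)
{
	union ib_gid gid;
	u8 port_num;
	u16 index;
	int ret;

	/* Read GID 0 of the device's first port from the cache. */
	ret = ib_get_cached_gid(device, rdma_start_port(device), 0, &gid);
	if (ret)
		return ret;

	/* Ask the cache which port and table slot hold that GID. */
	ret = ib_find_cached_gid(device, &gid, &port_num, &index);
	if (ret)
		return ret;

	pr_info("GID found on port %u at index %u\n", port_num, index);
	return 0;
}

ib_get_cached_gid() and ib_find_cached_gid() keep their signatures in this series; the difference is that RoCE-capable ports are now served from the new per-port GID table instead of periodic ib_query_gid() snapshots.
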
-rw-r--r--Documentation/infiniband/sysfs.txt20
-rw-r--r--MAINTAINERS9
-rw-r--r--drivers/infiniband/Kconfig2
-rw-r--r--drivers/infiniband/core/Makefile3
-rw-r--r--drivers/infiniband/core/cache.c773
-rw-r--r--drivers/infiniband/core/cm.c215
-rw-r--r--drivers/infiniband/core/cma.c657
-rw-r--r--drivers/infiniband/core/core_priv.h54
-rw-r--r--drivers/infiniband/core/device.c335
-rw-r--r--drivers/infiniband/core/mad.c28
-rw-r--r--drivers/infiniband/core/mad_priv.h1
-rw-r--r--drivers/infiniband/core/multicast.c7
-rw-r--r--drivers/infiniband/core/netlink.c55
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c728
-rw-r--r--drivers/infiniband/core/sa_query.c515
-rw-r--r--drivers/infiniband/core/sysfs.c51
-rw-r--r--drivers/infiniband/core/ucm.c9
-rw-r--r--drivers/infiniband/core/ucma.c146
-rw-r--r--drivers/infiniband/core/user_mad.c6
-rw-r--r--drivers/infiniband/core/uverbs.h16
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c147
-rw-r--r--drivers/infiniband/core/uverbs_main.c448
-rw-r--r--drivers/infiniband/core/verbs.c131
-rw-r--r--drivers/infiniband/hw/Makefile2
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.c14
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c82
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h4
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c12
-rw-r--r--drivers/infiniband/hw/cxgb4/provider.c2
-rw-r--r--drivers/infiniband/hw/mlx4/ah.c8
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c2
-rw-r--r--drivers/infiniband/hw/mlx4/mad.c23
-rw-r--r--drivers/infiniband/hw/mlx4/main.c891
-rw-r--r--drivers/infiniband/hw/mlx4/mcg.c15
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h40
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c11
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c10
-rw-r--r--drivers/infiniband/hw/mlx4/sysfs.c5
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c10
-rw-r--r--drivers/infiniband/hw/mlx5/main.c30
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h14
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c124
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c5
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c1
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c19
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma.h1
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_main.c236
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_sli.h2
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c56
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.h15
-rw-r--r--drivers/infiniband/hw/qib/qib_keys.c4
-rw-r--r--drivers/infiniband/hw/qib/qib_mad.h147
-rw-r--r--drivers/infiniband/hw/qib/qib_mr.c9
-rw-r--r--drivers/infiniband/hw/qib/qib_ruc.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.c17
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.h6
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h1
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c4
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c236
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c50
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c22
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.c91
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.h206
-rw-r--r--drivers/infiniband/ulp/iser/iser_initiator.c38
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c482
-rw-r--r--drivers/infiniband/ulp/iser/iser_verbs.c339
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.c47
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.h1
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c282
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.h25
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.c22
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.h1
-rw-r--r--drivers/net/bonding/bond_options.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_main.c36
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/intf.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fw.c22
-rw-r--r--drivers/staging/Kconfig2
-rw-r--r--drivers/staging/Makefile1
-rw-r--r--drivers/staging/rdma/Kconfig31
-rw-r--r--drivers/staging/rdma/Makefile4
-rw-r--r--drivers/staging/rdma/amso1100/Kbuild (renamed from drivers/infiniband/hw/amso1100/Kbuild)0
-rw-r--r--drivers/staging/rdma/amso1100/Kconfig (renamed from drivers/infiniband/hw/amso1100/Kconfig)0
-rw-r--r--drivers/staging/rdma/amso1100/TODO4
-rw-r--r--drivers/staging/rdma/amso1100/c2.c (renamed from drivers/infiniband/hw/amso1100/c2.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2.h (renamed from drivers/infiniband/hw/amso1100/c2.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_ae.c (renamed from drivers/infiniband/hw/amso1100/c2_ae.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_ae.h (renamed from drivers/infiniband/hw/amso1100/c2_ae.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_alloc.c (renamed from drivers/infiniband/hw/amso1100/c2_alloc.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_cm.c (renamed from drivers/infiniband/hw/amso1100/c2_cm.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_cq.c (renamed from drivers/infiniband/hw/amso1100/c2_cq.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_intr.c (renamed from drivers/infiniband/hw/amso1100/c2_intr.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_mm.c (renamed from drivers/infiniband/hw/amso1100/c2_mm.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_mq.c (renamed from drivers/infiniband/hw/amso1100/c2_mq.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_mq.h (renamed from drivers/infiniband/hw/amso1100/c2_mq.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_pd.c (renamed from drivers/infiniband/hw/amso1100/c2_pd.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_provider.c (renamed from drivers/infiniband/hw/amso1100/c2_provider.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_provider.h (renamed from drivers/infiniband/hw/amso1100/c2_provider.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_qp.c (renamed from drivers/infiniband/hw/amso1100/c2_qp.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_rnic.c (renamed from drivers/infiniband/hw/amso1100/c2_rnic.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_status.h (renamed from drivers/infiniband/hw/amso1100/c2_status.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_user.h (renamed from drivers/infiniband/hw/amso1100/c2_user.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_vq.c (renamed from drivers/infiniband/hw/amso1100/c2_vq.c)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_vq.h (renamed from drivers/infiniband/hw/amso1100/c2_vq.h)0
-rw-r--r--drivers/staging/rdma/amso1100/c2_wr.h (renamed from drivers/infiniband/hw/amso1100/c2_wr.h)0
-rw-r--r--drivers/staging/rdma/hfi1/Kconfig37
-rw-r--r--drivers/staging/rdma/hfi1/Makefile19
-rw-r--r--drivers/staging/rdma/hfi1/TODO6
-rw-r--r--drivers/staging/rdma/hfi1/chip.c10798
-rw-r--r--drivers/staging/rdma/hfi1/chip.h1035
-rw-r--r--drivers/staging/rdma/hfi1/chip_registers.h1292
-rw-r--r--drivers/staging/rdma/hfi1/common.h415
-rw-r--r--drivers/staging/rdma/hfi1/cq.c558
-rw-r--r--drivers/staging/rdma/hfi1/debugfs.c899
-rw-r--r--drivers/staging/rdma/hfi1/debugfs.h78
-rw-r--r--drivers/staging/rdma/hfi1/device.c142
-rw-r--r--drivers/staging/rdma/hfi1/device.h61
-rw-r--r--drivers/staging/rdma/hfi1/diag.c1873
-rw-r--r--drivers/staging/rdma/hfi1/dma.c186
-rw-r--r--drivers/staging/rdma/hfi1/driver.c1241
-rw-r--r--drivers/staging/rdma/hfi1/eprom.c475
-rw-r--r--drivers/staging/rdma/hfi1/eprom.h55
-rw-r--r--drivers/staging/rdma/hfi1/file_ops.c2140
-rw-r--r--drivers/staging/rdma/hfi1/firmware.c1620
-rw-r--r--drivers/staging/rdma/hfi1/hfi.h1821
-rw-r--r--drivers/staging/rdma/hfi1/init.c1722
-rw-r--r--drivers/staging/rdma/hfi1/intr.c207
-rw-r--r--drivers/staging/rdma/hfi1/iowait.h186
-rw-r--r--drivers/staging/rdma/hfi1/keys.c411
-rw-r--r--drivers/staging/rdma/hfi1/mad.c4257
-rw-r--r--drivers/staging/rdma/hfi1/mad.h325
-rw-r--r--drivers/staging/rdma/hfi1/mmap.c192
-rw-r--r--drivers/staging/rdma/hfi1/mr.c551
-rw-r--r--drivers/staging/rdma/hfi1/opa_compat.h129
-rw-r--r--drivers/staging/rdma/hfi1/pcie.c1253
-rw-r--r--drivers/staging/rdma/hfi1/pio.c1771
-rw-r--r--drivers/staging/rdma/hfi1/pio.h224
-rw-r--r--drivers/staging/rdma/hfi1/pio_copy.c858
-rw-r--r--drivers/staging/rdma/hfi1/platform_config.h286
-rw-r--r--drivers/staging/rdma/hfi1/qp.c1687
-rw-r--r--drivers/staging/rdma/hfi1/qp.h235
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.c546
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.h222
-rw-r--r--drivers/staging/rdma/hfi1/rc.c2426
-rw-r--r--drivers/staging/rdma/hfi1/ruc.c948
-rw-r--r--drivers/staging/rdma/hfi1/sdma.c2962
-rw-r--r--drivers/staging/rdma/hfi1/sdma.h1123
-rw-r--r--drivers/staging/rdma/hfi1/srq.c397
-rw-r--r--drivers/staging/rdma/hfi1/sysfs.c739
-rw-r--r--drivers/staging/rdma/hfi1/trace.c221
-rw-r--r--drivers/staging/rdma/hfi1/trace.h1409
-rw-r--r--drivers/staging/rdma/hfi1/twsi.c518
-rw-r--r--drivers/staging/rdma/hfi1/twsi.h68
-rw-r--r--drivers/staging/rdma/hfi1/uc.c585
-rw-r--r--drivers/staging/rdma/hfi1/ud.c885
-rw-r--r--drivers/staging/rdma/hfi1/user_pages.c156
-rw-r--r--drivers/staging/rdma/hfi1/user_sdma.c1444
-rw-r--r--drivers/staging/rdma/hfi1/user_sdma.h89
-rw-r--r--drivers/staging/rdma/hfi1/verbs.c2143
-rw-r--r--drivers/staging/rdma/hfi1/verbs.h1151
-rw-r--r--drivers/staging/rdma/hfi1/verbs_mcast.c385
-rw-r--r--drivers/staging/rdma/ipath/Kconfig (renamed from drivers/infiniband/hw/ipath/Kconfig)4
-rw-r--r--drivers/staging/rdma/ipath/Makefile (renamed from drivers/infiniband/hw/ipath/Makefile)0
-rw-r--r--drivers/staging/rdma/ipath/TODO5
-rw-r--r--drivers/staging/rdma/ipath/ipath_common.h (renamed from drivers/infiniband/hw/ipath/ipath_common.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_cq.c (renamed from drivers/infiniband/hw/ipath/ipath_cq.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_debug.h (renamed from drivers/infiniband/hw/ipath/ipath_debug.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_diag.c (renamed from drivers/infiniband/hw/ipath/ipath_diag.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_dma.c (renamed from drivers/infiniband/hw/ipath/ipath_dma.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_driver.c (renamed from drivers/infiniband/hw/ipath/ipath_driver.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_eeprom.c (renamed from drivers/infiniband/hw/ipath/ipath_eeprom.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_file_ops.c (renamed from drivers/infiniband/hw/ipath/ipath_file_ops.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_fs.c (renamed from drivers/infiniband/hw/ipath/ipath_fs.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_iba6110.c (renamed from drivers/infiniband/hw/ipath/ipath_iba6110.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_init_chip.c (renamed from drivers/infiniband/hw/ipath/ipath_init_chip.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_intr.c (renamed from drivers/infiniband/hw/ipath/ipath_intr.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_kernel.h (renamed from drivers/infiniband/hw/ipath/ipath_kernel.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_keys.c (renamed from drivers/infiniband/hw/ipath/ipath_keys.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_mad.c (renamed from drivers/infiniband/hw/ipath/ipath_mad.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_mmap.c (renamed from drivers/infiniband/hw/ipath/ipath_mmap.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_mr.c (renamed from drivers/infiniband/hw/ipath/ipath_mr.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_qp.c (renamed from drivers/infiniband/hw/ipath/ipath_qp.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_rc.c (renamed from drivers/infiniband/hw/ipath/ipath_rc.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_registers.h (renamed from drivers/infiniband/hw/ipath/ipath_registers.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_ruc.c (renamed from drivers/infiniband/hw/ipath/ipath_ruc.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_sdma.c (renamed from drivers/infiniband/hw/ipath/ipath_sdma.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_srq.c (renamed from drivers/infiniband/hw/ipath/ipath_srq.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_stats.c (renamed from drivers/infiniband/hw/ipath/ipath_stats.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_sysfs.c (renamed from drivers/infiniband/hw/ipath/ipath_sysfs.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_uc.c (renamed from drivers/infiniband/hw/ipath/ipath_uc.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_ud.c (renamed from drivers/infiniband/hw/ipath/ipath_ud.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_pages.c (renamed from drivers/infiniband/hw/ipath/ipath_user_pages.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_sdma.c (renamed from drivers/infiniband/hw/ipath/ipath_user_sdma.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_sdma.h (renamed from drivers/infiniband/hw/ipath/ipath_user_sdma.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.c (renamed from drivers/infiniband/hw/ipath/ipath_verbs.c)1
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.h (renamed from drivers/infiniband/hw/ipath/ipath_verbs.h)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs_mcast.c (renamed from drivers/infiniband/hw/ipath/ipath_verbs_mcast.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_wc_ppc64.c (renamed from drivers/infiniband/hw/ipath/ipath_wc_ppc64.c)0
-rw-r--r--drivers/staging/rdma/ipath/ipath_wc_x86_64.c (renamed from drivers/infiniband/hw/ipath/ipath_wc_x86_64.c)0
-rw-r--r--include/linux/mlx4/device.h3
-rw-r--r--include/linux/mlx4/driver.h1
-rw-r--r--include/linux/mlx5/device.h11
-rw-r--r--include/linux/mlx5/driver.h1
-rw-r--r--include/linux/sunrpc/svc_rdma.h1
-rw-r--r--include/net/addrconf.h31
-rw-r--r--include/net/bonding.h7
-rw-r--r--include/rdma/ib_cm.h25
-rw-r--r--include/rdma/ib_mad.h82
-rw-r--r--include/rdma/ib_pack.h2
-rw-r--r--include/rdma/ib_smi.h47
-rw-r--r--include/rdma/ib_verbs.h203
-rw-r--r--include/rdma/opa_port_info.h433
-rw-r--r--include/rdma/opa_smi.h47
-rw-r--r--include/rdma/rdma_netlink.h7
-rw-r--r--include/uapi/rdma/Kbuild1
-rw-r--r--include/uapi/rdma/hfi/Kbuild2
-rw-r--r--include/uapi/rdma/hfi/hfi1_user.h427
-rw-r--r--include/uapi/rdma/rdma_netlink.h82
-rw-r--r--net/9p/trans_rdma.c26
-rw-r--r--net/ipv6/addrconf.c31
-rw-r--r--net/rds/ib.c13
-rw-r--r--net/rds/ib.h2
-rw-r--r--net/rds/ib_cm.c4
-rw-r--r--net/rds/ib_recv.c6
-rw-r--r--net/rds/ib_send.c8
-rw-r--r--net/rds/iw.c10
-rw-r--r--net/rds/iw_rdma.c5
-rw-r--r--net/rds/iw_send.c5
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c6
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c10
-rw-r--r--net/sunrpc/xprtrdma/verbs.c2
231 files changed, 64431 insertions, 2731 deletions
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
index ddd519b72ee1..9028b025501a 100644
--- a/Documentation/infiniband/sysfs.txt
+++ b/Documentation/infiniband/sysfs.txt
@@ -64,3 +64,23 @@ MTHCA
 fw_ver - Firmware version
 hca_type - HCA type: "MT23108", "MT25208 (MT23108 compat mode)",
            or "MT25208"
+
+HFI1
+
+ The hfi1 driver also creates these additional files:
+
+ hw_rev - hardware revision
+ board_id - manufacturing board id
+ tempsense - thermal sense information
+ serial - board serial number
+ nfreectxts - number of free user contexts
+ nctxts - number of allowed contexts (PSM2)
+ chip_reset - diagnostic (root only)
+ boardversion - board version
+ ports/1/
+        CMgtA/
+              cc_settings_bin - CCA tables used by PSM2
+              cc_table_bin
+        sc2v/ - 32 files (0 - 31) used to translate sl->vl
+        sl2sc/ - 32 files (0 - 31) used to translate sl->sc
+        vl2mtu/ - 16 (0 - 15) files used to determine MTU for vl
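
The hfi1 attributes documented above are plain text files under the per-device sysfs directory (/sys/class/infiniband/<device name>/). A minimal user-space sketch of reading one of them, assuming a hypothetical instance name of "hfi1_0":

#include <stdio.h>

int main(void)
{
	/* "hfi1_0" is a hypothetical instance name used for illustration;
	 * real code would enumerate /sys/class/infiniband/ instead.
	 */
	const char *path = "/sys/class/infiniband/hfi1_0/nfreectxts";
	FILE *f = fopen(path, "r");
	int nfree;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &nfree) == 1)
		printf("free user contexts: %d\n", nfree);
	fclose(f);
	return 0;
}
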
diff --git a/MAINTAINERS b/MAINTAINERS
index d8a0aad20d6f..7635955f1c25 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5341,6 +5341,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
 S:	Supported
 F:	Documentation/infiniband/
 F:	drivers/infiniband/
+F:	drivers/staging/rdma/
 F:	include/uapi/linux/if_infiniband.h
 F:	include/uapi/rdma/
 F:	include/rdma/
@@ -5598,7 +5599,7 @@ IPATH DRIVER
 M:	Mike Marciniszyn <infinipath@intel.com>
 L:	linux-rdma@vger.kernel.org
 S:	Maintained
-F:	drivers/infiniband/hw/ipath/
+F:	drivers/staging/rdma/ipath/
 
 IPMI SUBSYSTEM
 M:	Corey Minyard <minyard@acm.org>
@@ -9976,6 +9977,12 @@ M: Arnaud Patard <arnaud.patard@rtp-net.org>
 S:	Odd Fixes
 F:	drivers/staging/xgifb/
 
+HFI1 DRIVER
+M:	Mike Marciniszyn <infinipath@intel.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/staging/rdma/hfi1
+
 STARFIRE/DURALAN NETWORK DRIVER
 M:	Ion Badulescu <ionut@badula.org>
 S:	Odd Fixes
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index b899531498eb..da4c6979fbb8 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -55,10 +55,8 @@ config INFINIBAND_ADDR_TRANS
 	default y
 
 source "drivers/infiniband/hw/mthca/Kconfig"
-source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
-source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf736764445..d43a8994ac5c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 				$(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
-				device.o fmr_pool.o cache.o netlink.o
+				device.o fmr_pool.o cache.o netlink.o \
+				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 871da832d016..8f66c67ff0df 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
37#include <linux/errno.h> 37#include <linux/errno.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/workqueue.h> 39#include <linux/workqueue.h>
40#include <linux/netdevice.h>
41#include <net/addrconf.h>
40 42
41#include <rdma/ib_cache.h> 43#include <rdma/ib_cache.h>
42 44
@@ -47,76 +49,621 @@ struct ib_pkey_cache {
47 u16 table[0]; 49 u16 table[0];
48}; 50};
49 51
50struct ib_gid_cache {
51 int table_len;
52 union ib_gid table[0];
53};
54
55struct ib_update_work { 52struct ib_update_work {
56 struct work_struct work; 53 struct work_struct work;
57 struct ib_device *device; 54 struct ib_device *device;
58 u8 port_num; 55 u8 port_num;
59}; 56};
60 57
61int ib_get_cached_gid(struct ib_device *device, 58union ib_gid zgid;
62 u8 port_num, 59EXPORT_SYMBOL(zgid);
63 int index, 60
64 union ib_gid *gid) 61static const struct ib_gid_attr zattr;
62
63enum gid_attr_find_mask {
64 GID_ATTR_FIND_MASK_GID = 1UL << 0,
65 GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
66 GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
67};
68
69enum gid_table_entry_props {
70 GID_TABLE_ENTRY_INVALID = 1UL << 0,
71 GID_TABLE_ENTRY_DEFAULT = 1UL << 1,
72};
73
74enum gid_table_write_action {
75 GID_TABLE_WRITE_ACTION_ADD,
76 GID_TABLE_WRITE_ACTION_DEL,
77 /* MODIFY only updates the GID table. Currently only used by
78 * ib_cache_update.
79 */
80 GID_TABLE_WRITE_ACTION_MODIFY
81};
82
83struct ib_gid_table_entry {
84 /* This lock protects an entry from being
85 * read and written simultaneously.
86 */
87 rwlock_t lock;
88 unsigned long props;
89 union ib_gid gid;
90 struct ib_gid_attr attr;
91 void *context;
92};
93
94struct ib_gid_table {
95 int sz;
96 /* In RoCE, adding a GID to the table requires:
97 * (a) Find if this GID is already exists.
98 * (b) Find a free space.
99 * (c) Write the new GID
100 *
101 * Delete requires different set of operations:
102 * (a) Find the GID
103 * (b) Delete it.
104 *
105 * Add/delete should be carried out atomically.
106 * This is done by locking this mutex from multiple
107 * writers. We don't need this lock for IB, as the MAD
108 * layer replaces all entries. All data_vec entries
109 * are locked by this lock.
110 **/
111 struct mutex lock;
112 struct ib_gid_table_entry *data_vec;
113};
114
115static int write_gid(struct ib_device *ib_dev, u8 port,
116 struct ib_gid_table *table, int ix,
117 const union ib_gid *gid,
118 const struct ib_gid_attr *attr,
119 enum gid_table_write_action action,
120 bool default_gid)
65{ 121{
66 struct ib_gid_cache *cache; 122 int ret = 0;
123 struct net_device *old_net_dev;
67 unsigned long flags; 124 unsigned long flags;
125
126 /* in rdma_cap_roce_gid_table, this funciton should be protected by a
127 * sleep-able lock.
128 */
129 write_lock_irqsave(&table->data_vec[ix].lock, flags);
130
131 if (rdma_cap_roce_gid_table(ib_dev, port)) {
132 table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
133 write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
134 /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
135 * RoCE providers and thus only updates the cache.
136 */
137 if (action == GID_TABLE_WRITE_ACTION_ADD)
138 ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
139 &table->data_vec[ix].context);
140 else if (action == GID_TABLE_WRITE_ACTION_DEL)
141 ret = ib_dev->del_gid(ib_dev, port, ix,
142 &table->data_vec[ix].context);
143 write_lock_irqsave(&table->data_vec[ix].lock, flags);
144 }
145
146 old_net_dev = table->data_vec[ix].attr.ndev;
147 if (old_net_dev && old_net_dev != attr->ndev)
148 dev_put(old_net_dev);
149 /* if modify_gid failed, just delete the old gid */
150 if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
151 gid = &zgid;
152 attr = &zattr;
153 table->data_vec[ix].context = NULL;
154 }
155 if (default_gid)
156 table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
157 memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
158 memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
159 if (table->data_vec[ix].attr.ndev &&
160 table->data_vec[ix].attr.ndev != old_net_dev)
161 dev_hold(table->data_vec[ix].attr.ndev);
162
163 table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
164
165 write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
166
167 if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
168 struct ib_event event;
169
170 event.device = ib_dev;
171 event.element.port_num = port;
172 event.event = IB_EVENT_GID_CHANGE;
173
174 ib_dispatch_event(&event);
175 }
176 return ret;
177}
178
179static int add_gid(struct ib_device *ib_dev, u8 port,
180 struct ib_gid_table *table, int ix,
181 const union ib_gid *gid,
182 const struct ib_gid_attr *attr,
183 bool default_gid) {
184 return write_gid(ib_dev, port, table, ix, gid, attr,
185 GID_TABLE_WRITE_ACTION_ADD, default_gid);
186}
187
188static int modify_gid(struct ib_device *ib_dev, u8 port,
189 struct ib_gid_table *table, int ix,
190 const union ib_gid *gid,
191 const struct ib_gid_attr *attr,
192 bool default_gid) {
193 return write_gid(ib_dev, port, table, ix, gid, attr,
194 GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
195}
196
197static int del_gid(struct ib_device *ib_dev, u8 port,
198 struct ib_gid_table *table, int ix,
199 bool default_gid) {
200 return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
201 GID_TABLE_WRITE_ACTION_DEL, default_gid);
202}
203
204static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
205 const struct ib_gid_attr *val, bool default_gid,
206 unsigned long mask)
207{
208 int i;
209
210 for (i = 0; i < table->sz; i++) {
211 unsigned long flags;
212 struct ib_gid_attr *attr = &table->data_vec[i].attr;
213
214 read_lock_irqsave(&table->data_vec[i].lock, flags);
215
216 if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
217 goto next;
218
219 if (mask & GID_ATTR_FIND_MASK_GID &&
220 memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
221 goto next;
222
223 if (mask & GID_ATTR_FIND_MASK_NETDEV &&
224 attr->ndev != val->ndev)
225 goto next;
226
227 if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
228 !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
229 default_gid)
230 goto next;
231
232 read_unlock_irqrestore(&table->data_vec[i].lock, flags);
233 return i;
234next:
235 read_unlock_irqrestore(&table->data_vec[i].lock, flags);
236 }
237
238 return -1;
239}
240
241static void make_default_gid(struct net_device *dev, union ib_gid *gid)
242{
243 gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
244 addrconf_ifid_eui48(&gid->raw[8], dev);
245}
246
247int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
248 union ib_gid *gid, struct ib_gid_attr *attr)
249{
250 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
251 struct ib_gid_table *table;
252 int ix;
68 int ret = 0; 253 int ret = 0;
254 struct net_device *idev;
69 255
70 if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) 256 table = ports_table[port - rdma_start_port(ib_dev)];
257
258 if (!memcmp(gid, &zgid, sizeof(*gid)))
71 return -EINVAL; 259 return -EINVAL;
72 260
73 read_lock_irqsave(&device->cache.lock, flags); 261 if (ib_dev->get_netdev) {
262 idev = ib_dev->get_netdev(ib_dev, port);
263 if (idev && attr->ndev != idev) {
264 union ib_gid default_gid;
74 265
75 cache = device->cache.gid_cache[port_num - rdma_start_port(device)]; 266 /* Adding default GIDs in not permitted */
267 make_default_gid(idev, &default_gid);
268 if (!memcmp(gid, &default_gid, sizeof(*gid))) {
269 dev_put(idev);
270 return -EPERM;
271 }
272 }
273 if (idev)
274 dev_put(idev);
275 }
76 276
77 if (index < 0 || index >= cache->table_len) 277 mutex_lock(&table->lock);
78 ret = -EINVAL;
79 else
80 *gid = cache->table[index];
81 278
82 read_unlock_irqrestore(&device->cache.lock, flags); 279 ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
280 GID_ATTR_FIND_MASK_NETDEV);
281 if (ix >= 0)
282 goto out_unlock;
83 283
284 ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
285 GID_ATTR_FIND_MASK_DEFAULT);
286 if (ix < 0) {
287 ret = -ENOSPC;
288 goto out_unlock;
289 }
290
291 add_gid(ib_dev, port, table, ix, gid, attr, false);
292
293out_unlock:
294 mutex_unlock(&table->lock);
84 return ret; 295 return ret;
85} 296}
86EXPORT_SYMBOL(ib_get_cached_gid);
87 297
88int ib_find_cached_gid(struct ib_device *device, 298int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
89 const union ib_gid *gid, 299 union ib_gid *gid, struct ib_gid_attr *attr)
90 u8 *port_num,
91 u16 *index)
92{ 300{
93 struct ib_gid_cache *cache; 301 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
302 struct ib_gid_table *table;
303 int ix;
304
305 table = ports_table[port - rdma_start_port(ib_dev)];
306
307 mutex_lock(&table->lock);
308
309 ix = find_gid(table, gid, attr, false,
310 GID_ATTR_FIND_MASK_GID |
311 GID_ATTR_FIND_MASK_NETDEV |
312 GID_ATTR_FIND_MASK_DEFAULT);
313 if (ix < 0)
314 goto out_unlock;
315
316 del_gid(ib_dev, port, table, ix, false);
317
318out_unlock:
319 mutex_unlock(&table->lock);
320 return 0;
321}
322
323int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
324 struct net_device *ndev)
325{
326 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
327 struct ib_gid_table *table;
328 int ix;
329
330 table = ports_table[port - rdma_start_port(ib_dev)];
331
332 mutex_lock(&table->lock);
333
334 for (ix = 0; ix < table->sz; ix++)
335 if (table->data_vec[ix].attr.ndev == ndev)
336 del_gid(ib_dev, port, table, ix, false);
337
338 mutex_unlock(&table->lock);
339 return 0;
340}
341
342static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
343 union ib_gid *gid, struct ib_gid_attr *attr)
344{
345 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
346 struct ib_gid_table *table;
94 unsigned long flags; 347 unsigned long flags;
95 int p, i;
96 int ret = -ENOENT;
97 348
98 *port_num = -1; 349 table = ports_table[port - rdma_start_port(ib_dev)];
99 if (index)
100 *index = -1;
101 350
102 read_lock_irqsave(&device->cache.lock, flags); 351 if (index < 0 || index >= table->sz)
352 return -EINVAL;
103 353
104 for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { 354 read_lock_irqsave(&table->data_vec[index].lock, flags);
105 cache = device->cache.gid_cache[p]; 355 if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
106 for (i = 0; i < cache->table_len; ++i) { 356 read_unlock_irqrestore(&table->data_vec[index].lock, flags);
107 if (!memcmp(gid, &cache->table[i], sizeof *gid)) { 357 return -EAGAIN;
108 *port_num = p + rdma_start_port(device); 358 }
109 if (index) 359
110 *index = i; 360 memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
111 ret = 0; 361 if (attr) {
112 goto found; 362 memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
113 } 363 if (attr->ndev)
364 dev_hold(attr->ndev);
365 }
366
367 read_unlock_irqrestore(&table->data_vec[index].lock, flags);
368 return 0;
369}
370
371static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
372 const union ib_gid *gid,
373 const struct ib_gid_attr *val,
374 unsigned long mask,
375 u8 *port, u16 *index)
376{
377 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
378 struct ib_gid_table *table;
379 u8 p;
380 int local_index;
381
382 for (p = 0; p < ib_dev->phys_port_cnt; p++) {
383 table = ports_table[p];
384 local_index = find_gid(table, gid, val, false, mask);
385 if (local_index >= 0) {
386 if (index)
387 *index = local_index;
388 if (port)
389 *port = p + rdma_start_port(ib_dev);
390 return 0;
114 } 391 }
115 } 392 }
116found:
117 read_unlock_irqrestore(&device->cache.lock, flags);
118 393
119 return ret; 394 return -ENOENT;
395}
396
397static int ib_cache_gid_find(struct ib_device *ib_dev,
398 const union ib_gid *gid,
399 struct net_device *ndev, u8 *port,
400 u16 *index)
401{
402 unsigned long mask = GID_ATTR_FIND_MASK_GID;
403 struct ib_gid_attr gid_attr_val = {.ndev = ndev};
404
405 if (ndev)
406 mask |= GID_ATTR_FIND_MASK_NETDEV;
407
408 return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
409 mask, port, index);
410}
411
412int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
413 const union ib_gid *gid,
414 u8 port, struct net_device *ndev,
415 u16 *index)
416{
417 int local_index;
418 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
419 struct ib_gid_table *table;
420 unsigned long mask = GID_ATTR_FIND_MASK_GID;
421 struct ib_gid_attr val = {.ndev = ndev};
422
423 if (port < rdma_start_port(ib_dev) ||
424 port > rdma_end_port(ib_dev))
425 return -ENOENT;
426
427 table = ports_table[port - rdma_start_port(ib_dev)];
428
429 if (ndev)
430 mask |= GID_ATTR_FIND_MASK_NETDEV;
431
432 local_index = find_gid(table, gid, &val, false, mask);
433 if (local_index >= 0) {
434 if (index)
435 *index = local_index;
436 return 0;
437 }
438
439 return -ENOENT;
440}
441
442static struct ib_gid_table *alloc_gid_table(int sz)
443{
444 unsigned int i;
445 struct ib_gid_table *table =
446 kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
447 if (!table)
448 return NULL;
449
450 table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
451 if (!table->data_vec)
452 goto err_free_table;
453
454 mutex_init(&table->lock);
455
456 table->sz = sz;
457
458 for (i = 0; i < sz; i++)
459 rwlock_init(&table->data_vec[i].lock);
460
461 return table;
462
463err_free_table:
464 kfree(table);
465 return NULL;
466}
467
468static void release_gid_table(struct ib_gid_table *table)
469{
470 if (table) {
471 kfree(table->data_vec);
472 kfree(table);
473 }
474}
475
476static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
477 struct ib_gid_table *table)
478{
479 int i;
480
481 if (!table)
482 return;
483
484 for (i = 0; i < table->sz; ++i) {
485 if (memcmp(&table->data_vec[i].gid, &zgid,
486 sizeof(table->data_vec[i].gid)))
487 del_gid(ib_dev, port, table, i,
488 table->data_vec[i].props &
489 GID_ATTR_FIND_MASK_DEFAULT);
490 }
491}
492
493void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
494 struct net_device *ndev,
495 enum ib_cache_gid_default_mode mode)
496{
497 struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
498 union ib_gid gid;
499 struct ib_gid_attr gid_attr;
500 struct ib_gid_table *table;
501 int ix;
502 union ib_gid current_gid;
503 struct ib_gid_attr current_gid_attr = {};
504
505 table = ports_table[port - rdma_start_port(ib_dev)];
506
507 make_default_gid(ndev, &gid);
508 memset(&gid_attr, 0, sizeof(gid_attr));
509 gid_attr.ndev = ndev;
510
511 ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
512
513 /* Coudn't find default GID location */
514 WARN_ON(ix < 0);
515
516 mutex_lock(&table->lock);
517 if (!__ib_cache_gid_get(ib_dev, port, ix,
518 &current_gid, &current_gid_attr) &&
519 mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
520 !memcmp(&gid, &current_gid, sizeof(gid)) &&
521 !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
522 goto unlock;
523
524 if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
525 memcmp(&current_gid_attr, &zattr,
526 sizeof(current_gid_attr))) &&
527 del_gid(ib_dev, port, table, ix, true)) {
528 pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
529 ix, gid.raw);
530 goto unlock;
531 }
532
533 if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
534 if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
535 pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
536 gid.raw);
537
538unlock:
539 if (current_gid_attr.ndev)
540 dev_put(current_gid_attr.ndev);
541 mutex_unlock(&table->lock);
542}
543
544static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
545 struct ib_gid_table *table)
546{
547 if (rdma_protocol_roce(ib_dev, port)) {
548 struct ib_gid_table_entry *entry = &table->data_vec[0];
549
550 entry->props |= GID_TABLE_ENTRY_DEFAULT;
551 }
552
553 return 0;
554}
555
556static int _gid_table_setup_one(struct ib_device *ib_dev)
557{
558 u8 port;
559 struct ib_gid_table **table;
560 int err = 0;
561
562 table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
563
564 if (!table) {
565 pr_warn("failed to allocate ib gid cache for %s\n",
566 ib_dev->name);
567 return -ENOMEM;
568 }
569
570 for (port = 0; port < ib_dev->phys_port_cnt; port++) {
571 u8 rdma_port = port + rdma_start_port(ib_dev);
572
573 table[port] =
574 alloc_gid_table(
575 ib_dev->port_immutable[rdma_port].gid_tbl_len);
576 if (!table[port]) {
577 err = -ENOMEM;
578 goto rollback_table_setup;
579 }
580
581 err = gid_table_reserve_default(ib_dev,
582 port + rdma_start_port(ib_dev),
583 table[port]);
584 if (err)
585 goto rollback_table_setup;
586 }
587
588 ib_dev->cache.gid_cache = table;
589 return 0;
590
591rollback_table_setup:
592 for (port = 0; port < ib_dev->phys_port_cnt; port++) {
593 cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
594 table[port]);
595 release_gid_table(table[port]);
596 }
597
598 kfree(table);
599 return err;
600}
601
602static void gid_table_release_one(struct ib_device *ib_dev)
603{
604 struct ib_gid_table **table = ib_dev->cache.gid_cache;
605 u8 port;
606
607 if (!table)
608 return;
609
610 for (port = 0; port < ib_dev->phys_port_cnt; port++)
611 release_gid_table(table[port]);
612
613 kfree(table);
614 ib_dev->cache.gid_cache = NULL;
615}
616
617static void gid_table_cleanup_one(struct ib_device *ib_dev)
618{
619 struct ib_gid_table **table = ib_dev->cache.gid_cache;
620 u8 port;
621
622 if (!table)
623 return;
624
625 for (port = 0; port < ib_dev->phys_port_cnt; port++)
626 cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
627 table[port]);
628}
629
630static int gid_table_setup_one(struct ib_device *ib_dev)
631{
632 int err;
633
634 err = _gid_table_setup_one(ib_dev);
635
636 if (err)
637 return err;
638
639 err = roce_rescan_device(ib_dev);
640
641 if (err) {
642 gid_table_cleanup_one(ib_dev);
643 gid_table_release_one(ib_dev);
644 }
645
646 return err;
647}
648
649int ib_get_cached_gid(struct ib_device *device,
650 u8 port_num,
651 int index,
652 union ib_gid *gid)
653{
654 if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
655 return -EINVAL;
656
657 return __ib_cache_gid_get(device, port_num, index, gid, NULL);
658}
659EXPORT_SYMBOL(ib_get_cached_gid);
660
661int ib_find_cached_gid(struct ib_device *device,
662 const union ib_gid *gid,
663 u8 *port_num,
664 u16 *index)
665{
666 return ib_cache_gid_find(device, gid, NULL, port_num, index);
120} 667}
121EXPORT_SYMBOL(ib_find_cached_gid); 668EXPORT_SYMBOL(ib_find_cached_gid);
122 669
@@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,
243{ 790{
244 struct ib_port_attr *tprops = NULL; 791 struct ib_port_attr *tprops = NULL;
245 struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; 792 struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
246 struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; 793 struct ib_gid_cache {
794 int table_len;
795 union ib_gid table[0];
796 } *gid_cache = NULL;
247 int i; 797 int i;
248 int ret; 798 int ret;
799 struct ib_gid_table *table;
800 struct ib_gid_table **ports_table = device->cache.gid_cache;
801 bool use_roce_gid_table =
802 rdma_cap_roce_gid_table(device, port);
803
804 if (port < rdma_start_port(device) || port > rdma_end_port(device))
805 return;
806
807 table = ports_table[port - rdma_start_port(device)];
249 808
250 tprops = kmalloc(sizeof *tprops, GFP_KERNEL); 809 tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
251 if (!tprops) 810 if (!tprops)
@@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,
265 824
266 pkey_cache->table_len = tprops->pkey_tbl_len; 825 pkey_cache->table_len = tprops->pkey_tbl_len;
267 826
268 gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * 827 if (!use_roce_gid_table) {
269 sizeof *gid_cache->table, GFP_KERNEL); 828 gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
270 if (!gid_cache) 829 sizeof(*gid_cache->table), GFP_KERNEL);
271 goto err; 830 if (!gid_cache)
831 goto err;
272 832
273 gid_cache->table_len = tprops->gid_tbl_len; 833 gid_cache->table_len = tprops->gid_tbl_len;
834 }
274 835
275 for (i = 0; i < pkey_cache->table_len; ++i) { 836 for (i = 0; i < pkey_cache->table_len; ++i) {
276 ret = ib_query_pkey(device, port, i, pkey_cache->table + i); 837 ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,
281 } 842 }
282 } 843 }
283 844
284 for (i = 0; i < gid_cache->table_len; ++i) { 845 if (!use_roce_gid_table) {
285 ret = ib_query_gid(device, port, i, gid_cache->table + i); 846 for (i = 0; i < gid_cache->table_len; ++i) {
286 if (ret) { 847 ret = ib_query_gid(device, port, i,
287 printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", 848 gid_cache->table + i);
288 ret, device->name, i); 849 if (ret) {
289 goto err; 850 printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
851 ret, device->name, i);
852 goto err;
853 }
290 } 854 }
291 } 855 }
292 856
293 write_lock_irq(&device->cache.lock); 857 write_lock_irq(&device->cache.lock);
294 858
295 old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; 859 old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
296 old_gid_cache = device->cache.gid_cache [port - rdma_start_port(device)];
297 860
298 device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; 861 device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
299 device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache; 862 if (!use_roce_gid_table) {
863 for (i = 0; i < gid_cache->table_len; i++) {
864 modify_gid(device, port, table, i, gid_cache->table + i,
865 &zattr, false);
866 }
867 }
300 868
301 device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; 869 device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
302 870
303 write_unlock_irq(&device->cache.lock); 871 write_unlock_irq(&device->cache.lock);
304 872
873 kfree(gid_cache);
305 kfree(old_pkey_cache); 874 kfree(old_pkey_cache);
306 kfree(old_gid_cache);
307 kfree(tprops); 875 kfree(tprops);
308 return; 876 return;
309 877
@@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
344 } 912 }
345} 913}
346 914
347static void ib_cache_setup_one(struct ib_device *device) 915int ib_cache_setup_one(struct ib_device *device)
348{ 916{
349 int p; 917 int p;
918 int err;
350 919
351 rwlock_init(&device->cache.lock); 920 rwlock_init(&device->cache.lock);
352 921
353 device->cache.pkey_cache = 922 device->cache.pkey_cache =
354 kmalloc(sizeof *device->cache.pkey_cache * 923 kzalloc(sizeof *device->cache.pkey_cache *
355 (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
356 device->cache.gid_cache =
357 kmalloc(sizeof *device->cache.gid_cache *
358 (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); 924 (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
359
360 device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * 925 device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
361 (rdma_end_port(device) - 926 (rdma_end_port(device) -
362 rdma_start_port(device) + 1), 927 rdma_start_port(device) + 1),
363 GFP_KERNEL); 928 GFP_KERNEL);
364 929 if (!device->cache.pkey_cache ||
365 if (!device->cache.pkey_cache || !device->cache.gid_cache ||
366 !device->cache.lmc_cache) { 930 !device->cache.lmc_cache) {
367 printk(KERN_WARNING "Couldn't allocate cache " 931 printk(KERN_WARNING "Couldn't allocate cache "
368 "for %s\n", device->name); 932 "for %s\n", device->name);
369 goto err; 933 return -ENOMEM;
370 } 934 }
371 935
372 for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { 936 err = gid_table_setup_one(device);
373 device->cache.pkey_cache[p] = NULL; 937 if (err)
374 device->cache.gid_cache [p] = NULL; 938 /* Allocated memory will be cleaned in the release function */
939 return err;
940
941 for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
375 ib_cache_update(device, p + rdma_start_port(device)); 942 ib_cache_update(device, p + rdma_start_port(device));
376 }
377 943
378 INIT_IB_EVENT_HANDLER(&device->cache.event_handler, 944 INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
379 device, ib_cache_event); 945 device, ib_cache_event);
380 if (ib_register_event_handler(&device->cache.event_handler)) 946 err = ib_register_event_handler(&device->cache.event_handler);
381 goto err_cache; 947 if (err)
382 948 goto err;
383 return;
384 949
385err_cache: 950 return 0;
386 for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
387 kfree(device->cache.pkey_cache[p]);
388 kfree(device->cache.gid_cache[p]);
389 }
390 951
391err: 952err:
392 kfree(device->cache.pkey_cache); 953 gid_table_cleanup_one(device);
393 kfree(device->cache.gid_cache); 954 return err;
394 kfree(device->cache.lmc_cache);
395} 955}
396 956
397static void ib_cache_cleanup_one(struct ib_device *device) 957void ib_cache_release_one(struct ib_device *device)
398{ 958{
399 int p; 959 int p;
400 960
401 ib_unregister_event_handler(&device->cache.event_handler); 961 /*
402 flush_workqueue(ib_wq); 962 * The release function frees all the cache elements.
403 963 * This function should be called as part of freeing
404 for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { 964 * all the device's resources when the cache could no
405 kfree(device->cache.pkey_cache[p]); 965 * longer be accessed.
406 kfree(device->cache.gid_cache[p]); 966 */
407 } 967 if (device->cache.pkey_cache)
408 968 for (p = 0;
969 p <= rdma_end_port(device) - rdma_start_port(device); ++p)
970 kfree(device->cache.pkey_cache[p]);
971
972 gid_table_release_one(device);
409 kfree(device->cache.pkey_cache); 973 kfree(device->cache.pkey_cache);
410 kfree(device->cache.gid_cache);
411 kfree(device->cache.lmc_cache); 974 kfree(device->cache.lmc_cache);
412} 975}
413 976
414static struct ib_client cache_client = { 977void ib_cache_cleanup_one(struct ib_device *device)
415 .name = "cache", 978{
416 .add = ib_cache_setup_one, 979 /* The cleanup function unregisters the event handler,
417 .remove = ib_cache_cleanup_one 980 * waits for all in-progress workqueue elements and cleans
418}; 981 * up the GID cache. This function should be called after
982 * the device was removed from the devices list and all
983 * clients were removed, so the cache exists but is
984 * non-functional and shouldn't be updated anymore.
985 */
986 ib_unregister_event_handler(&device->cache.event_handler);
987 flush_workqueue(ib_wq);
988 gid_table_cleanup_one(device);
989}
419 990
420int __init ib_cache_setup(void) 991void __init ib_cache_setup(void)
421{ 992{
422 return ib_register_client(&cache_client); 993 roce_gid_mgmt_init();
423} 994}
424 995
425void __exit ib_cache_cleanup(void) 996void __exit ib_cache_cleanup(void)
426{ 997{
427 ib_unregister_client(&cache_client); 998 roce_gid_mgmt_cleanup();
428} 999}
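
All writes into the new per-port GID table funnel through write_gid(), and ib_cache_gid_add()/ib_cache_gid_del() are the entry points used by the new RoCE GID management code (roce_gid_mgmt.c, added by this series). A hedged sketch of that add path, assuming the caller is core code with access to the declarations that the diffstat suggests live in core_priv.h, and with the device, port and net_device arguments treated as placeholders:

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include "core_priv.h"	/* assumed to declare ib_cache_gid_add() in this series */

/* Placeholder arguments; mirrors the link-local GID construction used
 * by make_default_gid() in the cache.c diff above.
 */
static int example_add_netdev_gid(struct ib_device *ib_dev, u8 port,
				  struct net_device *ndev)
{
	union ib_gid gid;
	struct ib_gid_attr attr = { .ndev = ndev };

	gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
	addrconf_ifid_eui48(&gid.raw[8], ndev);

	return ib_cache_gid_add(ib_dev, port, &gid, &attr);
}

In the series itself the equivalent logic is driven from network stack event handlers in roce_gid_mgmt.c rather than being called directly like this.
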
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3a972ebf3c0d..ea4db9c1d44f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
58MODULE_LICENSE("Dual BSD/GPL"); 58MODULE_LICENSE("Dual BSD/GPL");
59 59
60static void cm_add_one(struct ib_device *device); 60static void cm_add_one(struct ib_device *device);
61static void cm_remove_one(struct ib_device *device); 61static void cm_remove_one(struct ib_device *device, void *client_data);
62 62
63static struct ib_client cm_client = { 63static struct ib_client cm_client = {
64 .name = "cm", 64 .name = "cm",
@@ -213,13 +213,15 @@ struct cm_id_private {
213 spinlock_t lock; /* Do not acquire inside cm.lock */ 213 spinlock_t lock; /* Do not acquire inside cm.lock */
214 struct completion comp; 214 struct completion comp;
215 atomic_t refcount; 215 atomic_t refcount;
216 /* Number of clients sharing this ib_cm_id. Only valid for listeners.
217 * Protected by the cm.lock spinlock. */
218 int listen_sharecount;
216 219
217 struct ib_mad_send_buf *msg; 220 struct ib_mad_send_buf *msg;
218 struct cm_timewait_info *timewait_info; 221 struct cm_timewait_info *timewait_info;
219 /* todo: use alternate port on send failure */ 222 /* todo: use alternate port on send failure */
220 struct cm_av av; 223 struct cm_av av;
221 struct cm_av alt_av; 224 struct cm_av alt_av;
222 struct ib_cm_compare_data *compare_data;
223 225
224 void *private_data; 226 void *private_data;
225 __be64 tid; 227 __be64 tid;
@@ -440,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
440 return cm_id_priv; 442 return cm_id_priv;
441} 443}
442 444
443static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
444{
445 int i;
446
447 for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
448 dst[i] = src[i] & mask[i];
449}
450
451static int cm_compare_data(struct ib_cm_compare_data *src_data,
452 struct ib_cm_compare_data *dst_data)
453{
454 u32 src[IB_CM_COMPARE_SIZE];
455 u32 dst[IB_CM_COMPARE_SIZE];
456
457 if (!src_data || !dst_data)
458 return 0;
459
460 cm_mask_copy(src, src_data->data, dst_data->mask);
461 cm_mask_copy(dst, dst_data->data, src_data->mask);
462 return memcmp(src, dst, sizeof(src));
463}
464
465static int cm_compare_private_data(u32 *private_data,
466 struct ib_cm_compare_data *dst_data)
467{
468 u32 src[IB_CM_COMPARE_SIZE];
469
470 if (!dst_data)
471 return 0;
472
473 cm_mask_copy(src, private_data, dst_data->mask);
474 return memcmp(src, dst_data->data, sizeof(src));
475}
476
477/* 445/*
478 * Trivial helpers to strip endian annotation and compare; the 446 * Trivial helpers to strip endian annotation and compare; the
479 * endianness doesn't actually matter since we just need a stable 447 * endianness doesn't actually matter since we just need a stable
@@ -506,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
506 struct cm_id_private *cur_cm_id_priv; 474 struct cm_id_private *cur_cm_id_priv;
507 __be64 service_id = cm_id_priv->id.service_id; 475 __be64 service_id = cm_id_priv->id.service_id;
508 __be64 service_mask = cm_id_priv->id.service_mask; 476 __be64 service_mask = cm_id_priv->id.service_mask;
509 int data_cmp;
510 477
511 while (*link) { 478 while (*link) {
512 parent = *link; 479 parent = *link;
513 cur_cm_id_priv = rb_entry(parent, struct cm_id_private, 480 cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
514 service_node); 481 service_node);
515 data_cmp = cm_compare_data(cm_id_priv->compare_data,
516 cur_cm_id_priv->compare_data);
517 if ((cur_cm_id_priv->id.service_mask & service_id) == 482 if ((cur_cm_id_priv->id.service_mask & service_id) ==
518 (service_mask & cur_cm_id_priv->id.service_id) && 483 (service_mask & cur_cm_id_priv->id.service_id) &&
519 (cm_id_priv->id.device == cur_cm_id_priv->id.device) && 484 (cm_id_priv->id.device == cur_cm_id_priv->id.device))
520 !data_cmp)
521 return cur_cm_id_priv; 485 return cur_cm_id_priv;
522 486
523 if (cm_id_priv->id.device < cur_cm_id_priv->id.device) 487 if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -528,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
528 link = &(*link)->rb_left; 492 link = &(*link)->rb_left;
529 else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) 493 else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
530 link = &(*link)->rb_right; 494 link = &(*link)->rb_right;
531 else if (data_cmp < 0)
532 link = &(*link)->rb_left;
533 else 495 else
534 link = &(*link)->rb_right; 496 link = &(*link)->rb_right;
535 } 497 }
@@ -539,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
539} 501}
540 502
541static struct cm_id_private * cm_find_listen(struct ib_device *device, 503static struct cm_id_private * cm_find_listen(struct ib_device *device,
542 __be64 service_id, 504 __be64 service_id)
543 u32 *private_data)
544{ 505{
545 struct rb_node *node = cm.listen_service_table.rb_node; 506 struct rb_node *node = cm.listen_service_table.rb_node;
546 struct cm_id_private *cm_id_priv; 507 struct cm_id_private *cm_id_priv;
547 int data_cmp;
548 508
549 while (node) { 509 while (node) {
550 cm_id_priv = rb_entry(node, struct cm_id_private, service_node); 510 cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
551 data_cmp = cm_compare_private_data(private_data,
552 cm_id_priv->compare_data);
553 if ((cm_id_priv->id.service_mask & service_id) == 511 if ((cm_id_priv->id.service_mask & service_id) ==
554 cm_id_priv->id.service_id && 512 cm_id_priv->id.service_id &&
555 (cm_id_priv->id.device == device) && !data_cmp) 513 (cm_id_priv->id.device == device))
556 return cm_id_priv; 514 return cm_id_priv;
557 515
558 if (device < cm_id_priv->id.device) 516 if (device < cm_id_priv->id.device)
@@ -563,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
563 node = node->rb_left; 521 node = node->rb_left;
564 else if (be64_gt(service_id, cm_id_priv->id.service_id)) 522 else if (be64_gt(service_id, cm_id_priv->id.service_id))
565 node = node->rb_right; 523 node = node->rb_right;
566 else if (data_cmp < 0)
567 node = node->rb_left;
568 else 524 else
569 node = node->rb_right; 525 node = node->rb_right;
570 } 526 }
@@ -859,9 +815,15 @@ retest:
859 spin_lock_irq(&cm_id_priv->lock); 815 spin_lock_irq(&cm_id_priv->lock);
860 switch (cm_id->state) { 816 switch (cm_id->state) {
861 case IB_CM_LISTEN: 817 case IB_CM_LISTEN:
862 cm_id->state = IB_CM_IDLE;
863 spin_unlock_irq(&cm_id_priv->lock); 818 spin_unlock_irq(&cm_id_priv->lock);
819
864 spin_lock_irq(&cm.lock); 820 spin_lock_irq(&cm.lock);
821 if (--cm_id_priv->listen_sharecount > 0) {
822 /* The id is still shared. */
823 cm_deref_id(cm_id_priv);
824 spin_unlock_irq(&cm.lock);
825 return;
826 }
865 rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); 827 rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
866 spin_unlock_irq(&cm.lock); 828 spin_unlock_irq(&cm.lock);
867 break; 829 break;
@@ -930,7 +892,6 @@ retest:
930 wait_for_completion(&cm_id_priv->comp); 892 wait_for_completion(&cm_id_priv->comp);
931 while ((work = cm_dequeue_work(cm_id_priv)) != NULL) 893 while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
932 cm_free_work(work); 894 cm_free_work(work);
933 kfree(cm_id_priv->compare_data);
934 kfree(cm_id_priv->private_data); 895 kfree(cm_id_priv->private_data);
935 kfree(cm_id_priv); 896 kfree(cm_id_priv);
936} 897}
@@ -941,11 +902,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
941} 902}
942EXPORT_SYMBOL(ib_destroy_cm_id); 903EXPORT_SYMBOL(ib_destroy_cm_id);
943 904
944int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, 905/**
945 struct ib_cm_compare_data *compare_data) 906 * __ib_cm_listen - Initiates listening on the specified service ID for
907 * connection and service ID resolution requests.
908 * @cm_id: Connection identifier associated with the listen request.
909 * @service_id: Service identifier matched against incoming connection
 910 * and service ID resolution requests. The service ID should be specified
 911 * in network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
912 * assign a service ID to the caller.
913 * @service_mask: Mask applied to service ID used to listen across a
914 * range of service IDs. If set to 0, the service ID is matched
915 * exactly. This parameter is ignored if %service_id is set to
916 * IB_CM_ASSIGN_SERVICE_ID.
917 */
918static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
919 __be64 service_mask)
946{ 920{
947 struct cm_id_private *cm_id_priv, *cur_cm_id_priv; 921 struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
948 unsigned long flags;
949 int ret = 0; 922 int ret = 0;
950 923
951 service_mask = service_mask ? service_mask : ~cpu_to_be64(0); 924 service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -958,20 +931,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
958 if (cm_id->state != IB_CM_IDLE) 931 if (cm_id->state != IB_CM_IDLE)
959 return -EINVAL; 932 return -EINVAL;
960 933
961 if (compare_data) {
962 cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
963 GFP_KERNEL);
964 if (!cm_id_priv->compare_data)
965 return -ENOMEM;
966 cm_mask_copy(cm_id_priv->compare_data->data,
967 compare_data->data, compare_data->mask);
968 memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
969 sizeof(compare_data->mask));
970 }
971
972 cm_id->state = IB_CM_LISTEN; 934 cm_id->state = IB_CM_LISTEN;
935 ++cm_id_priv->listen_sharecount;
973 936
974 spin_lock_irqsave(&cm.lock, flags);
975 if (service_id == IB_CM_ASSIGN_SERVICE_ID) { 937 if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
976 cm_id->service_id = cpu_to_be64(cm.listen_service_id++); 938 cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
977 cm_id->service_mask = ~cpu_to_be64(0); 939 cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +942,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
980 cm_id->service_mask = service_mask; 942 cm_id->service_mask = service_mask;
981 } 943 }
982 cur_cm_id_priv = cm_insert_listen(cm_id_priv); 944 cur_cm_id_priv = cm_insert_listen(cm_id_priv);
983 spin_unlock_irqrestore(&cm.lock, flags);
984 945
985 if (cur_cm_id_priv) { 946 if (cur_cm_id_priv) {
986 cm_id->state = IB_CM_IDLE; 947 cm_id->state = IB_CM_IDLE;
987 kfree(cm_id_priv->compare_data); 948 --cm_id_priv->listen_sharecount;
988 cm_id_priv->compare_data = NULL;
989 ret = -EBUSY; 949 ret = -EBUSY;
990 } 950 }
991 return ret; 951 return ret;
992} 952}
953
954int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
955{
956 unsigned long flags;
957 int ret;
958
959 spin_lock_irqsave(&cm.lock, flags);
960 ret = __ib_cm_listen(cm_id, service_id, service_mask);
961 spin_unlock_irqrestore(&cm.lock, flags);
962
963 return ret;
964}
993EXPORT_SYMBOL(ib_cm_listen); 965EXPORT_SYMBOL(ib_cm_listen);
994 966
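For orientation, a minimal sketch of how a kernel ULP could use the slimmed-down ib_cm_listen() above, which now takes only a service ID and mask (the private-data compare_data argument is gone). The my_ulp_* handler, context and function names are hypothetical placeholders, not part of this patch:

        /* Hypothetical ULP listen setup against the 3-argument ib_cm_listen(). */
        static int my_ulp_listen(struct ib_device *device, __be64 service_id)
        {
                struct ib_cm_id *cm_id;
                int ret;

                cm_id = ib_create_cm_id(device, my_ulp_cm_handler, my_ulp_ctx);
                if (IS_ERR(cm_id))
                        return PTR_ERR(cm_id);

                /* A mask of 0 means "match the service ID exactly". */
                ret = ib_cm_listen(cm_id, service_id, 0);
                if (ret)
                        ib_destroy_cm_id(cm_id);
                return ret;
        }
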
967/**
 968 * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on the given service ID.
969 *
970 * If there's an existing ID listening on that same device and service ID,
971 * return it.
972 *
973 * @device: Device associated with the cm_id. All related communication will
974 * be associated with the specified device.
975 * @cm_handler: Callback invoked to notify the user of CM events.
976 * @service_id: Service identifier matched against incoming connection
 977 * and service ID resolution requests. The service ID should be specified
 978 * in network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
979 * assign a service ID to the caller.
980 *
981 * Callers should call ib_destroy_cm_id when done with the listener ID.
982 */
983struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
984 ib_cm_handler cm_handler,
985 __be64 service_id)
986{
987 struct cm_id_private *cm_id_priv;
988 struct ib_cm_id *cm_id;
989 unsigned long flags;
990 int err = 0;
991
992 /* Create an ID in advance, since the creation may sleep */
993 cm_id = ib_create_cm_id(device, cm_handler, NULL);
994 if (IS_ERR(cm_id))
995 return cm_id;
996
997 spin_lock_irqsave(&cm.lock, flags);
998
999 if (service_id == IB_CM_ASSIGN_SERVICE_ID)
1000 goto new_id;
1001
1002 /* Find an existing ID */
1003 cm_id_priv = cm_find_listen(device, service_id);
1004 if (cm_id_priv) {
1005 if (cm_id->cm_handler != cm_handler || cm_id->context) {
1006 /* Sharing an ib_cm_id with different handlers is not
1007 * supported */
1008 spin_unlock_irqrestore(&cm.lock, flags);
1009 return ERR_PTR(-EINVAL);
1010 }
1011 atomic_inc(&cm_id_priv->refcount);
1012 ++cm_id_priv->listen_sharecount;
1013 spin_unlock_irqrestore(&cm.lock, flags);
1014
1015 ib_destroy_cm_id(cm_id);
1016 cm_id = &cm_id_priv->id;
1017 return cm_id;
1018 }
1019
1020new_id:
1021 /* Use newly created ID */
1022 err = __ib_cm_listen(cm_id, service_id, 0);
1023
1024 spin_unlock_irqrestore(&cm.lock, flags);
1025
1026 if (err) {
1027 ib_destroy_cm_id(cm_id);
1028 return ERR_PTR(err);
1029 }
1030 return cm_id;
1031}
1032EXPORT_SYMBOL(ib_cm_insert_listen);
1033
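A companion sketch of the shared-listen flow that ib_cm_insert_listen() enables (again, the my_* names are hypothetical): repeated calls for the same device, service ID and handler return the existing ib_cm_id and bump listen_sharecount, and every user simply destroys the ID when done; the listener only leaves the service tree once the last sharer drops it.

        static int my_ulp_shared_listen(struct ib_device *device, __be64 svc_id)
        {
                struct ib_cm_id *a, *b;

                a = ib_cm_insert_listen(device, my_req_handler, svc_id);
                if (IS_ERR(a))
                        return PTR_ERR(a);

                /* Same device, service ID and handler: returns the existing ID
                 * and increments listen_sharecount instead of failing with -EBUSY. */
                b = ib_cm_insert_listen(device, my_req_handler, svc_id);
                if (IS_ERR(b)) {
                        ib_destroy_cm_id(a);
                        return PTR_ERR(b);
                }

                ib_destroy_cm_id(b);    /* still listening; sharecount drops to 1 */
                ib_destroy_cm_id(a);    /* last user; ID removed from the service tree */
                return 0;
        }
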
995static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, 1034static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
996 enum cm_msg_sequence msg_seq) 1035 enum cm_msg_sequence msg_seq)
997{ 1036{
@@ -1268,6 +1307,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
1268 primary_path->packet_life_time = 1307 primary_path->packet_life_time =
1269 cm_req_get_primary_local_ack_timeout(req_msg); 1308 cm_req_get_primary_local_ack_timeout(req_msg);
1270 primary_path->packet_life_time -= (primary_path->packet_life_time > 0); 1309 primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
1310 primary_path->service_id = req_msg->service_id;
1271 1311
1272 if (req_msg->alt_local_lid) { 1312 if (req_msg->alt_local_lid) {
1273 memset(alt_path, 0, sizeof *alt_path); 1313 memset(alt_path, 0, sizeof *alt_path);
@@ -1289,7 +1329,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
1289 alt_path->packet_life_time = 1329 alt_path->packet_life_time =
1290 cm_req_get_alt_local_ack_timeout(req_msg); 1330 cm_req_get_alt_local_ack_timeout(req_msg);
1291 alt_path->packet_life_time -= (alt_path->packet_life_time > 0); 1331 alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
1332 alt_path->service_id = req_msg->service_id;
1333 }
1334}
1335
1336static u16 cm_get_bth_pkey(struct cm_work *work)
1337{
1338 struct ib_device *ib_dev = work->port->cm_dev->ib_device;
1339 u8 port_num = work->port->port_num;
1340 u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
1341 u16 pkey;
1342 int ret;
1343
1344 ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
1345 if (ret) {
1346 dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
1347 port_num, pkey_index, ret);
1348 return 0;
1292 } 1349 }
1350
1351 return pkey;
1293} 1352}
1294 1353
1295static void cm_format_req_event(struct cm_work *work, 1354static void cm_format_req_event(struct cm_work *work,
@@ -1302,6 +1361,7 @@ static void cm_format_req_event(struct cm_work *work,
1302 req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; 1361 req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
1303 param = &work->cm_event.param.req_rcvd; 1362 param = &work->cm_event.param.req_rcvd;
1304 param->listen_id = listen_id; 1363 param->listen_id = listen_id;
1364 param->bth_pkey = cm_get_bth_pkey(work);
1305 param->port = cm_id_priv->av.port->port_num; 1365 param->port = cm_id_priv->av.port->port_num;
1306 param->primary_path = &work->path[0]; 1366 param->primary_path = &work->path[0];
1307 if (req_msg->alt_local_lid) 1367 if (req_msg->alt_local_lid)
@@ -1484,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
1484 1544
1485 /* Find matching listen request. */ 1545 /* Find matching listen request. */
1486 listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, 1546 listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
1487 req_msg->service_id, 1547 req_msg->service_id);
1488 req_msg->private_data);
1489 if (!listen_cm_id_priv) { 1548 if (!listen_cm_id_priv) {
1490 cm_cleanup_timewait(cm_id_priv->timewait_info); 1549 cm_cleanup_timewait(cm_id_priv->timewait_info);
1491 spin_unlock_irq(&cm.lock); 1550 spin_unlock_irq(&cm.lock);
@@ -2992,6 +3051,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
2992 param = &work->cm_event.param.sidr_req_rcvd; 3051 param = &work->cm_event.param.sidr_req_rcvd;
2993 param->pkey = __be16_to_cpu(sidr_req_msg->pkey); 3052 param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
2994 param->listen_id = listen_id; 3053 param->listen_id = listen_id;
3054 param->service_id = sidr_req_msg->service_id;
3055 param->bth_pkey = cm_get_bth_pkey(work);
2995 param->port = work->port->port_num; 3056 param->port = work->port->port_num;
2996 work->cm_event.private_data = &sidr_req_msg->private_data; 3057 work->cm_event.private_data = &sidr_req_msg->private_data;
2997} 3058}
@@ -3031,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
3031 } 3092 }
3032 cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; 3093 cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
3033 cur_cm_id_priv = cm_find_listen(cm_id->device, 3094 cur_cm_id_priv = cm_find_listen(cm_id->device,
3034 sidr_req_msg->service_id, 3095 sidr_req_msg->service_id);
3035 sidr_req_msg->private_data);
3036 if (!cur_cm_id_priv) { 3096 if (!cur_cm_id_priv) {
3037 spin_unlock_irq(&cm.lock); 3097 spin_unlock_irq(&cm.lock);
3038 cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); 3098 cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3886,9 +3946,9 @@ free:
3886 kfree(cm_dev); 3946 kfree(cm_dev);
3887} 3947}
3888 3948
3889static void cm_remove_one(struct ib_device *ib_device) 3949static void cm_remove_one(struct ib_device *ib_device, void *client_data)
3890{ 3950{
3891 struct cm_device *cm_dev; 3951 struct cm_device *cm_dev = client_data;
3892 struct cm_port *port; 3952 struct cm_port *port;
3893 struct ib_port_modify port_modify = { 3953 struct ib_port_modify port_modify = {
3894 .clr_port_cap_mask = IB_PORT_CM_SUP 3954 .clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3956,6 @@ static void cm_remove_one(struct ib_device *ib_device)
3896 unsigned long flags; 3956 unsigned long flags;
3897 int i; 3957 int i;
3898 3958
3899 cm_dev = ib_get_client_data(ib_device, &cm_client);
3900 if (!cm_dev) 3959 if (!cm_dev)
3901 return; 3960 return;
3902 3961
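The cm_remove_one() change above is one instance of the reworked ib_client interface used throughout this pull: remove() callbacks now receive their client data directly instead of fetching it with ib_get_client_data(). A hedged sketch of a client under the new signature (all my_* names and the per-device struct are hypothetical):

        struct my_client_data {
                int dummy;              /* per-device state, illustrative only */
        };

        static struct ib_client my_client;

        static void my_client_add(struct ib_device *device)
        {
                struct my_client_data *md = kzalloc(sizeof(*md), GFP_KERNEL);

                if (md)
                        ib_set_client_data(device, &my_client, md);
        }

        static void my_client_remove(struct ib_device *device, void *client_data)
        {
                /* client_data is whatever add() stored; no lookup needed. */
                kfree(client_data);
        }

        static struct ib_client my_client = {
                .name   = "my_client",
                .add    = my_client_add,
                .remove = my_client_remove,
        };
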
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 143ded2bbe7c..b1ab13f3e182 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,6 +46,8 @@
46 46
47#include <net/tcp.h> 47#include <net/tcp.h>
48#include <net/ipv6.h> 48#include <net/ipv6.h>
49#include <net/ip_fib.h>
50#include <net/ip6_route.h>
49 51
50#include <rdma/rdma_cm.h> 52#include <rdma/rdma_cm.h>
51#include <rdma/rdma_cm_ib.h> 53#include <rdma/rdma_cm_ib.h>
@@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
94EXPORT_SYMBOL(rdma_event_msg); 96EXPORT_SYMBOL(rdma_event_msg);
95 97
96static void cma_add_one(struct ib_device *device); 98static void cma_add_one(struct ib_device *device);
97static void cma_remove_one(struct ib_device *device); 99static void cma_remove_one(struct ib_device *device, void *client_data);
98 100
99static struct ib_client cma_client = { 101static struct ib_client cma_client = {
100 .name = "cma", 102 .name = "cma",
@@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);
113static DEFINE_IDR(ipoib_ps); 115static DEFINE_IDR(ipoib_ps);
114static DEFINE_IDR(ib_ps); 116static DEFINE_IDR(ib_ps);
115 117
118static struct idr *cma_idr(enum rdma_port_space ps)
119{
120 switch (ps) {
121 case RDMA_PS_TCP:
122 return &tcp_ps;
123 case RDMA_PS_UDP:
124 return &udp_ps;
125 case RDMA_PS_IPOIB:
126 return &ipoib_ps;
127 case RDMA_PS_IB:
128 return &ib_ps;
129 default:
130 return NULL;
131 }
132}
133
116struct cma_device { 134struct cma_device {
117 struct list_head list; 135 struct list_head list;
118 struct ib_device *device; 136 struct ib_device *device;
@@ -122,11 +140,33 @@ struct cma_device {
122}; 140};
123 141
124struct rdma_bind_list { 142struct rdma_bind_list {
125 struct idr *ps; 143 enum rdma_port_space ps;
126 struct hlist_head owners; 144 struct hlist_head owners;
127 unsigned short port; 145 unsigned short port;
128}; 146};
129 147
148static int cma_ps_alloc(enum rdma_port_space ps,
149 struct rdma_bind_list *bind_list, int snum)
150{
151 struct idr *idr = cma_idr(ps);
152
153 return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
154}
155
156static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
157{
158 struct idr *idr = cma_idr(ps);
159
160 return idr_find(idr, snum);
161}
162
163static void cma_ps_remove(enum rdma_port_space ps, int snum)
164{
165 struct idr *idr = cma_idr(ps);
166
167 idr_remove(idr, snum);
168}
169
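These wrappers just key the four existing idrs by enum rdma_port_space, so the bind and lookup paths further down in this diff can pass a port space instead of a struct idr pointer. Roughly, at a call site (sketch):

        /* before */  bind_list = idr_find(&tcp_ps, snum);
        /* after  */  bind_list = cma_ps_find(RDMA_PS_TCP, snum);
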
130enum { 170enum {
131 CMA_OPTION_AFONLY, 171 CMA_OPTION_AFONLY,
132}; 172};
@@ -225,6 +265,15 @@ struct cma_hdr {
225 265
226#define CMA_VERSION 0x00 266#define CMA_VERSION 0x00
227 267
268struct cma_req_info {
269 struct ib_device *device;
270 int port;
271 union ib_gid local_gid;
272 __be64 service_id;
273 u16 pkey;
274 bool has_gid:1;
275};
276
228static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) 277static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
229{ 278{
230 unsigned long flags; 279 unsigned long flags;
@@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
262 return old; 311 return old;
263} 312}
264 313
265static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) 314static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
266{ 315{
267 return hdr->ip_version >> 4; 316 return hdr->ip_version >> 4;
268} 317}
@@ -870,107 +919,397 @@ static inline int cma_any_port(struct sockaddr *addr)
870 return !cma_port(addr); 919 return !cma_port(addr);
871} 920}
872 921
873static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, 922static void cma_save_ib_info(struct sockaddr *src_addr,
923 struct sockaddr *dst_addr,
924 struct rdma_cm_id *listen_id,
874 struct ib_sa_path_rec *path) 925 struct ib_sa_path_rec *path)
875{ 926{
876 struct sockaddr_ib *listen_ib, *ib; 927 struct sockaddr_ib *listen_ib, *ib;
877 928
878 listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; 929 listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
879 ib = (struct sockaddr_ib *) &id->route.addr.src_addr; 930 if (src_addr) {
880 ib->sib_family = listen_ib->sib_family; 931 ib = (struct sockaddr_ib *)src_addr;
881 if (path) { 932 ib->sib_family = AF_IB;
882 ib->sib_pkey = path->pkey; 933 if (path) {
883 ib->sib_flowinfo = path->flow_label; 934 ib->sib_pkey = path->pkey;
884 memcpy(&ib->sib_addr, &path->sgid, 16); 935 ib->sib_flowinfo = path->flow_label;
885 } else { 936 memcpy(&ib->sib_addr, &path->sgid, 16);
886 ib->sib_pkey = listen_ib->sib_pkey; 937 ib->sib_sid = path->service_id;
887 ib->sib_flowinfo = listen_ib->sib_flowinfo; 938 ib->sib_scope_id = 0;
888 ib->sib_addr = listen_ib->sib_addr; 939 } else {
889 } 940 ib->sib_pkey = listen_ib->sib_pkey;
890 ib->sib_sid = listen_ib->sib_sid; 941 ib->sib_flowinfo = listen_ib->sib_flowinfo;
891 ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); 942 ib->sib_addr = listen_ib->sib_addr;
892 ib->sib_scope_id = listen_ib->sib_scope_id; 943 ib->sib_sid = listen_ib->sib_sid;
893 944 ib->sib_scope_id = listen_ib->sib_scope_id;
894 if (path) { 945 }
895 ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; 946 ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
896 ib->sib_family = listen_ib->sib_family; 947 }
897 ib->sib_pkey = path->pkey; 948 if (dst_addr) {
898 ib->sib_flowinfo = path->flow_label; 949 ib = (struct sockaddr_ib *)dst_addr;
899 memcpy(&ib->sib_addr, &path->dgid, 16); 950 ib->sib_family = AF_IB;
951 if (path) {
952 ib->sib_pkey = path->pkey;
953 ib->sib_flowinfo = path->flow_label;
954 memcpy(&ib->sib_addr, &path->dgid, 16);
955 }
900 } 956 }
901} 957}
902 958
903static __be16 ss_get_port(const struct sockaddr_storage *ss) 959static void cma_save_ip4_info(struct sockaddr *src_addr,
904{ 960 struct sockaddr *dst_addr,
905 if (ss->ss_family == AF_INET) 961 struct cma_hdr *hdr,
906 return ((struct sockaddr_in *)ss)->sin_port; 962 __be16 local_port)
907 else if (ss->ss_family == AF_INET6)
908 return ((struct sockaddr_in6 *)ss)->sin6_port;
909 BUG();
910}
911
912static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
913 struct cma_hdr *hdr)
914{ 963{
915 struct sockaddr_in *ip4; 964 struct sockaddr_in *ip4;
916 965
917 ip4 = (struct sockaddr_in *) &id->route.addr.src_addr; 966 if (src_addr) {
918 ip4->sin_family = AF_INET; 967 ip4 = (struct sockaddr_in *)src_addr;
919 ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; 968 ip4->sin_family = AF_INET;
920 ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr); 969 ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
970 ip4->sin_port = local_port;
971 }
921 972
922 ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr; 973 if (dst_addr) {
923 ip4->sin_family = AF_INET; 974 ip4 = (struct sockaddr_in *)dst_addr;
924 ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; 975 ip4->sin_family = AF_INET;
925 ip4->sin_port = hdr->port; 976 ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
977 ip4->sin_port = hdr->port;
978 }
926} 979}
927 980
928static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, 981static void cma_save_ip6_info(struct sockaddr *src_addr,
929 struct cma_hdr *hdr) 982 struct sockaddr *dst_addr,
983 struct cma_hdr *hdr,
984 __be16 local_port)
930{ 985{
931 struct sockaddr_in6 *ip6; 986 struct sockaddr_in6 *ip6;
932 987
933 ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr; 988 if (src_addr) {
934 ip6->sin6_family = AF_INET6; 989 ip6 = (struct sockaddr_in6 *)src_addr;
935 ip6->sin6_addr = hdr->dst_addr.ip6; 990 ip6->sin6_family = AF_INET6;
936 ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr); 991 ip6->sin6_addr = hdr->dst_addr.ip6;
992 ip6->sin6_port = local_port;
993 }
937 994
938 ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr; 995 if (dst_addr) {
939 ip6->sin6_family = AF_INET6; 996 ip6 = (struct sockaddr_in6 *)dst_addr;
940 ip6->sin6_addr = hdr->src_addr.ip6; 997 ip6->sin6_family = AF_INET6;
941 ip6->sin6_port = hdr->port; 998 ip6->sin6_addr = hdr->src_addr.ip6;
999 ip6->sin6_port = hdr->port;
1000 }
942} 1001}
943 1002
944static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, 1003static u16 cma_port_from_service_id(__be64 service_id)
945 struct ib_cm_event *ib_event)
946{ 1004{
947 struct cma_hdr *hdr; 1005 return (u16)be64_to_cpu(service_id);
1006}
948 1007
949 if (listen_id->route.addr.src_addr.ss_family == AF_IB) { 1008static int cma_save_ip_info(struct sockaddr *src_addr,
950 if (ib_event->event == IB_CM_REQ_RECEIVED) 1009 struct sockaddr *dst_addr,
951 cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); 1010 struct ib_cm_event *ib_event,
952 else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) 1011 __be64 service_id)
953 cma_save_ib_info(id, listen_id, NULL); 1012{
954 return 0; 1013 struct cma_hdr *hdr;
955 } 1014 __be16 port;
956 1015
957 hdr = ib_event->private_data; 1016 hdr = ib_event->private_data;
958 if (hdr->cma_version != CMA_VERSION) 1017 if (hdr->cma_version != CMA_VERSION)
959 return -EINVAL; 1018 return -EINVAL;
960 1019
1020 port = htons(cma_port_from_service_id(service_id));
1021
961 switch (cma_get_ip_ver(hdr)) { 1022 switch (cma_get_ip_ver(hdr)) {
962 case 4: 1023 case 4:
963 cma_save_ip4_info(id, listen_id, hdr); 1024 cma_save_ip4_info(src_addr, dst_addr, hdr, port);
964 break; 1025 break;
965 case 6: 1026 case 6:
966 cma_save_ip6_info(id, listen_id, hdr); 1027 cma_save_ip6_info(src_addr, dst_addr, hdr, port);
1028 break;
1029 default:
1030 return -EAFNOSUPPORT;
1031 }
1032
1033 return 0;
1034}
1035
1036static int cma_save_net_info(struct sockaddr *src_addr,
1037 struct sockaddr *dst_addr,
1038 struct rdma_cm_id *listen_id,
1039 struct ib_cm_event *ib_event,
1040 sa_family_t sa_family, __be64 service_id)
1041{
1042 if (sa_family == AF_IB) {
1043 if (ib_event->event == IB_CM_REQ_RECEIVED)
1044 cma_save_ib_info(src_addr, dst_addr, listen_id,
1045 ib_event->param.req_rcvd.primary_path);
1046 else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
1047 cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
1048 return 0;
1049 }
1050
1051 return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
1052}
1053
1054static int cma_save_req_info(const struct ib_cm_event *ib_event,
1055 struct cma_req_info *req)
1056{
1057 const struct ib_cm_req_event_param *req_param =
1058 &ib_event->param.req_rcvd;
1059 const struct ib_cm_sidr_req_event_param *sidr_param =
1060 &ib_event->param.sidr_req_rcvd;
1061
1062 switch (ib_event->event) {
1063 case IB_CM_REQ_RECEIVED:
1064 req->device = req_param->listen_id->device;
1065 req->port = req_param->port;
1066 memcpy(&req->local_gid, &req_param->primary_path->sgid,
1067 sizeof(req->local_gid));
1068 req->has_gid = true;
1069 req->service_id = req_param->primary_path->service_id;
1070 req->pkey = req_param->bth_pkey;
1071 break;
1072 case IB_CM_SIDR_REQ_RECEIVED:
1073 req->device = sidr_param->listen_id->device;
1074 req->port = sidr_param->port;
1075 req->has_gid = false;
1076 req->service_id = sidr_param->service_id;
1077 req->pkey = sidr_param->bth_pkey;
967 break; 1078 break;
968 default: 1079 default:
969 return -EINVAL; 1080 return -EINVAL;
970 } 1081 }
1082
971 return 0; 1083 return 0;
972} 1084}
973 1085
1086static bool validate_ipv4_net_dev(struct net_device *net_dev,
1087 const struct sockaddr_in *dst_addr,
1088 const struct sockaddr_in *src_addr)
1089{
1090 __be32 daddr = dst_addr->sin_addr.s_addr,
1091 saddr = src_addr->sin_addr.s_addr;
1092 struct fib_result res;
1093 struct flowi4 fl4;
1094 int err;
1095 bool ret;
1096
1097 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1098 ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
1099 ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
1100 ipv4_is_loopback(saddr))
1101 return false;
1102
1103 memset(&fl4, 0, sizeof(fl4));
1104 fl4.flowi4_iif = net_dev->ifindex;
1105 fl4.daddr = daddr;
1106 fl4.saddr = saddr;
1107
1108 rcu_read_lock();
1109 err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
1110 if (err)
1111 return false;
1112
1113 ret = FIB_RES_DEV(res) == net_dev;
1114 rcu_read_unlock();
1115
1116 return ret;
1117}
1118
1119static bool validate_ipv6_net_dev(struct net_device *net_dev,
1120 const struct sockaddr_in6 *dst_addr,
1121 const struct sockaddr_in6 *src_addr)
1122{
1123#if IS_ENABLED(CONFIG_IPV6)
1124 const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
1125 IPV6_ADDR_LINKLOCAL;
1126 struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
1127 &src_addr->sin6_addr, net_dev->ifindex,
1128 strict);
1129 bool ret;
1130
1131 if (!rt)
1132 return false;
1133
1134 ret = rt->rt6i_idev->dev == net_dev;
1135 ip6_rt_put(rt);
1136
1137 return ret;
1138#else
1139 return false;
1140#endif
1141}
1142
1143static bool validate_net_dev(struct net_device *net_dev,
1144 const struct sockaddr *daddr,
1145 const struct sockaddr *saddr)
1146{
1147 const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
1148 const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
1149 const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
1150 const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
1151
1152 switch (daddr->sa_family) {
1153 case AF_INET:
1154 return saddr->sa_family == AF_INET &&
1155 validate_ipv4_net_dev(net_dev, daddr4, saddr4);
1156
1157 case AF_INET6:
1158 return saddr->sa_family == AF_INET6 &&
1159 validate_ipv6_net_dev(net_dev, daddr6, saddr6);
1160
1161 default:
1162 return false;
1163 }
1164}
1165
1166static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
1167 const struct cma_req_info *req)
1168{
1169 struct sockaddr_storage listen_addr_storage, src_addr_storage;
1170 struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
1171 *src_addr = (struct sockaddr *)&src_addr_storage;
1172 struct net_device *net_dev;
1173 const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
1174 int err;
1175
1176 err = cma_save_ip_info(listen_addr, src_addr, ib_event,
1177 req->service_id);
1178 if (err)
1179 return ERR_PTR(err);
1180
1181 net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
1182 gid, listen_addr);
1183 if (!net_dev)
1184 return ERR_PTR(-ENODEV);
1185
1186 if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
1187 dev_put(net_dev);
1188 return ERR_PTR(-EHOSTUNREACH);
1189 }
1190
1191 return net_dev;
1192}
1193
1194static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
1195{
1196 return (be64_to_cpu(service_id) >> 16) & 0xffff;
1197}
1198
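These two helpers undo the encoding rdma_cm uses when it builds an IB service ID for the IP port spaces: the rdma_port_space value sits in bits 16-31 and the port number in bits 0-15, matching the composition done by rdma_get_service_id(). A tiny illustrative check, assuming it lived next to these helpers in cma.c (the port number is arbitrary):

        static void example_service_id_decode(void)
        {
                __be64 sid = cpu_to_be64(((u64)RDMA_PS_TCP << 16) + 5000);

                WARN_ON(rdma_ps_from_service_id(sid) != RDMA_PS_TCP);
                WARN_ON(cma_port_from_service_id(sid) != 5000);
        }
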
1199static bool cma_match_private_data(struct rdma_id_private *id_priv,
1200 const struct cma_hdr *hdr)
1201{
1202 struct sockaddr *addr = cma_src_addr(id_priv);
1203 __be32 ip4_addr;
1204 struct in6_addr ip6_addr;
1205
1206 if (cma_any_addr(addr) && !id_priv->afonly)
1207 return true;
1208
1209 switch (addr->sa_family) {
1210 case AF_INET:
1211 ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
1212 if (cma_get_ip_ver(hdr) != 4)
1213 return false;
1214 if (!cma_any_addr(addr) &&
1215 hdr->dst_addr.ip4.addr != ip4_addr)
1216 return false;
1217 break;
1218 case AF_INET6:
1219 ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
1220 if (cma_get_ip_ver(hdr) != 6)
1221 return false;
1222 if (!cma_any_addr(addr) &&
1223 memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
1224 return false;
1225 break;
1226 case AF_IB:
1227 return true;
1228 default:
1229 return false;
1230 }
1231
1232 return true;
1233}
1234
1235static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
1236 const struct net_device *net_dev)
1237{
1238 const struct rdma_addr *addr = &id_priv->id.route.addr;
1239
1240 if (!net_dev)
1241 /* This request is an AF_IB request */
1242 return addr->src_addr.ss_family == AF_IB;
1243
1244 return !addr->dev_addr.bound_dev_if ||
1245 (net_eq(dev_net(net_dev), &init_net) &&
1246 addr->dev_addr.bound_dev_if == net_dev->ifindex);
1247}
1248
1249static struct rdma_id_private *cma_find_listener(
1250 const struct rdma_bind_list *bind_list,
1251 const struct ib_cm_id *cm_id,
1252 const struct ib_cm_event *ib_event,
1253 const struct cma_req_info *req,
1254 const struct net_device *net_dev)
1255{
1256 struct rdma_id_private *id_priv, *id_priv_dev;
1257
1258 if (!bind_list)
1259 return ERR_PTR(-EINVAL);
1260
1261 hlist_for_each_entry(id_priv, &bind_list->owners, node) {
1262 if (cma_match_private_data(id_priv, ib_event->private_data)) {
1263 if (id_priv->id.device == cm_id->device &&
1264 cma_match_net_dev(id_priv, net_dev))
1265 return id_priv;
1266 list_for_each_entry(id_priv_dev,
1267 &id_priv->listen_list,
1268 listen_list) {
1269 if (id_priv_dev->id.device == cm_id->device &&
1270 cma_match_net_dev(id_priv_dev, net_dev))
1271 return id_priv_dev;
1272 }
1273 }
1274 }
1275
1276 return ERR_PTR(-EINVAL);
1277}
1278
1279static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
1280 struct ib_cm_event *ib_event,
1281 struct net_device **net_dev)
1282{
1283 struct cma_req_info req;
1284 struct rdma_bind_list *bind_list;
1285 struct rdma_id_private *id_priv;
1286 int err;
1287
1288 err = cma_save_req_info(ib_event, &req);
1289 if (err)
1290 return ERR_PTR(err);
1291
1292 *net_dev = cma_get_net_dev(ib_event, &req);
1293 if (IS_ERR(*net_dev)) {
1294 if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
1295 /* Assuming the protocol is AF_IB */
1296 *net_dev = NULL;
1297 } else {
1298 return ERR_CAST(*net_dev);
1299 }
1300 }
1301
1302 bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
1303 cma_port_from_service_id(req.service_id));
1304 id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
1305 if (IS_ERR(id_priv)) {
1306 dev_put(*net_dev);
1307 *net_dev = NULL;
1308 }
1309
1310 return id_priv;
1311}
1312
974static inline int cma_user_data_offset(struct rdma_id_private *id_priv) 1313static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
975{ 1314{
976 return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); 1315 return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1038,7 +1377,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
1038 mutex_lock(&lock); 1377 mutex_lock(&lock);
1039 hlist_del(&id_priv->node); 1378 hlist_del(&id_priv->node);
1040 if (hlist_empty(&bind_list->owners)) { 1379 if (hlist_empty(&bind_list->owners)) {
1041 idr_remove(bind_list->ps, bind_list->port); 1380 cma_ps_remove(bind_list->ps, bind_list->port);
1042 kfree(bind_list); 1381 kfree(bind_list);
1043 } 1382 }
1044 mutex_unlock(&lock); 1383 mutex_unlock(&lock);
@@ -1216,11 +1555,15 @@ out:
1216} 1555}
1217 1556
1218static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, 1557static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
1219 struct ib_cm_event *ib_event) 1558 struct ib_cm_event *ib_event,
1559 struct net_device *net_dev)
1220{ 1560{
1221 struct rdma_id_private *id_priv; 1561 struct rdma_id_private *id_priv;
1222 struct rdma_cm_id *id; 1562 struct rdma_cm_id *id;
1223 struct rdma_route *rt; 1563 struct rdma_route *rt;
1564 const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
1565 const __be64 service_id =
1566 ib_event->param.req_rcvd.primary_path->service_id;
1224 int ret; 1567 int ret;
1225 1568
1226 id = rdma_create_id(listen_id->event_handler, listen_id->context, 1569 id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1572,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
1229 return NULL; 1572 return NULL;
1230 1573
1231 id_priv = container_of(id, struct rdma_id_private, id); 1574 id_priv = container_of(id, struct rdma_id_private, id);
1232 if (cma_save_net_info(id, listen_id, ib_event)) 1575 if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
1576 (struct sockaddr *)&id->route.addr.dst_addr,
1577 listen_id, ib_event, ss_family, service_id))
1233 goto err; 1578 goto err;
1234 1579
1235 rt = &id->route; 1580 rt = &id->route;
@@ -1243,14 +1588,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
1243 if (rt->num_paths == 2) 1588 if (rt->num_paths == 2)
1244 rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; 1589 rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
1245 1590
1246 if (cma_any_addr(cma_src_addr(id_priv))) { 1591 if (net_dev) {
1247 rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; 1592 ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
1248 rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
1249 ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
1250 } else {
1251 ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
1252 if (ret) 1593 if (ret)
1253 goto err; 1594 goto err;
1595 } else {
1596 /* An AF_IB connection */
1597 WARN_ON_ONCE(ss_family != AF_IB);
1598
1599 cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv),
1600 &rt->addr.dev_addr);
1254 } 1601 }
1255 rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); 1602 rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
1256 1603
@@ -1263,10 +1610,12 @@ err:
1263} 1610}
1264 1611
1265static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, 1612static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
1266 struct ib_cm_event *ib_event) 1613 struct ib_cm_event *ib_event,
1614 struct net_device *net_dev)
1267{ 1615{
1268 struct rdma_id_private *id_priv; 1616 struct rdma_id_private *id_priv;
1269 struct rdma_cm_id *id; 1617 struct rdma_cm_id *id;
1618 const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
1270 int ret; 1619 int ret;
1271 1620
1272 id = rdma_create_id(listen_id->event_handler, listen_id->context, 1621 id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,13 +1624,24 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
1275 return NULL; 1624 return NULL;
1276 1625
1277 id_priv = container_of(id, struct rdma_id_private, id); 1626 id_priv = container_of(id, struct rdma_id_private, id);
1278 if (cma_save_net_info(id, listen_id, ib_event)) 1627 if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
1628 (struct sockaddr *)&id->route.addr.dst_addr,
1629 listen_id, ib_event, ss_family,
1630 ib_event->param.sidr_req_rcvd.service_id))
1279 goto err; 1631 goto err;
1280 1632
1281 if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { 1633 if (net_dev) {
1282 ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); 1634 ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
1283 if (ret) 1635 if (ret)
1284 goto err; 1636 goto err;
1637 } else {
1638 /* An AF_IB connection */
1639 WARN_ON_ONCE(ss_family != AF_IB);
1640
1641 if (!cma_any_addr(cma_src_addr(id_priv)))
1642 cma_translate_ib((struct sockaddr_ib *)
1643 cma_src_addr(id_priv),
1644 &id->route.addr.dev_addr);
1285 } 1645 }
1286 1646
1287 id_priv->state = RDMA_CM_CONNECT; 1647 id_priv->state = RDMA_CM_CONNECT;
@@ -1319,25 +1679,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
1319{ 1679{
1320 struct rdma_id_private *listen_id, *conn_id; 1680 struct rdma_id_private *listen_id, *conn_id;
1321 struct rdma_cm_event event; 1681 struct rdma_cm_event event;
1682 struct net_device *net_dev;
1322 int offset, ret; 1683 int offset, ret;
1323 1684
1324 listen_id = cm_id->context; 1685 listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
1325 if (!cma_check_req_qp_type(&listen_id->id, ib_event)) 1686 if (IS_ERR(listen_id))
1326 return -EINVAL; 1687 return PTR_ERR(listen_id);
1327 1688
1328 if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) 1689 if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
1329 return -ECONNABORTED; 1690 ret = -EINVAL;
1691 goto net_dev_put;
1692 }
1693
1694 if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
1695 ret = -ECONNABORTED;
1696 goto net_dev_put;
1697 }
1330 1698
1331 memset(&event, 0, sizeof event); 1699 memset(&event, 0, sizeof event);
1332 offset = cma_user_data_offset(listen_id); 1700 offset = cma_user_data_offset(listen_id);
1333 event.event = RDMA_CM_EVENT_CONNECT_REQUEST; 1701 event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
1334 if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { 1702 if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
1335 conn_id = cma_new_udp_id(&listen_id->id, ib_event); 1703 conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
1336 event.param.ud.private_data = ib_event->private_data + offset; 1704 event.param.ud.private_data = ib_event->private_data + offset;
1337 event.param.ud.private_data_len = 1705 event.param.ud.private_data_len =
1338 IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; 1706 IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
1339 } else { 1707 } else {
1340 conn_id = cma_new_conn_id(&listen_id->id, ib_event); 1708 conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
1341 cma_set_req_event_data(&event, &ib_event->param.req_rcvd, 1709 cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
1342 ib_event->private_data, offset); 1710 ib_event->private_data, offset);
1343 } 1711 }
@@ -1375,6 +1743,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
1375 mutex_unlock(&conn_id->handler_mutex); 1743 mutex_unlock(&conn_id->handler_mutex);
1376 mutex_unlock(&listen_id->handler_mutex); 1744 mutex_unlock(&listen_id->handler_mutex);
1377 cma_deref_id(conn_id); 1745 cma_deref_id(conn_id);
1746 if (net_dev)
1747 dev_put(net_dev);
1378 return 0; 1748 return 0;
1379 1749
1380err3: 1750err3:
@@ -1388,6 +1758,11 @@ err1:
1388 mutex_unlock(&listen_id->handler_mutex); 1758 mutex_unlock(&listen_id->handler_mutex);
1389 if (conn_id) 1759 if (conn_id)
1390 rdma_destroy_id(&conn_id->id); 1760 rdma_destroy_id(&conn_id->id);
1761
1762net_dev_put:
1763 if (net_dev)
1764 dev_put(net_dev);
1765
1391 return ret; 1766 return ret;
1392} 1767}
1393 1768
@@ -1400,42 +1775,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
1400} 1775}
1401EXPORT_SYMBOL(rdma_get_service_id); 1776EXPORT_SYMBOL(rdma_get_service_id);
1402 1777
1403static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
1404 struct ib_cm_compare_data *compare)
1405{
1406 struct cma_hdr *cma_data, *cma_mask;
1407 __be32 ip4_addr;
1408 struct in6_addr ip6_addr;
1409
1410 memset(compare, 0, sizeof *compare);
1411 cma_data = (void *) compare->data;
1412 cma_mask = (void *) compare->mask;
1413
1414 switch (addr->sa_family) {
1415 case AF_INET:
1416 ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
1417 cma_set_ip_ver(cma_data, 4);
1418 cma_set_ip_ver(cma_mask, 0xF);
1419 if (!cma_any_addr(addr)) {
1420 cma_data->dst_addr.ip4.addr = ip4_addr;
1421 cma_mask->dst_addr.ip4.addr = htonl(~0);
1422 }
1423 break;
1424 case AF_INET6:
1425 ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
1426 cma_set_ip_ver(cma_data, 6);
1427 cma_set_ip_ver(cma_mask, 0xF);
1428 if (!cma_any_addr(addr)) {
1429 cma_data->dst_addr.ip6 = ip6_addr;
1430 memset(&cma_mask->dst_addr.ip6, 0xFF,
1431 sizeof cma_mask->dst_addr.ip6);
1432 }
1433 break;
1434 default:
1435 break;
1436 }
1437}
1438
1439static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) 1778static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
1440{ 1779{
1441 struct rdma_id_private *id_priv = iw_id->context; 1780 struct rdma_id_private *id_priv = iw_id->context;
@@ -1589,33 +1928,18 @@ out:
1589 1928
1590static int cma_ib_listen(struct rdma_id_private *id_priv) 1929static int cma_ib_listen(struct rdma_id_private *id_priv)
1591{ 1930{
1592 struct ib_cm_compare_data compare_data;
1593 struct sockaddr *addr; 1931 struct sockaddr *addr;
1594 struct ib_cm_id *id; 1932 struct ib_cm_id *id;
1595 __be64 svc_id; 1933 __be64 svc_id;
1596 int ret;
1597 1934
1598 id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); 1935 addr = cma_src_addr(id_priv);
1936 svc_id = rdma_get_service_id(&id_priv->id, addr);
1937 id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
1599 if (IS_ERR(id)) 1938 if (IS_ERR(id))
1600 return PTR_ERR(id); 1939 return PTR_ERR(id);
1601
1602 id_priv->cm_id.ib = id; 1940 id_priv->cm_id.ib = id;
1603 1941
1604 addr = cma_src_addr(id_priv); 1942 return 0;
1605 svc_id = rdma_get_service_id(&id_priv->id, addr);
1606 if (cma_any_addr(addr) && !id_priv->afonly)
1607 ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
1608 else {
1609 cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
1610 ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
1611 }
1612
1613 if (ret) {
1614 ib_destroy_cm_id(id_priv->cm_id.ib);
1615 id_priv->cm_id.ib = NULL;
1616 }
1617
1618 return ret;
1619} 1943}
1620 1944
1621static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) 1945static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
@@ -2203,8 +2527,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
2203 src_addr = (struct sockaddr *) &id->route.addr.src_addr; 2527 src_addr = (struct sockaddr *) &id->route.addr.src_addr;
2204 src_addr->sa_family = dst_addr->sa_family; 2528 src_addr->sa_family = dst_addr->sa_family;
2205 if (dst_addr->sa_family == AF_INET6) { 2529 if (dst_addr->sa_family == AF_INET6) {
2206 ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = 2530 struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
2207 ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; 2531 struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
2532 src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
2533 if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
2534 id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
2208 } else if (dst_addr->sa_family == AF_IB) { 2535 } else if (dst_addr->sa_family == AF_IB) {
2209 ((struct sockaddr_ib *) src_addr)->sib_pkey = 2536 ((struct sockaddr_ib *) src_addr)->sib_pkey =
2210 ((struct sockaddr_ib *) dst_addr)->sib_pkey; 2537 ((struct sockaddr_ib *) dst_addr)->sib_pkey;
@@ -2325,8 +2652,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
2325 hlist_add_head(&id_priv->node, &bind_list->owners); 2652 hlist_add_head(&id_priv->node, &bind_list->owners);
2326} 2653}
2327 2654
2328static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, 2655static int cma_alloc_port(enum rdma_port_space ps,
2329 unsigned short snum) 2656 struct rdma_id_private *id_priv, unsigned short snum)
2330{ 2657{
2331 struct rdma_bind_list *bind_list; 2658 struct rdma_bind_list *bind_list;
2332 int ret; 2659 int ret;
@@ -2335,7 +2662,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
2335 if (!bind_list) 2662 if (!bind_list)
2336 return -ENOMEM; 2663 return -ENOMEM;
2337 2664
2338 ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL); 2665 ret = cma_ps_alloc(ps, bind_list, snum);
2339 if (ret < 0) 2666 if (ret < 0)
2340 goto err; 2667 goto err;
2341 2668
@@ -2348,7 +2675,8 @@ err:
2348 return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; 2675 return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
2349} 2676}
2350 2677
2351static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) 2678static int cma_alloc_any_port(enum rdma_port_space ps,
2679 struct rdma_id_private *id_priv)
2352{ 2680{
2353 static unsigned int last_used_port; 2681 static unsigned int last_used_port;
2354 int low, high, remaining; 2682 int low, high, remaining;
@@ -2359,7 +2687,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
2359 rover = prandom_u32() % remaining + low; 2687 rover = prandom_u32() % remaining + low;
2360retry: 2688retry:
2361 if (last_used_port != rover && 2689 if (last_used_port != rover &&
2362 !idr_find(ps, (unsigned short) rover)) { 2690 !cma_ps_find(ps, (unsigned short)rover)) {
2363 int ret = cma_alloc_port(ps, id_priv, rover); 2691 int ret = cma_alloc_port(ps, id_priv, rover);
2364 /* 2692 /*
2365 * Remember previously used port number in order to avoid 2693 * Remember previously used port number in order to avoid
@@ -2414,7 +2742,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
2414 return 0; 2742 return 0;
2415} 2743}
2416 2744
2417static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) 2745static int cma_use_port(enum rdma_port_space ps,
2746 struct rdma_id_private *id_priv)
2418{ 2747{
2419 struct rdma_bind_list *bind_list; 2748 struct rdma_bind_list *bind_list;
2420 unsigned short snum; 2749 unsigned short snum;
@@ -2424,7 +2753,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
2424 if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) 2753 if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
2425 return -EACCES; 2754 return -EACCES;
2426 2755
2427 bind_list = idr_find(ps, snum); 2756 bind_list = cma_ps_find(ps, snum);
2428 if (!bind_list) { 2757 if (!bind_list) {
2429 ret = cma_alloc_port(ps, id_priv, snum); 2758 ret = cma_alloc_port(ps, id_priv, snum);
2430 } else { 2759 } else {
@@ -2447,25 +2776,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
2447 return ret; 2776 return ret;
2448} 2777}
2449 2778
2450static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) 2779static enum rdma_port_space cma_select_inet_ps(
2780 struct rdma_id_private *id_priv)
2451{ 2781{
2452 switch (id_priv->id.ps) { 2782 switch (id_priv->id.ps) {
2453 case RDMA_PS_TCP: 2783 case RDMA_PS_TCP:
2454 return &tcp_ps;
2455 case RDMA_PS_UDP: 2784 case RDMA_PS_UDP:
2456 return &udp_ps;
2457 case RDMA_PS_IPOIB: 2785 case RDMA_PS_IPOIB:
2458 return &ipoib_ps;
2459 case RDMA_PS_IB: 2786 case RDMA_PS_IB:
2460 return &ib_ps; 2787 return id_priv->id.ps;
2461 default: 2788 default:
2462 return NULL; 2789
2790 return 0;
2463 } 2791 }
2464} 2792}
2465 2793
2466static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) 2794static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
2467{ 2795{
2468 struct idr *ps = NULL; 2796 enum rdma_port_space ps = 0;
2469 struct sockaddr_ib *sib; 2797 struct sockaddr_ib *sib;
2470 u64 sid_ps, mask, sid; 2798 u64 sid_ps, mask, sid;
2471 2799
@@ -2475,15 +2803,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
2475 2803
2476 if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { 2804 if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
2477 sid_ps = RDMA_IB_IP_PS_IB; 2805 sid_ps = RDMA_IB_IP_PS_IB;
2478 ps = &ib_ps; 2806 ps = RDMA_PS_IB;
2479 } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && 2807 } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
2480 (sid == (RDMA_IB_IP_PS_TCP & mask))) { 2808 (sid == (RDMA_IB_IP_PS_TCP & mask))) {
2481 sid_ps = RDMA_IB_IP_PS_TCP; 2809 sid_ps = RDMA_IB_IP_PS_TCP;
2482 ps = &tcp_ps; 2810 ps = RDMA_PS_TCP;
2483 } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && 2811 } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
2484 (sid == (RDMA_IB_IP_PS_UDP & mask))) { 2812 (sid == (RDMA_IB_IP_PS_UDP & mask))) {
2485 sid_ps = RDMA_IB_IP_PS_UDP; 2813 sid_ps = RDMA_IB_IP_PS_UDP;
2486 ps = &udp_ps; 2814 ps = RDMA_PS_UDP;
2487 } 2815 }
2488 2816
2489 if (ps) { 2817 if (ps) {
@@ -2496,7 +2824,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
2496 2824
2497static int cma_get_port(struct rdma_id_private *id_priv) 2825static int cma_get_port(struct rdma_id_private *id_priv)
2498{ 2826{
2499 struct idr *ps; 2827 enum rdma_port_space ps;
2500 int ret; 2828 int ret;
2501 2829
2502 if (cma_family(id_priv) != AF_IB) 2830 if (cma_family(id_priv) != AF_IB)
@@ -3551,11 +3879,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
3551 wait_for_completion(&cma_dev->comp); 3879 wait_for_completion(&cma_dev->comp);
3552} 3880}
3553 3881
3554static void cma_remove_one(struct ib_device *device) 3882static void cma_remove_one(struct ib_device *device, void *client_data)
3555{ 3883{
3556 struct cma_device *cma_dev; 3884 struct cma_device *cma_dev = client_data;
3557 3885
3558 cma_dev = ib_get_client_data(device, &cma_client);
3559 if (!cma_dev) 3886 if (!cma_dev)
3560 return; 3887 return;
3561 3888
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936f5c1c..70bb36ebb03b 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,12 +43,58 @@ int ib_device_register_sysfs(struct ib_device *device,
43 u8, struct kobject *)); 43 u8, struct kobject *));
44void ib_device_unregister_sysfs(struct ib_device *device); 44void ib_device_unregister_sysfs(struct ib_device *device);
45 45
46int ib_sysfs_setup(void); 46void ib_cache_setup(void);
47void ib_sysfs_cleanup(void);
48
49int ib_cache_setup(void);
50void ib_cache_cleanup(void); 47void ib_cache_cleanup(void);
51 48
52int ib_resolve_eth_l2_attrs(struct ib_qp *qp, 49int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
53 struct ib_qp_attr *qp_attr, int *qp_attr_mask); 50 struct ib_qp_attr *qp_attr, int *qp_attr_mask);
51
52typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
53 struct net_device *idev, void *cookie);
54
55typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
56 struct net_device *idev, void *cookie);
57
58void ib_enum_roce_netdev(struct ib_device *ib_dev,
59 roce_netdev_filter filter,
60 void *filter_cookie,
61 roce_netdev_callback cb,
62 void *cookie);
63void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
64 void *filter_cookie,
65 roce_netdev_callback cb,
66 void *cookie);
67
68int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
69 const union ib_gid *gid,
70 u8 port, struct net_device *ndev,
71 u16 *index);
72
73enum ib_cache_gid_default_mode {
74 IB_CACHE_GID_DEFAULT_MODE_SET,
75 IB_CACHE_GID_DEFAULT_MODE_DELETE
76};
77
78void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
79 struct net_device *ndev,
80 enum ib_cache_gid_default_mode mode);
81
82int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
83 union ib_gid *gid, struct ib_gid_attr *attr);
84
85int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
86 union ib_gid *gid, struct ib_gid_attr *attr);
87
88int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
89 struct net_device *ndev);
90
91int roce_gid_mgmt_init(void);
92void roce_gid_mgmt_cleanup(void);
93
94int roce_rescan_device(struct ib_device *ib_dev);
95
96int ib_cache_setup_one(struct ib_device *device);
97void ib_cache_cleanup_one(struct ib_device *device);
98void ib_cache_release_one(struct ib_device *device);
99
54#endif /* _CORE_PRIV_H */ 100#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756ca4f9..17639117afc6 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,7 +38,10 @@
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/init.h> 39#include <linux/init.h>
40#include <linux/mutex.h> 40#include <linux/mutex.h>
41#include <linux/netdevice.h>
41#include <rdma/rdma_netlink.h> 42#include <rdma/rdma_netlink.h>
43#include <rdma/ib_addr.h>
44#include <rdma/ib_cache.h>
42 45
43#include "core_priv.h" 46#include "core_priv.h"
44 47
@@ -50,22 +53,34 @@ struct ib_client_data {
50 struct list_head list; 53 struct list_head list;
51 struct ib_client *client; 54 struct ib_client *client;
52 void * data; 55 void * data;
56 /* The device or client is going down. Do not call client or device
57 * callbacks other than remove(). */
58 bool going_down;
53}; 59};
54 60
55struct workqueue_struct *ib_wq; 61struct workqueue_struct *ib_wq;
56EXPORT_SYMBOL_GPL(ib_wq); 62EXPORT_SYMBOL_GPL(ib_wq);
57 63
64/* The device_list and client_list contain devices and clients after their
65 * registration has completed, and the devices and clients are removed
66 * during unregistration. */
58static LIST_HEAD(device_list); 67static LIST_HEAD(device_list);
59static LIST_HEAD(client_list); 68static LIST_HEAD(client_list);
60 69
61/* 70/*
62 * device_mutex protects access to both device_list and client_list. 71 * device_mutex and lists_rwsem protect access to both device_list and
63 * There's no real point to using multiple locks or something fancier 72 * client_list. device_mutex protects writer access by device and client
64 * like an rwsem: we always access both lists, and we're always 73 * registration / de-registration. lists_rwsem protects reader access to
65 * modifying one list or the other list. In any case this is not a 74 * these lists. Iterators of these lists must lock it for read, while updates
66 * hot path so there's no point in trying to optimize. 75 * to the lists must be done with a write lock. A special case is when the
76 * device_mutex is locked. In this case locking the lists for read access is
77 * not necessary as the device_mutex implies it.
78 *
79 * lists_rwsem also protects access to the client data list.
67 */ 80 */
68static DEFINE_MUTEX(device_mutex); 81static DEFINE_MUTEX(device_mutex);
82static DECLARE_RWSEM(lists_rwsem);
83
69 84
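Under the scheme spelled out in the comment above, list iterators take lists_rwsem for read, while registration and unregistration hold device_mutex and take the rwsem for write only around the actual list updates. A minimal illustrative reader, assuming it lived in device.c beside these definitions:

        /* Count registered devices without holding device_mutex. */
        static int count_registered_devices(void)
        {
                struct ib_device *device;
                int n = 0;

                down_read(&lists_rwsem);
                list_for_each_entry(device, &device_list, core_list)
                        n++;
                up_read(&lists_rwsem);

                return n;
        }
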
70static int ib_device_check_mandatory(struct ib_device *device) 85static int ib_device_check_mandatory(struct ib_device *device)
71{ 86{
@@ -152,6 +167,36 @@ static int alloc_name(char *name)
152 return 0; 167 return 0;
153} 168}
154 169
170static void ib_device_release(struct device *device)
171{
172 struct ib_device *dev = container_of(device, struct ib_device, dev);
173
174 ib_cache_release_one(dev);
175 kfree(dev->port_immutable);
176 kfree(dev);
177}
178
179static int ib_device_uevent(struct device *device,
180 struct kobj_uevent_env *env)
181{
182 struct ib_device *dev = container_of(device, struct ib_device, dev);
183
184 if (add_uevent_var(env, "NAME=%s", dev->name))
185 return -ENOMEM;
186
187 /*
188 * It would be nice to pass the node GUID with the event...
189 */
190
191 return 0;
192}
193
194static struct class ib_class = {
195 .name = "infiniband",
196 .dev_release = ib_device_release,
197 .dev_uevent = ib_device_uevent,
198};
199
155/** 200/**
156 * ib_alloc_device - allocate an IB device struct 201 * ib_alloc_device - allocate an IB device struct
157 * @size:size of structure to allocate 202 * @size:size of structure to allocate
@@ -164,9 +209,27 @@ static int alloc_name(char *name)
164 */ 209 */
165struct ib_device *ib_alloc_device(size_t size) 210struct ib_device *ib_alloc_device(size_t size)
166{ 211{
167 BUG_ON(size < sizeof (struct ib_device)); 212 struct ib_device *device;
213
214 if (WARN_ON(size < sizeof(struct ib_device)))
215 return NULL;
216
217 device = kzalloc(size, GFP_KERNEL);
218 if (!device)
219 return NULL;
220
221 device->dev.class = &ib_class;
222 device_initialize(&device->dev);
223
224 dev_set_drvdata(&device->dev, device);
225
226 INIT_LIST_HEAD(&device->event_handler_list);
227 spin_lock_init(&device->event_handler_lock);
228 spin_lock_init(&device->client_data_lock);
229 INIT_LIST_HEAD(&device->client_data_list);
230 INIT_LIST_HEAD(&device->port_list);
168 231
169 return kzalloc(size, GFP_KERNEL); 232 return device;
170} 233}
171EXPORT_SYMBOL(ib_alloc_device); 234EXPORT_SYMBOL(ib_alloc_device);
172 235
@@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
178 */ 241 */
179void ib_dealloc_device(struct ib_device *device) 242void ib_dealloc_device(struct ib_device *device)
180{ 243{
181 if (device->reg_state == IB_DEV_UNINITIALIZED) { 244 WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
182 kfree(device); 245 device->reg_state != IB_DEV_UNINITIALIZED);
183 return;
184 }
185
186 BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
187
188 kobject_put(&device->dev.kobj); 246 kobject_put(&device->dev.kobj);
189} 247}
190EXPORT_SYMBOL(ib_dealloc_device); 248EXPORT_SYMBOL(ib_dealloc_device);
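With allocation and initialization folded into ib_alloc_device() and the final free driven by the embedded struct device's release callback, a provider driver's lifecycle looks roughly like the sketch below; the my_hca structure and probe/remove functions are hypothetical, only the ib_* calls are real API:

        struct my_hca {
                struct ib_device ibdev;         /* first member, so the cast below is valid */
                /* ... driver-private state ... */
        };

        static int my_hca_probe(void)
        {
                struct my_hca *hca;
                int ret;

                hca = (struct my_hca *)ib_alloc_device(sizeof(*hca));
                if (!hca)
                        return -ENOMEM;

                /* fill in hca->ibdev verbs, port counts, immutable-data callback, ... */

                ret = ib_register_device(&hca->ibdev, NULL);
                if (ret)
                        ib_dealloc_device(&hca->ibdev);  /* still IB_DEV_UNINITIALIZED */
                return ret;
        }

        static void my_hca_remove(struct my_hca *hca)
        {
                ib_unregister_device(&hca->ibdev);
                ib_dealloc_device(&hca->ibdev);  /* last put; freed in ib_device_release() */
        }
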
@@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
203 261
204 context->client = client; 262 context->client = client;
205 context->data = NULL; 263 context->data = NULL;
264 context->going_down = false;
206 265
266 down_write(&lists_rwsem);
207 spin_lock_irqsave(&device->client_data_lock, flags); 267 spin_lock_irqsave(&device->client_data_lock, flags);
208 list_add(&context->list, &device->client_data_list); 268 list_add(&context->list, &device->client_data_list);
209 spin_unlock_irqrestore(&device->client_data_lock, flags); 269 spin_unlock_irqrestore(&device->client_data_lock, flags);
270 up_write(&lists_rwsem);
210 271
211 return 0; 272 return 0;
212} 273}
@@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
219 280
220static int read_port_immutable(struct ib_device *device) 281static int read_port_immutable(struct ib_device *device)
221{ 282{
222 int ret = -ENOMEM; 283 int ret;
223 u8 start_port = rdma_start_port(device); 284 u8 start_port = rdma_start_port(device);
224 u8 end_port = rdma_end_port(device); 285 u8 end_port = rdma_end_port(device);
225 u8 port; 286 u8 port;
@@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)
235 * (end_port + 1), 296 * (end_port + 1),
236 GFP_KERNEL); 297 GFP_KERNEL);
237 if (!device->port_immutable) 298 if (!device->port_immutable)
238 goto err; 299 return -ENOMEM;
239 300
240 for (port = start_port; port <= end_port; ++port) { 301 for (port = start_port; port <= end_port; ++port) {
241 ret = device->get_port_immutable(device, port, 302 ret = device->get_port_immutable(device, port,
242 &device->port_immutable[port]); 303 &device->port_immutable[port]);
243 if (ret) 304 if (ret)
244 goto err; 305 return ret;
245 306
246 if (verify_immutable(device, port)) { 307 if (verify_immutable(device, port))
247 ret = -EINVAL; 308 return -EINVAL;
248 goto err;
249 }
250 } 309 }
251 310 return 0;
252 ret = 0;
253 goto out;
254err:
255 kfree(device->port_immutable);
256out:
257 return ret;
258} 311}
259 312
260/** 313/**
@@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,
271 u8, struct kobject *)) 324 u8, struct kobject *))
272{ 325{
273 int ret; 326 int ret;
327 struct ib_client *client;
274 328
275 mutex_lock(&device_mutex); 329 mutex_lock(&device_mutex);
276 330
@@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,
285 goto out; 339 goto out;
286 } 340 }
287 341
288 INIT_LIST_HEAD(&device->event_handler_list);
289 INIT_LIST_HEAD(&device->client_data_list);
290 spin_lock_init(&device->event_handler_lock);
291 spin_lock_init(&device->client_data_lock);
292
293 ret = read_port_immutable(device); 342 ret = read_port_immutable(device);
294 if (ret) { 343 if (ret) {
295 printk(KERN_WARNING "Couldn't create per port immutable data %s\n", 344 printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,
297 goto out; 346 goto out;
298 } 347 }
299 348
349 ret = ib_cache_setup_one(device);
350 if (ret) {
351 printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
352 goto out;
353 }
354
300 ret = ib_device_register_sysfs(device, port_callback); 355 ret = ib_device_register_sysfs(device, port_callback);
301 if (ret) { 356 if (ret) {
302 printk(KERN_WARNING "Couldn't register device %s with driver model\n", 357 printk(KERN_WARNING "Couldn't register device %s with driver model\n",
303 device->name); 358 device->name);
304 kfree(device->port_immutable); 359 ib_cache_cleanup_one(device);
305 goto out; 360 goto out;
306 } 361 }
307 362
308 list_add_tail(&device->core_list, &device_list);
309
310 device->reg_state = IB_DEV_REGISTERED; 363 device->reg_state = IB_DEV_REGISTERED;
311 364
312 { 365 list_for_each_entry(client, &client_list, list)
313 struct ib_client *client; 366 if (client->add && !add_client_context(device, client))
314 367 client->add(device);
315 list_for_each_entry(client, &client_list, list)
316 if (client->add && !add_client_context(device, client))
317 client->add(device);
318 }
319 368
320 out: 369 down_write(&lists_rwsem);
370 list_add_tail(&device->core_list, &device_list);
371 up_write(&lists_rwsem);
372out:
321 mutex_unlock(&device_mutex); 373 mutex_unlock(&device_mutex);
322 return ret; 374 return ret;
323} 375}
@@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
331 */ 383 */
332void ib_unregister_device(struct ib_device *device) 384void ib_unregister_device(struct ib_device *device)
333{ 385{
334 struct ib_client *client;
335 struct ib_client_data *context, *tmp; 386 struct ib_client_data *context, *tmp;
336 unsigned long flags; 387 unsigned long flags;
337 388
338 mutex_lock(&device_mutex); 389 mutex_lock(&device_mutex);
339 390
340 list_for_each_entry_reverse(client, &client_list, list) 391 down_write(&lists_rwsem);
341 if (client->remove)
342 client->remove(device);
343
344 list_del(&device->core_list); 392 list_del(&device->core_list);
393 spin_lock_irqsave(&device->client_data_lock, flags);
394 list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
395 context->going_down = true;
396 spin_unlock_irqrestore(&device->client_data_lock, flags);
397 downgrade_write(&lists_rwsem);
398
399 list_for_each_entry_safe(context, tmp, &device->client_data_list,
400 list) {
401 if (context->client->remove)
402 context->client->remove(device, context->data);
403 }
404 up_read(&lists_rwsem);
345 405
346 mutex_unlock(&device_mutex); 406 mutex_unlock(&device_mutex);
347 407
348 ib_device_unregister_sysfs(device); 408 ib_device_unregister_sysfs(device);
409 ib_cache_cleanup_one(device);
349 410
411 down_write(&lists_rwsem);
350 spin_lock_irqsave(&device->client_data_lock, flags); 412 spin_lock_irqsave(&device->client_data_lock, flags);
351 list_for_each_entry_safe(context, tmp, &device->client_data_list, list) 413 list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
352 kfree(context); 414 kfree(context);
353 spin_unlock_irqrestore(&device->client_data_lock, flags); 415 spin_unlock_irqrestore(&device->client_data_lock, flags);
416 up_write(&lists_rwsem);
354 417
355 device->reg_state = IB_DEV_UNREGISTERED; 418 device->reg_state = IB_DEV_UNREGISTERED;
356} 419}
@@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)
375 438
376 mutex_lock(&device_mutex); 439 mutex_lock(&device_mutex);
377 440
378 list_add_tail(&client->list, &client_list);
379 list_for_each_entry(device, &device_list, core_list) 441 list_for_each_entry(device, &device_list, core_list)
380 if (client->add && !add_client_context(device, client)) 442 if (client->add && !add_client_context(device, client))
381 client->add(device); 443 client->add(device);
382 444
445 down_write(&lists_rwsem);
446 list_add_tail(&client->list, &client_list);
447 up_write(&lists_rwsem);
448
383 mutex_unlock(&device_mutex); 449 mutex_unlock(&device_mutex);
384 450
385 return 0; 451 return 0;
@@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
402 468
403 mutex_lock(&device_mutex); 469 mutex_lock(&device_mutex);
404 470
471 down_write(&lists_rwsem);
472 list_del(&client->list);
473 up_write(&lists_rwsem);
474
405 list_for_each_entry(device, &device_list, core_list) { 475 list_for_each_entry(device, &device_list, core_list) {
406 if (client->remove) 476 struct ib_client_data *found_context = NULL;
407 client->remove(device);
408 477
478 down_write(&lists_rwsem);
409 spin_lock_irqsave(&device->client_data_lock, flags); 479 spin_lock_irqsave(&device->client_data_lock, flags);
410 list_for_each_entry_safe(context, tmp, &device->client_data_list, list) 480 list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
411 if (context->client == client) { 481 if (context->client == client) {
412 list_del(&context->list); 482 context->going_down = true;
413 kfree(context); 483 found_context = context;
484 break;
414 } 485 }
415 spin_unlock_irqrestore(&device->client_data_lock, flags); 486 spin_unlock_irqrestore(&device->client_data_lock, flags);
487 up_write(&lists_rwsem);
488
489 if (client->remove)
490 client->remove(device, found_context ?
491 found_context->data : NULL);
492
493 if (!found_context) {
494 pr_warn("No client context found for %s/%s\n",
495 device->name, client->name);
496 continue;
497 }
498
499 down_write(&lists_rwsem);
500 spin_lock_irqsave(&device->client_data_lock, flags);
501 list_del(&found_context->list);
502 kfree(found_context);
503 spin_unlock_irqrestore(&device->client_data_lock, flags);
504 up_write(&lists_rwsem);
416 } 505 }
417 list_del(&client->list);
418 506
419 mutex_unlock(&device_mutex); 507 mutex_unlock(&device_mutex);
420} 508}
@@ -590,11 +678,80 @@ EXPORT_SYMBOL(ib_query_port);
590int ib_query_gid(struct ib_device *device, 678int ib_query_gid(struct ib_device *device,
591 u8 port_num, int index, union ib_gid *gid) 679 u8 port_num, int index, union ib_gid *gid)
592{ 680{
681 if (rdma_cap_roce_gid_table(device, port_num))
682 return ib_get_cached_gid(device, port_num, index, gid);
683
593 return device->query_gid(device, port_num, index, gid); 684 return device->query_gid(device, port_num, index, gid);
594} 685}
595EXPORT_SYMBOL(ib_query_gid); 686EXPORT_SYMBOL(ib_query_gid);
596 687
597/** 688/**
689 * ib_enum_roce_netdev - enumerate all RoCE ports
690 * @ib_dev: IB device we want to query
691 * @filter: Should we call the callback?
692 * @filter_cookie: Cookie passed to filter
693 * @cb: Callback to call for each found RoCE port
694 * @cookie: Cookie passed back to the callback
695 *
696 * Enumerates all of the physical RoCE ports of ib_dev
697 * that are associated with a netdevice and calls callback() on
698 * each port for which the filter() function returns non-zero.
699 */
700void ib_enum_roce_netdev(struct ib_device *ib_dev,
701 roce_netdev_filter filter,
702 void *filter_cookie,
703 roce_netdev_callback cb,
704 void *cookie)
705{
706 u8 port;
707
708 for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
709 port++)
710 if (rdma_protocol_roce(ib_dev, port)) {
711 struct net_device *idev = NULL;
712
713 if (ib_dev->get_netdev)
714 idev = ib_dev->get_netdev(ib_dev, port);
715
716 if (idev &&
717 idev->reg_state >= NETREG_UNREGISTERED) {
718 dev_put(idev);
719 idev = NULL;
720 }
721
722 if (filter(ib_dev, port, idev, filter_cookie))
723 cb(ib_dev, port, idev, cookie);
724
725 if (idev)
726 dev_put(idev);
727 }
728}
729
730/**
731 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
732 * @filter: Should we call the callback?
733 * @filter_cookie: Cookie passed to filter
734 * @cb: Callback to call for each found RoCE port
735 * @cookie: Cookie passed back to the callback
736 *
737 * Enumerates the physical ports of all RoCE devices that are
738 * associated with a netdevice and calls callback() on each port
739 * for which the filter() function returns non-zero.
740 */
741void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
742 void *filter_cookie,
743 roce_netdev_callback cb,
744 void *cookie)
745{
746 struct ib_device *dev;
747
748 down_read(&lists_rwsem);
749 list_for_each_entry(dev, &device_list, core_list)
750 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
751 up_read(&lists_rwsem);
752}
753
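Not part of the patch: a hedged sketch of how a consumer could drive these helpers. The filter/callback signatures follow the prototypes above; match_ndev_filter, log_port_cb and log_matching_ports are invented names.

/* Only visit ports whose underlying netdev matches the cookie. */
static int match_ndev_filter(struct ib_device *ib_dev, u8 port,
			     struct net_device *rdma_ndev, void *cookie)
{
	return rdma_ndev == (struct net_device *)cookie;
}

/* Log every RoCE port that passed the filter. */
static void log_port_cb(struct ib_device *ib_dev, u8 port,
			struct net_device *rdma_ndev, void *cookie)
{
	pr_info("%s: RoCE port %d backed by %s\n", ib_dev->name, port,
		rdma_ndev ? rdma_ndev->name : "(none)");
}

static void log_matching_ports(struct net_device *ndev)
{
	/* Walk matching RoCE ports across every registered device. */
	ib_enum_all_roce_netdevs(match_ndev_filter, ndev, log_port_cb, NULL);
}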
754/**
598 * ib_query_pkey - Get P_Key table entry 755 * ib_query_pkey - Get P_Key table entry
599 * @device:Device to query 756 * @device:Device to query
600 * @port_num:Port number to query 757 * @port_num:Port number to query
@@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
673 int ret, port, i; 830 int ret, port, i;
674 831
675 for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { 832 for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
833 if (rdma_cap_roce_gid_table(device, port)) {
834 if (!ib_cache_gid_find_by_port(device, gid, port,
835 NULL, index)) {
836 *port_num = port;
837 return 0;
838 }
839 }
840
676 for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { 841 for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
677 ret = ib_query_gid(device, port, i, &tmp_gid); 842 ret = ib_query_gid(device, port, i, &tmp_gid);
678 if (ret) 843 if (ret)
@@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,
729} 894}
730EXPORT_SYMBOL(ib_find_pkey); 895EXPORT_SYMBOL(ib_find_pkey);
731 896
897/**
898 * ib_get_net_dev_by_params() - Return the appropriate net_dev
899 * for a received CM request
900 * @dev: An RDMA device on which the request has been received.
901 * @port: Port number on the RDMA device.
902 * @pkey: The P_Key the request came on.
903 * @gid: A GID that the net_dev uses to communicate.
904 * @addr: Contains the IP address that the request specified as its
905 * destination.
906 */
907struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
908 u8 port,
909 u16 pkey,
910 const union ib_gid *gid,
911 const struct sockaddr *addr)
912{
913 struct net_device *net_dev = NULL;
914 struct ib_client_data *context;
915
916 if (!rdma_protocol_ib(dev, port))
917 return NULL;
918
919 down_read(&lists_rwsem);
920
921 list_for_each_entry(context, &dev->client_data_list, list) {
922 struct ib_client *client = context->client;
923
924 if (context->going_down)
925 continue;
926
927 if (client->get_net_dev_by_params) {
928 net_dev = client->get_net_dev_by_params(dev, port, pkey,
929 gid, addr,
930 context->data);
931 if (net_dev)
932 break;
933 }
934 }
935
936 up_read(&lists_rwsem);
937
938 return net_dev;
939}
940EXPORT_SYMBOL(ib_get_net_dev_by_params);
941
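Not part of the patch: a sketch of the client side that this new helper queries. The hook signature mirrors the call site above; struct my_ctx, my_lookup_ndev(), my_add_one() and my_remove_one() are hypothetical.

static struct net_device *my_get_net_dev(struct ib_device *dev, u8 port,
					 u16 pkey, const union ib_gid *gid,
					 const struct sockaddr *addr,
					 void *client_data)
{
	struct my_ctx *ctx = client_data;	/* context set by my_add_one() */

	/* Return a held net_device that serves this gid/addr, or NULL. */
	return my_lookup_ndev(ctx, port, pkey, gid, addr);
}

static struct ib_client my_client = {
	.name			= "my_client",
	.add			= my_add_one,
	.remove			= my_remove_one,
	.get_net_dev_by_params	= my_get_net_dev,
};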
732static int __init ib_core_init(void) 942static int __init ib_core_init(void)
733{ 943{
734 int ret; 944 int ret;
@@ -737,7 +947,7 @@ static int __init ib_core_init(void)
737 if (!ib_wq) 947 if (!ib_wq)
738 return -ENOMEM; 948 return -ENOMEM;
739 949
740 ret = ib_sysfs_setup(); 950 ret = class_register(&ib_class);
741 if (ret) { 951 if (ret) {
742 printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); 952 printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
743 goto err; 953 goto err;
@@ -749,19 +959,12 @@ static int __init ib_core_init(void)
749 goto err_sysfs; 959 goto err_sysfs;
750 } 960 }
751 961
752 ret = ib_cache_setup(); 962 ib_cache_setup();
753 if (ret) {
754 printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
755 goto err_nl;
756 }
757 963
758 return 0; 964 return 0;
759 965
760err_nl:
761 ibnl_cleanup();
762
763err_sysfs: 966err_sysfs:
764 ib_sysfs_cleanup(); 967 class_unregister(&ib_class);
765 968
766err: 969err:
767 destroy_workqueue(ib_wq); 970 destroy_workqueue(ib_wq);
@@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)
772{ 975{
773 ib_cache_cleanup(); 976 ib_cache_cleanup();
774 ibnl_cleanup(); 977 ibnl_cleanup();
775 ib_sysfs_cleanup(); 978 class_unregister(&ib_class);
776 /* Make sure that any pending umem accounting work is done. */ 979 /* Make sure that any pending umem accounting work is done. */
777 destroy_workqueue(ib_wq); 980 destroy_workqueue(ib_wq);
778} 981}
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 786fc51bf04b..4b5c72311deb 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
338 goto error1; 338 goto error1;
339 } 339 }
340 340
341 mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
342 IB_ACCESS_LOCAL_WRITE);
343 if (IS_ERR(mad_agent_priv->agent.mr)) {
344 ret = ERR_PTR(-ENOMEM);
345 goto error2;
346 }
347
348 if (mad_reg_req) { 341 if (mad_reg_req) {
349 reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); 342 reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
350 if (!reg_req) { 343 if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
429 spin_unlock_irqrestore(&port_priv->reg_lock, flags); 422 spin_unlock_irqrestore(&port_priv->reg_lock, flags);
430 kfree(reg_req); 423 kfree(reg_req);
431error3: 424error3:
432 ib_dereg_mr(mad_agent_priv->agent.mr);
433error2:
434 kfree(mad_agent_priv); 425 kfree(mad_agent_priv);
435error1: 426error1:
436 return ret; 427 return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
590 wait_for_completion(&mad_agent_priv->comp); 581 wait_for_completion(&mad_agent_priv->comp);
591 582
592 kfree(mad_agent_priv->reg_req); 583 kfree(mad_agent_priv->reg_req);
593 ib_dereg_mr(mad_agent_priv->agent.mr);
594 kfree(mad_agent_priv); 584 kfree(mad_agent_priv);
595} 585}
596 586
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
1038 1028
1039 mad_send_wr->mad_agent_priv = mad_agent_priv; 1029 mad_send_wr->mad_agent_priv = mad_agent_priv;
1040 mad_send_wr->sg_list[0].length = hdr_len; 1030 mad_send_wr->sg_list[0].length = hdr_len;
1041 mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; 1031 mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
1042 1032
1043 /* OPA MADs don't have to be the full 2048 bytes */ 1033 /* OPA MADs don't have to be the full 2048 bytes */
1044 if (opa && base_version == OPA_MGMT_BASE_VERSION && 1034 if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
1047 else 1037 else
1048 mad_send_wr->sg_list[1].length = mad_size - hdr_len; 1038 mad_send_wr->sg_list[1].length = mad_size - hdr_len;
1049 1039
1050 mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; 1040 mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
1051 1041
1052 mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; 1042 mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
1053 mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; 1043 mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
2885 struct ib_mad_queue *recv_queue = &qp_info->recv_queue; 2875 struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
2886 2876
2887 /* Initialize common scatter list fields */ 2877 /* Initialize common scatter list fields */
2888 sg_list.lkey = (*qp_info->port_priv->mr).lkey; 2878 sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
2889 2879
2890 /* Initialize common receive WR fields */ 2880 /* Initialize common receive WR fields */
2891 recv_wr.next = NULL; 2881 recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
3201 goto error4; 3191 goto error4;
3202 } 3192 }
3203 3193
3204 port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
3205 if (IS_ERR(port_priv->mr)) {
3206 dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
3207 ret = PTR_ERR(port_priv->mr);
3208 goto error5;
3209 }
3210
3211 if (has_smi) { 3194 if (has_smi) {
3212 ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); 3195 ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
3213 if (ret) 3196 if (ret)
@@ -3248,8 +3231,6 @@ error8:
3248error7: 3231error7:
3249 destroy_mad_qp(&port_priv->qp_info[0]); 3232 destroy_mad_qp(&port_priv->qp_info[0]);
3250error6: 3233error6:
3251 ib_dereg_mr(port_priv->mr);
3252error5:
3253 ib_dealloc_pd(port_priv->pd); 3234 ib_dealloc_pd(port_priv->pd);
3254error4: 3235error4:
3255 ib_destroy_cq(port_priv->cq); 3236 ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
3284 destroy_workqueue(port_priv->wq); 3265 destroy_workqueue(port_priv->wq);
3285 destroy_mad_qp(&port_priv->qp_info[1]); 3266 destroy_mad_qp(&port_priv->qp_info[1]);
3286 destroy_mad_qp(&port_priv->qp_info[0]); 3267 destroy_mad_qp(&port_priv->qp_info[0]);
3287 ib_dereg_mr(port_priv->mr);
3288 ib_dealloc_pd(port_priv->pd); 3268 ib_dealloc_pd(port_priv->pd);
3289 ib_destroy_cq(port_priv->cq); 3269 ib_destroy_cq(port_priv->cq);
3290 cleanup_recv_queue(&port_priv->qp_info[1]); 3270 cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3335,7 +3315,7 @@ error:
3335 } 3315 }
3336} 3316}
3337 3317
3338static void ib_mad_remove_device(struct ib_device *device) 3318static void ib_mad_remove_device(struct ib_device *device, void *client_data)
3339{ 3319{
3340 int i; 3320 int i;
3341 3321
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 5be89f98928f..4a4f7aad0978 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
199 int port_num; 199 int port_num;
200 struct ib_cq *cq; 200 struct ib_cq *cq;
201 struct ib_pd *pd; 201 struct ib_pd *pd;
202 struct ib_mr *mr;
203 202
204 spinlock_t reg_lock; 203 spinlock_t reg_lock;
205 struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; 204 struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 2cb865c7ce7a..d38d8b2b2979 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
43#include "sa.h" 43#include "sa.h"
44 44
45static void mcast_add_one(struct ib_device *device); 45static void mcast_add_one(struct ib_device *device);
46static void mcast_remove_one(struct ib_device *device); 46static void mcast_remove_one(struct ib_device *device, void *client_data);
47 47
48static struct ib_client mcast_client = { 48static struct ib_client mcast_client = {
49 .name = "ib_multicast", 49 .name = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
840 ib_register_event_handler(&dev->event_handler); 840 ib_register_event_handler(&dev->event_handler);
841} 841}
842 842
843static void mcast_remove_one(struct ib_device *device) 843static void mcast_remove_one(struct ib_device *device, void *client_data)
844{ 844{
845 struct mcast_device *dev; 845 struct mcast_device *dev = client_data;
846 struct mcast_port *port; 846 struct mcast_port *port;
847 int i; 847 int i;
848 848
849 dev = ib_get_client_data(device, &mcast_client);
850 if (!dev) 849 if (!dev)
851 return; 850 return;
852 851
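The mad.c and multicast.c hunks show the new client API in action: remove() now receives the client data directly (possibly NULL), so looking it up again with ib_get_client_data() on teardown is unnecessary. A before/after sketch with hypothetical my_remove_one(), my_teardown() and struct my_dev names:

/* Before: look the per-device context up again. */
static void my_remove_one(struct ib_device *device)
{
	struct my_dev *md = ib_get_client_data(device, &my_client);

	if (md)
		my_teardown(md);
}

/* After: the core hands the context in, which also works while the
 * device is being hot-removed and the client_data list is going down. */
static void my_remove_one(struct ib_device *device, void *client_data)
{
	struct my_dev *md = client_data;

	if (md)
		my_teardown(md);
}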
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 23dd5a5c7597..d47df9356779 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
49static struct sock *nls; 49static struct sock *nls;
50static LIST_HEAD(client_list); 50static LIST_HEAD(client_list);
51 51
52int ibnl_chk_listeners(unsigned int group)
53{
54 if (netlink_has_listeners(nls, group) == 0)
55 return -1;
56 return 0;
57}
58EXPORT_SYMBOL(ibnl_chk_listeners);
59
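ibnl_chk_listeners() returns 0 when at least one socket is subscribed to the given netlink multicast group and -1 otherwise. The sa_query.c hunk later in this patch uses it to try netlink path-record resolution only when a user-space cache daemon is actually listening, roughly as follows (sketch of that call site):

	if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
			/* A daemon is listening: try netlink resolution first. */
			if (!ib_nl_make_request(query))
				return id;
		}
		/* No listener, or the request could not be queued. */
		ib_sa_disable_local_svc(query);
	}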
52int ibnl_add_client(int index, int nops, 60int ibnl_add_client(int index, int nops,
53 const struct ibnl_client_cbs cb_table[]) 61 const struct ibnl_client_cbs cb_table[])
54{ 62{
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
151 !client->cb_table[op].dump) 159 !client->cb_table[op].dump)
152 return -EINVAL; 160 return -EINVAL;
153 161
162 /*
163 * For responses and local service set_timeout requests,
164 * there is no need to use netlink_dump_start.
165 */
166 if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
167 (index == RDMA_NL_LS &&
168 op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
169 struct netlink_callback cb = {
170 .skb = skb,
171 .nlh = nlh,
172 .dump = client->cb_table[op].dump,
173 .module = client->cb_table[op].module,
174 };
175
176 return cb.dump(skb, &cb);
177 }
178
154 { 179 {
155 struct netlink_dump_control c = { 180 struct netlink_dump_control c = {
156 .dump = client->cb_table[op].dump, 181 .dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
165 return -EINVAL; 190 return -EINVAL;
166} 191}
167 192
193static void ibnl_rcv_reply_skb(struct sk_buff *skb)
194{
195 struct nlmsghdr *nlh;
196 int msglen;
197
198 /*
199 * Process responses until there are no more messages or we reach
200 * the first request. Generally speaking, it is not recommended to
201 * mix responses with requests.
202 */
203 while (skb->len >= nlmsg_total_size(0)) {
204 nlh = nlmsg_hdr(skb);
205
206 if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
207 return;
208
209 /* Handle response only */
210 if (nlh->nlmsg_flags & NLM_F_REQUEST)
211 return;
212
213 ibnl_rcv_msg(skb, nlh);
214
215 msglen = NLMSG_ALIGN(nlh->nlmsg_len);
216 if (msglen > skb->len)
217 msglen = skb->len;
218 skb_pull(skb, msglen);
219 }
220}
221
168static void ibnl_rcv(struct sk_buff *skb) 222static void ibnl_rcv(struct sk_buff *skb)
169{ 223{
170 mutex_lock(&ibnl_mutex); 224 mutex_lock(&ibnl_mutex);
225 ibnl_rcv_reply_skb(skb);
171 netlink_rcv_skb(skb, &ibnl_rcv_msg); 226 netlink_rcv_skb(skb, &ibnl_rcv_msg);
172 mutex_unlock(&ibnl_mutex); 227 mutex_unlock(&ibnl_mutex);
173} 228}
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000000..6b24cba1e474
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,728 @@
1/*
2 * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include "core_priv.h"
34
35#include <linux/in.h>
36#include <linux/in6.h>
37
38/* For in6_dev_get/in6_dev_put */
39#include <net/addrconf.h>
40#include <net/bonding.h>
41
42#include <rdma/ib_cache.h>
43#include <rdma/ib_addr.h>
44
45enum gid_op_type {
46 GID_DEL = 0,
47 GID_ADD
48};
49
50struct update_gid_event_work {
51 struct work_struct work;
52 union ib_gid gid;
53 struct ib_gid_attr gid_attr;
54 enum gid_op_type gid_op;
55};
56
57#define ROCE_NETDEV_CALLBACK_SZ 3
58struct netdev_event_work_cmd {
59 roce_netdev_callback cb;
60 roce_netdev_filter filter;
61 struct net_device *ndev;
62 struct net_device *filter_ndev;
63};
64
65struct netdev_event_work {
66 struct work_struct work;
67 struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
68};
69
70static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
71 u8 port, union ib_gid *gid,
72 struct ib_gid_attr *gid_attr)
73{
74 switch (gid_op) {
75 case GID_ADD:
76 ib_cache_gid_add(ib_dev, port, gid, gid_attr);
77 break;
78 case GID_DEL:
79 ib_cache_gid_del(ib_dev, port, gid, gid_attr);
80 break;
81 }
82}
83
84enum bonding_slave_state {
85 BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
86 BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
87 /* No primary slave or the device isn't a slave in bonding */
88 BONDING_SLAVE_STATE_NA = 1UL << 2,
89};
90
91static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
92 struct net_device *upper)
93{
94 if (upper && netif_is_bond_master(upper)) {
95 struct net_device *pdev =
96 bond_option_active_slave_get_rcu(netdev_priv(upper));
97
98 if (pdev)
99 return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
100 BONDING_SLAVE_STATE_INACTIVE;
101 }
102
103 return BONDING_SLAVE_STATE_NA;
104}
105
106static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
107{
108 struct net_device *_upper = NULL;
109 struct list_head *iter;
110
111 netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
112 if (_upper == upper)
113 break;
114
115 return _upper == upper;
116}
117
118#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
119 BONDING_SLAVE_STATE_NA)
120static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
121 struct net_device *rdma_ndev, void *cookie)
122{
123 struct net_device *event_ndev = (struct net_device *)cookie;
124 struct net_device *real_dev;
125 int res;
126
127 if (!rdma_ndev)
128 return 0;
129
130 rcu_read_lock();
131 real_dev = rdma_vlan_dev_real_dev(event_ndev);
132 if (!real_dev)
133 real_dev = event_ndev;
134
135 res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
136 (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
137 REQUIRED_BOND_STATES)) ||
138 real_dev == rdma_ndev);
139
140 rcu_read_unlock();
141 return res;
142}
143
144static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
145 struct net_device *rdma_ndev, void *cookie)
146{
147 struct net_device *master_dev;
148 int res;
149
150 if (!rdma_ndev)
151 return 0;
152
153 rcu_read_lock();
154 master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
155 res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
156 BONDING_SLAVE_STATE_INACTIVE;
157 rcu_read_unlock();
158
159 return res;
160}
161
162static int pass_all_filter(struct ib_device *ib_dev, u8 port,
163 struct net_device *rdma_ndev, void *cookie)
164{
165 return 1;
166}
167
168static int upper_device_filter(struct ib_device *ib_dev, u8 port,
169 struct net_device *rdma_ndev, void *cookie)
170{
171 struct net_device *event_ndev = (struct net_device *)cookie;
172 int res;
173
174 if (!rdma_ndev)
175 return 0;
176
177 if (rdma_ndev == event_ndev)
178 return 1;
179
180 rcu_read_lock();
181 res = is_upper_dev_rcu(rdma_ndev, event_ndev);
182 rcu_read_unlock();
183
184 return res;
185}
186
187static void update_gid_ip(enum gid_op_type gid_op,
188 struct ib_device *ib_dev,
189 u8 port, struct net_device *ndev,
190 struct sockaddr *addr)
191{
192 union ib_gid gid;
193 struct ib_gid_attr gid_attr;
194
195 rdma_ip2gid(addr, &gid);
196 memset(&gid_attr, 0, sizeof(gid_attr));
197 gid_attr.ndev = ndev;
198
199 update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
200}
201
202static void enum_netdev_default_gids(struct ib_device *ib_dev,
203 u8 port, struct net_device *event_ndev,
204 struct net_device *rdma_ndev)
205{
206 rcu_read_lock();
207 if (!rdma_ndev ||
208 ((rdma_ndev != event_ndev &&
209 !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
210 is_eth_active_slave_of_bonding_rcu(rdma_ndev,
211 netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
212 BONDING_SLAVE_STATE_INACTIVE)) {
213 rcu_read_unlock();
214 return;
215 }
216 rcu_read_unlock();
217
218 ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
219 IB_CACHE_GID_DEFAULT_MODE_SET);
220}
221
222static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
223 u8 port,
224 struct net_device *event_ndev,
225 struct net_device *rdma_ndev)
226{
227 struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
228
229 if (!rdma_ndev)
230 return;
231
232 if (!real_dev)
233 real_dev = event_ndev;
234
235 rcu_read_lock();
236
237 if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
238 is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
239 BONDING_SLAVE_STATE_INACTIVE) {
240 rcu_read_unlock();
241
242 ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
243 IB_CACHE_GID_DEFAULT_MODE_DELETE);
244 } else {
245 rcu_read_unlock();
246 }
247}
248
249static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
250 u8 port, struct net_device *ndev)
251{
252 struct in_device *in_dev;
253
254 if (ndev->reg_state >= NETREG_UNREGISTERING)
255 return;
256
257 in_dev = in_dev_get(ndev);
258 if (!in_dev)
259 return;
260
261 for_ifa(in_dev) {
262 struct sockaddr_in ip;
263
264 ip.sin_family = AF_INET;
265 ip.sin_addr.s_addr = ifa->ifa_address;
266 update_gid_ip(GID_ADD, ib_dev, port, ndev,
267 (struct sockaddr *)&ip);
268 }
269 endfor_ifa(in_dev);
270
271 in_dev_put(in_dev);
272}
273
274static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
275 u8 port, struct net_device *ndev)
276{
277 struct inet6_ifaddr *ifp;
278 struct inet6_dev *in6_dev;
279 struct sin6_list {
280 struct list_head list;
281 struct sockaddr_in6 sin6;
282 };
283 struct sin6_list *sin6_iter;
284 struct sin6_list *sin6_temp;
285 struct ib_gid_attr gid_attr = {.ndev = ndev};
286 LIST_HEAD(sin6_list);
287
288 if (ndev->reg_state >= NETREG_UNREGISTERING)
289 return;
290
291 in6_dev = in6_dev_get(ndev);
292 if (!in6_dev)
293 return;
294
295 read_lock_bh(&in6_dev->lock);
296 list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
297 struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
298
299 if (!entry) {
300 pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
301 continue;
302 }
303
304 entry->sin6.sin6_family = AF_INET6;
305 entry->sin6.sin6_addr = ifp->addr;
306 list_add_tail(&entry->list, &sin6_list);
307 }
308 read_unlock_bh(&in6_dev->lock);
309
310 in6_dev_put(in6_dev);
311
312 list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
313 union ib_gid gid;
314
315 rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
316 update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
317 list_del(&sin6_iter->list);
318 kfree(sin6_iter);
319 }
320}
321
322static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
323 struct net_device *ndev)
324{
325 enum_netdev_ipv4_ips(ib_dev, port, ndev);
326 if (IS_ENABLED(CONFIG_IPV6))
327 enum_netdev_ipv6_ips(ib_dev, port, ndev);
328}
329
330static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
331 struct net_device *rdma_ndev, void *cookie)
332{
333 struct net_device *event_ndev = (struct net_device *)cookie;
334
335 enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
336 _add_netdev_ips(ib_dev, port, event_ndev);
337}
338
339static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
340 struct net_device *rdma_ndev, void *cookie)
341{
342 struct net_device *event_ndev = (struct net_device *)cookie;
343
344 ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
345}
346
347static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
348 u8 port,
349 struct net_device *rdma_ndev,
350 void *cookie)
351{
352 struct net *net;
353 struct net_device *ndev;
354
355 /* Lock the rtnl to make sure the netdevs do not move under
356 * our feet
357 */
358 rtnl_lock();
359 for_each_net(net)
360 for_each_netdev(net, ndev)
361 if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
362 add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
363 rtnl_unlock();
364}
365
366/* This function will rescan all of the network devices in the system
367 * and add their gids, as needed, to the relevant RoCE devices. */
368int roce_rescan_device(struct ib_device *ib_dev)
369{
370 ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
371 enum_all_gids_of_dev_cb, NULL);
372
373 return 0;
374}
375
376static void callback_for_addr_gid_device_scan(struct ib_device *device,
377 u8 port,
378 struct net_device *rdma_ndev,
379 void *cookie)
380{
381 struct update_gid_event_work *parsed = cookie;
382
383 return update_gid(parsed->gid_op, device,
384 port, &parsed->gid,
385 &parsed->gid_attr);
386}
387
388static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
389 void *cookie,
390 void (*handle_netdev)(struct ib_device *ib_dev,
391 u8 port,
392 struct net_device *ndev))
393{
394 struct net_device *ndev = (struct net_device *)cookie;
395 struct upper_list {
396 struct list_head list;
397 struct net_device *upper;
398 };
399 struct net_device *upper;
400 struct list_head *iter;
401 struct upper_list *upper_iter;
402 struct upper_list *upper_temp;
403 LIST_HEAD(upper_list);
404
405 rcu_read_lock();
406 netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
407 struct upper_list *entry = kmalloc(sizeof(*entry),
408 GFP_ATOMIC);
409
410 if (!entry) {
411 pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
412 continue;
413 }
414
415 list_add_tail(&entry->list, &upper_list);
416 dev_hold(upper);
417 entry->upper = upper;
418 }
419 rcu_read_unlock();
420
421 handle_netdev(ib_dev, port, ndev);
422 list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
423 list) {
424 handle_netdev(ib_dev, port, upper_iter->upper);
425 dev_put(upper_iter->upper);
426 list_del(&upper_iter->list);
427 kfree(upper_iter);
428 }
429}
430
431static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
432 struct net_device *event_ndev)
433{
434 ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
435}
436
437static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
438 struct net_device *rdma_ndev, void *cookie)
439{
440 handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
441}
442
443static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
444 struct net_device *rdma_ndev, void *cookie)
445{
446 handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
447}
448
449static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
450 struct net_device *rdma_ndev,
451 void *cookie)
452{
453 struct net_device *master_ndev;
454
455 rcu_read_lock();
456 master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
457 if (master_ndev)
458 dev_hold(master_ndev);
459 rcu_read_unlock();
460
461 if (master_ndev) {
462 bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
463 rdma_ndev);
464 dev_put(master_ndev);
465 }
466}
467
468static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
469 struct net_device *rdma_ndev, void *cookie)
470{
471 struct net_device *event_ndev = (struct net_device *)cookie;
472
473 bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
474}
475
476/* The following functions operate on all IB devices. netdevice_event and
477 * addr_event execute ib_enum_all_roce_netdevs through a work item.
478 * ib_enum_all_roce_netdevs iterates through all IB devices.
479 */
480
481static void netdevice_event_work_handler(struct work_struct *_work)
482{
483 struct netdev_event_work *work =
484 container_of(_work, struct netdev_event_work, work);
485 unsigned int i;
486
487 for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
488 ib_enum_all_roce_netdevs(work->cmds[i].filter,
489 work->cmds[i].filter_ndev,
490 work->cmds[i].cb,
491 work->cmds[i].ndev);
492 dev_put(work->cmds[i].ndev);
493 dev_put(work->cmds[i].filter_ndev);
494 }
495
496 kfree(work);
497}
498
499static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
500 struct net_device *ndev)
501{
502 unsigned int i;
503 struct netdev_event_work *ndev_work =
504 kmalloc(sizeof(*ndev_work), GFP_KERNEL);
505
506 if (!ndev_work) {
507 pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
508 return NOTIFY_DONE;
509 }
510
511 memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
512 for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
513 if (!ndev_work->cmds[i].ndev)
514 ndev_work->cmds[i].ndev = ndev;
515 if (!ndev_work->cmds[i].filter_ndev)
516 ndev_work->cmds[i].filter_ndev = ndev;
517 dev_hold(ndev_work->cmds[i].ndev);
518 dev_hold(ndev_work->cmds[i].filter_ndev);
519 }
520 INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
521
522 queue_work(ib_wq, &ndev_work->work);
523
524 return NOTIFY_DONE;
525}
526
527static const struct netdev_event_work_cmd add_cmd = {
528 .cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
529static const struct netdev_event_work_cmd add_cmd_upper_ips = {
530 .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
531
532static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
533 struct netdev_event_work_cmd *cmds)
534{
535 static const struct netdev_event_work_cmd upper_ips_del_cmd = {
536 .cb = del_netdev_upper_ips, .filter = upper_device_filter};
537 static const struct netdev_event_work_cmd bonding_default_del_cmd = {
538 .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
539
540 if (changeupper_info->linking == false) {
541 cmds[0] = upper_ips_del_cmd;
542 cmds[0].ndev = changeupper_info->upper_dev;
543 cmds[1] = add_cmd;
544 } else {
545 cmds[0] = bonding_default_del_cmd;
546 cmds[0].ndev = changeupper_info->upper_dev;
547 cmds[1] = add_cmd_upper_ips;
548 cmds[1].ndev = changeupper_info->upper_dev;
549 cmds[1].filter_ndev = changeupper_info->upper_dev;
550 }
551}
552
553static int netdevice_event(struct notifier_block *this, unsigned long event,
554 void *ptr)
555{
556 static const struct netdev_event_work_cmd del_cmd = {
557 .cb = del_netdev_ips, .filter = pass_all_filter};
558 static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
559 .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
560 static const struct netdev_event_work_cmd default_del_cmd = {
561 .cb = del_netdev_default_ips, .filter = pass_all_filter};
562 static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
563 .cb = del_netdev_upper_ips, .filter = upper_device_filter};
564 struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
565 struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
566
567 if (ndev->type != ARPHRD_ETHER)
568 return NOTIFY_DONE;
569
570 switch (event) {
571 case NETDEV_REGISTER:
572 case NETDEV_UP:
573 cmds[0] = bonding_default_del_cmd_join;
574 cmds[1] = add_cmd;
575 break;
576
577 case NETDEV_UNREGISTER:
578 if (ndev->reg_state < NETREG_UNREGISTERED)
579 cmds[0] = del_cmd;
580 else
581 return NOTIFY_DONE;
582 break;
583
584 case NETDEV_CHANGEADDR:
585 cmds[0] = default_del_cmd;
586 cmds[1] = add_cmd;
587 break;
588
589 case NETDEV_CHANGEUPPER:
590 netdevice_event_changeupper(
591 container_of(ptr, struct netdev_notifier_changeupper_info, info),
592 cmds);
593 break;
594
595 case NETDEV_BONDING_FAILOVER:
596 cmds[0] = bonding_event_ips_del_cmd;
597 cmds[1] = bonding_default_del_cmd_join;
598 cmds[2] = add_cmd_upper_ips;
599 break;
600
601 default:
602 return NOTIFY_DONE;
603 }
604
605 return netdevice_queue_work(cmds, ndev);
606}
607
608static void update_gid_event_work_handler(struct work_struct *_work)
609{
610 struct update_gid_event_work *work =
611 container_of(_work, struct update_gid_event_work, work);
612
613 ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
614 callback_for_addr_gid_device_scan, work);
615
616 dev_put(work->gid_attr.ndev);
617 kfree(work);
618}
619
620static int addr_event(struct notifier_block *this, unsigned long event,
621 struct sockaddr *sa, struct net_device *ndev)
622{
623 struct update_gid_event_work *work;
624 enum gid_op_type gid_op;
625
626 if (ndev->type != ARPHRD_ETHER)
627 return NOTIFY_DONE;
628
629 switch (event) {
630 case NETDEV_UP:
631 gid_op = GID_ADD;
632 break;
633
634 case NETDEV_DOWN:
635 gid_op = GID_DEL;
636 break;
637
638 default:
639 return NOTIFY_DONE;
640 }
641
642 work = kmalloc(sizeof(*work), GFP_ATOMIC);
643 if (!work) {
644 pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
645 return NOTIFY_DONE;
646 }
647
648 INIT_WORK(&work->work, update_gid_event_work_handler);
649
650 rdma_ip2gid(sa, &work->gid);
651 work->gid_op = gid_op;
652
653 memset(&work->gid_attr, 0, sizeof(work->gid_attr));
654 dev_hold(ndev);
655 work->gid_attr.ndev = ndev;
656
657 queue_work(ib_wq, &work->work);
658
659 return NOTIFY_DONE;
660}
661
662static int inetaddr_event(struct notifier_block *this, unsigned long event,
663 void *ptr)
664{
665 struct sockaddr_in in;
666 struct net_device *ndev;
667 struct in_ifaddr *ifa = ptr;
668
669 in.sin_family = AF_INET;
670 in.sin_addr.s_addr = ifa->ifa_address;
671 ndev = ifa->ifa_dev->dev;
672
673 return addr_event(this, event, (struct sockaddr *)&in, ndev);
674}
675
676static int inet6addr_event(struct notifier_block *this, unsigned long event,
677 void *ptr)
678{
679 struct sockaddr_in6 in6;
680 struct net_device *ndev;
681 struct inet6_ifaddr *ifa6 = ptr;
682
683 in6.sin6_family = AF_INET6;
684 in6.sin6_addr = ifa6->addr;
685 ndev = ifa6->idev->dev;
686
687 return addr_event(this, event, (struct sockaddr *)&in6, ndev);
688}
689
690static struct notifier_block nb_netdevice = {
691 .notifier_call = netdevice_event
692};
693
694static struct notifier_block nb_inetaddr = {
695 .notifier_call = inetaddr_event
696};
697
698static struct notifier_block nb_inet6addr = {
699 .notifier_call = inet6addr_event
700};
701
702int __init roce_gid_mgmt_init(void)
703{
704 register_inetaddr_notifier(&nb_inetaddr);
705 if (IS_ENABLED(CONFIG_IPV6))
706 register_inet6addr_notifier(&nb_inet6addr);
707 /* We rely on the netdevice notifier to enumerate all
708 * existing devices in the system. Register with this notifier
709 * last to make sure we will not miss any IP add/del
710 * callbacks.
711 */
712 register_netdevice_notifier(&nb_netdevice);
713
714 return 0;
715}
716
717void __exit roce_gid_mgmt_cleanup(void)
718{
719 if (IS_ENABLED(CONFIG_IPV6))
720 unregister_inet6addr_notifier(&nb_inet6addr);
721 unregister_inetaddr_notifier(&nb_inetaddr);
722 unregister_netdevice_notifier(&nb_netdevice);
723 /* Ensure all gid deletion tasks complete before we go down,
724 * to avoid any reference to free'd memory. By the time
725 * ib-core is removed, all physical devices have been removed,
726 * so no issue with remaining hardware contexts.
727 */
728}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ca919f429666..8c014b33d8e0 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
45#include <uapi/linux/if_ether.h> 45#include <uapi/linux/if_ether.h>
46#include <rdma/ib_pack.h> 46#include <rdma/ib_pack.h>
47#include <rdma/ib_cache.h> 47#include <rdma/ib_cache.h>
48#include <rdma/rdma_netlink.h>
49#include <net/netlink.h>
50#include <uapi/rdma/ib_user_sa.h>
51#include <rdma/ib_marshall.h>
48#include "sa.h" 52#include "sa.h"
49 53
50MODULE_AUTHOR("Roland Dreier"); 54MODULE_AUTHOR("Roland Dreier");
51MODULE_DESCRIPTION("InfiniBand subnet administration query support"); 55MODULE_DESCRIPTION("InfiniBand subnet administration query support");
52MODULE_LICENSE("Dual BSD/GPL"); 56MODULE_LICENSE("Dual BSD/GPL");
53 57
58#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
59#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
60#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
61static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
62
54struct ib_sa_sm_ah { 63struct ib_sa_sm_ah {
55 struct ib_ah *ah; 64 struct ib_ah *ah;
56 struct kref ref; 65 struct kref ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
80 struct ib_mad_send_buf *mad_buf; 89 struct ib_mad_send_buf *mad_buf;
81 struct ib_sa_sm_ah *sm_ah; 90 struct ib_sa_sm_ah *sm_ah;
82 int id; 91 int id;
92 u32 flags;
93 struct list_head list; /* Local svc request list */
94 u32 seq; /* Local svc request sequence number */
95 unsigned long timeout; /* Local svc timeout */
96 u8 path_use; /* How will the pathrecord be used */
83}; 97};
84 98
99#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
100#define IB_SA_CANCEL 0x00000002
101
85struct ib_sa_service_query { 102struct ib_sa_service_query {
86 void (*callback)(int, struct ib_sa_service_rec *, void *); 103 void (*callback)(int, struct ib_sa_service_rec *, void *);
87 void *context; 104 void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
106 struct ib_sa_query sa_query; 123 struct ib_sa_query sa_query;
107}; 124};
108 125
126static LIST_HEAD(ib_nl_request_list);
127static DEFINE_SPINLOCK(ib_nl_request_lock);
128static atomic_t ib_nl_sa_request_seq;
129static struct workqueue_struct *ib_nl_wq;
130static struct delayed_work ib_nl_timed_work;
131static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
132 [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
133 .len = sizeof(struct ib_path_rec_data)},
134 [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
135 [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
136 [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
137 .len = sizeof(struct rdma_nla_ls_gid)},
138 [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
139 .len = sizeof(struct rdma_nla_ls_gid)},
140 [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
141 [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
142 [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
143};
144
145
109static void ib_sa_add_one(struct ib_device *device); 146static void ib_sa_add_one(struct ib_device *device);
110static void ib_sa_remove_one(struct ib_device *device); 147static void ib_sa_remove_one(struct ib_device *device, void *client_data);
111 148
112static struct ib_client sa_client = { 149static struct ib_client sa_client = {
113 .name = "sa", 150 .name = "sa",
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
381 .size_bits = 512 }, 418 .size_bits = 512 },
382}; 419};
383 420
421static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
422{
423 query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
424}
425
426static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
427{
428 return (query->flags & IB_SA_CANCEL);
429}
430
431static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
432 struct ib_sa_query *query)
433{
434 struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
435 struct ib_sa_mad *mad = query->mad_buf->mad;
436 ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
437 u16 val16;
438 u64 val64;
439 struct rdma_ls_resolve_header *header;
440
441 query->mad_buf->context[1] = NULL;
442
443 /* Construct the family header first */
444 header = (struct rdma_ls_resolve_header *)
445 skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
446 memcpy(header->device_name, query->port->agent->device->name,
447 LS_DEVICE_NAME_MAX);
448 header->port_num = query->port->port_num;
449
450 if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
451 sa_rec->reversible != 0)
452 query->path_use = LS_RESOLVE_PATH_USE_GMP;
453 else
454 query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
455 header->path_use = query->path_use;
456
457 /* Now build the attributes */
458 if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
459 val64 = be64_to_cpu(sa_rec->service_id);
460 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
461 sizeof(val64), &val64);
462 }
463 if (comp_mask & IB_SA_PATH_REC_DGID)
464 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
465 sizeof(sa_rec->dgid), &sa_rec->dgid);
466 if (comp_mask & IB_SA_PATH_REC_SGID)
467 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
468 sizeof(sa_rec->sgid), &sa_rec->sgid);
469 if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
470 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
471 sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
472
473 if (comp_mask & IB_SA_PATH_REC_PKEY) {
474 val16 = be16_to_cpu(sa_rec->pkey);
475 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
476 sizeof(val16), &val16);
477 }
478 if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
479 val16 = be16_to_cpu(sa_rec->qos_class);
480 nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
481 sizeof(val16), &val16);
482 }
483}
484
485static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
486{
487 int len = 0;
488
489 if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
490 len += nla_total_size(sizeof(u64));
491 if (comp_mask & IB_SA_PATH_REC_DGID)
492 len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
493 if (comp_mask & IB_SA_PATH_REC_SGID)
494 len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
495 if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
496 len += nla_total_size(sizeof(u8));
497 if (comp_mask & IB_SA_PATH_REC_PKEY)
498 len += nla_total_size(sizeof(u16));
499 if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
500 len += nla_total_size(sizeof(u16));
501
502 /*
503 * Make sure that at least some of the required comp_mask bits are
504 * set.
505 */
506 if (WARN_ON(len == 0))
507 return len;
508
509 /* Add the family header */
510 len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
511
512 return len;
513}
514
515static int ib_nl_send_msg(struct ib_sa_query *query)
516{
517 struct sk_buff *skb = NULL;
518 struct nlmsghdr *nlh;
519 void *data;
520 int ret = 0;
521 struct ib_sa_mad *mad;
522 int len;
523
524 mad = query->mad_buf->mad;
525 len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
526 if (len <= 0)
527 return -EMSGSIZE;
528
529 skb = nlmsg_new(len, GFP_KERNEL);
530 if (!skb)
531 return -ENOMEM;
532
533 /* Put nlmsg header only for now */
534 data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
535 RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
536 if (!data) {
537 kfree_skb(skb);
538 return -EMSGSIZE;
539 }
540
541 /* Add attributes */
542 ib_nl_set_path_rec_attrs(skb, query);
543
544 /* Repair the nlmsg header length */
545 nlmsg_end(skb, nlh);
546
547 ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
548 if (!ret)
549 ret = len;
550 else
551 ret = 0;
552
553 return ret;
554}
555
556static int ib_nl_make_request(struct ib_sa_query *query)
557{
558 unsigned long flags;
559 unsigned long delay;
560 int ret;
561
562 INIT_LIST_HEAD(&query->list);
563 query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
564
565 spin_lock_irqsave(&ib_nl_request_lock, flags);
566 ret = ib_nl_send_msg(query);
567 if (ret <= 0) {
568 ret = -EIO;
569 goto request_out;
570 } else {
571 ret = 0;
572 }
573
574 delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
575 query->timeout = delay + jiffies;
576 list_add_tail(&query->list, &ib_nl_request_list);
577 /* Start the timeout if this is the only request */
578 if (ib_nl_request_list.next == &query->list)
579 queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
580
581request_out:
582 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
583
584 return ret;
585}
586
587static int ib_nl_cancel_request(struct ib_sa_query *query)
588{
589 unsigned long flags;
590 struct ib_sa_query *wait_query;
591 int found = 0;
592
593 spin_lock_irqsave(&ib_nl_request_lock, flags);
594 list_for_each_entry(wait_query, &ib_nl_request_list, list) {
595 /* Let the timeout take care of the callback */
596 if (query == wait_query) {
597 query->flags |= IB_SA_CANCEL;
598 query->timeout = jiffies;
599 list_move(&query->list, &ib_nl_request_list);
600 found = 1;
601 mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
602 break;
603 }
604 }
605 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
606
607 return found;
608}
609
610static void send_handler(struct ib_mad_agent *agent,
611 struct ib_mad_send_wc *mad_send_wc);
612
613static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
614 const struct nlmsghdr *nlh)
615{
616 struct ib_mad_send_wc mad_send_wc;
617 struct ib_sa_mad *mad = NULL;
618 const struct nlattr *head, *curr;
619 struct ib_path_rec_data *rec;
620 int len, rem;
621 u32 mask = 0;
622 int status = -EIO;
623
624 if (query->callback) {
625 head = (const struct nlattr *) nlmsg_data(nlh);
626 len = nlmsg_len(nlh);
627 switch (query->path_use) {
628 case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
629 mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
630 break;
631
632 case LS_RESOLVE_PATH_USE_ALL:
633 case LS_RESOLVE_PATH_USE_GMP:
634 default:
635 mask = IB_PATH_PRIMARY | IB_PATH_GMP |
636 IB_PATH_BIDIRECTIONAL;
637 break;
638 }
639 nla_for_each_attr(curr, head, len, rem) {
640 if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
641 rec = nla_data(curr);
642 /*
643 * Get the first one. In the future, we may
644 * need to get up to 6 pathrecords.
645 */
646 if ((rec->flags & mask) == mask) {
647 mad = query->mad_buf->mad;
648 mad->mad_hdr.method |=
649 IB_MGMT_METHOD_RESP;
650 memcpy(mad->data, rec->path_rec,
651 sizeof(rec->path_rec));
652 status = 0;
653 break;
654 }
655 }
656 }
657 query->callback(query, status, mad);
658 }
659
660 mad_send_wc.send_buf = query->mad_buf;
661 mad_send_wc.status = IB_WC_SUCCESS;
662 send_handler(query->mad_buf->mad_agent, &mad_send_wc);
663}
664
665static void ib_nl_request_timeout(struct work_struct *work)
666{
667 unsigned long flags;
668 struct ib_sa_query *query;
669 unsigned long delay;
670 struct ib_mad_send_wc mad_send_wc;
671 int ret;
672
673 spin_lock_irqsave(&ib_nl_request_lock, flags);
674 while (!list_empty(&ib_nl_request_list)) {
675 query = list_entry(ib_nl_request_list.next,
676 struct ib_sa_query, list);
677
678 if (time_after(query->timeout, jiffies)) {
679 delay = query->timeout - jiffies;
680 if ((long)delay <= 0)
681 delay = 1;
682 queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
683 break;
684 }
685
686 list_del(&query->list);
687 ib_sa_disable_local_svc(query);
688 /* Hold the lock to protect against query cancellation */
689 if (ib_sa_query_cancelled(query))
690 ret = -1;
691 else
692 ret = ib_post_send_mad(query->mad_buf, NULL);
693 if (ret) {
694 mad_send_wc.send_buf = query->mad_buf;
695 mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
696 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
697 send_handler(query->port->agent, &mad_send_wc);
698 spin_lock_irqsave(&ib_nl_request_lock, flags);
699 }
700 }
701 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
702}
703
704static int ib_nl_handle_set_timeout(struct sk_buff *skb,
705 struct netlink_callback *cb)
706{
707 const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
708 int timeout, delta, abs_delta;
709 const struct nlattr *attr;
710 unsigned long flags;
711 struct ib_sa_query *query;
712 long delay = 0;
713 struct nlattr *tb[LS_NLA_TYPE_MAX];
714 int ret;
715
716 if (!netlink_capable(skb, CAP_NET_ADMIN))
717 return -EPERM;
718
719 ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
720 nlmsg_len(nlh), ib_nl_policy);
721 attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
722 if (ret || !attr)
723 goto settimeout_out;
724
725 timeout = *(int *) nla_data(attr);
726 if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
727 timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
728 if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
729 timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
730
731 delta = timeout - sa_local_svc_timeout_ms;
732 if (delta < 0)
733 abs_delta = -delta;
734 else
735 abs_delta = delta;
736
737 if (delta != 0) {
738 spin_lock_irqsave(&ib_nl_request_lock, flags);
739 sa_local_svc_timeout_ms = timeout;
740 list_for_each_entry(query, &ib_nl_request_list, list) {
741 if (delta < 0 && abs_delta > query->timeout)
742 query->timeout = 0;
743 else
744 query->timeout += delta;
745
746 /* Get the new delay from the first entry */
747 if (!delay) {
748 delay = query->timeout - jiffies;
749 if (delay <= 0)
750 delay = 1;
751 }
752 }
753 if (delay)
754 mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
755 (unsigned long)delay);
756 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
757 }
758
759settimeout_out:
760 return skb->len;
761}
762
763static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
764{
765 struct nlattr *tb[LS_NLA_TYPE_MAX];
766 int ret;
767
768 if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
769 return 0;
770
771 ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
772 nlmsg_len(nlh), ib_nl_policy);
773 if (ret)
774 return 0;
775
776 return 1;
777}
778
779static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
780 struct netlink_callback *cb)
781{
782 const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
783 unsigned long flags;
784 struct ib_sa_query *query;
785 struct ib_mad_send_buf *send_buf;
786 struct ib_mad_send_wc mad_send_wc;
787 int found = 0;
788 int ret;
789
790 if (!netlink_capable(skb, CAP_NET_ADMIN))
791 return -EPERM;
792
793 spin_lock_irqsave(&ib_nl_request_lock, flags);
794 list_for_each_entry(query, &ib_nl_request_list, list) {
795 /*
796 * If the query is cancelled, let the timeout routine
797 * take care of it.
798 */
799 if (nlh->nlmsg_seq == query->seq) {
800 found = !ib_sa_query_cancelled(query);
801 if (found)
802 list_del(&query->list);
803 break;
804 }
805 }
806
807 if (!found) {
808 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
809 goto resp_out;
810 }
811
812 send_buf = query->mad_buf;
813
814 if (!ib_nl_is_good_resolve_resp(nlh)) {
815 /* if the result is a failure, send out the packet via IB */
816 ib_sa_disable_local_svc(query);
817 ret = ib_post_send_mad(query->mad_buf, NULL);
818 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
819 if (ret) {
820 mad_send_wc.send_buf = send_buf;
821 mad_send_wc.status = IB_WC_GENERAL_ERR;
822 send_handler(query->port->agent, &mad_send_wc);
823 }
824 } else {
825 spin_unlock_irqrestore(&ib_nl_request_lock, flags);
826 ib_nl_process_good_resolve_rsp(query, nlh);
827 }
828
829resp_out:
830 return skb->len;
831}
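ib_nl_handle_resolve_resp() matches the incoming netlink message to a pending query by sequence number while holding the request lock, unlinks it unless it has already been cancelled (in which case the timeout worker owns it), and then either completes it from the netlink payload or falls back to posting the original MAD. A userspace model of the sequence-number lookup (the structures below are illustrative only):

#include <stdio.h>
#include <stdbool.h>

struct pending {
	unsigned int seq;
	bool cancelled;
	struct pending *next;
};

/* Find a live pending request by sequence number and unlink it. */
static struct pending *take_by_seq(struct pending **head, unsigned int seq)
{
	for (struct pending **pp = head; *pp; pp = &(*pp)->next) {
		struct pending *p = *pp;

		if (p->seq != seq)
			continue;
		if (p->cancelled)        /* leave it for the timeout path */
			return NULL;
		*pp = p->next;           /* unlink */
		return p;
	}
	return NULL;
}

int main(void)
{
	struct pending b = { 7, false, NULL }, a = { 3, true, &b };
	struct pending *head = &a;

	printf("%p\n", (void *)take_by_seq(&head, 3)); /* NULL: already cancelled */
	printf("%u\n", take_by_seq(&head, 7)->seq);    /* 7 */
	return 0;
}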
832
833static struct ibnl_client_cbs ib_sa_cb_table[] = {
834 [RDMA_NL_LS_OP_RESOLVE] = {
835 .dump = ib_nl_handle_resolve_resp,
836 .module = THIS_MODULE },
837 [RDMA_NL_LS_OP_SET_TIMEOUT] = {
838 .dump = ib_nl_handle_set_timeout,
839 .module = THIS_MODULE },
840};
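ib_sa_cb_table[] registers one callback per local-service opcode with the RDMA netlink layer, which dispatches on the operation carried in the message header. The same table-dispatch pattern in plain C (the opcodes and handler type here are stand-ins, not the rdma_netlink definitions):

#include <stdio.h>

enum { OP_RESOLVE, OP_SET_TIMEOUT, OP_MAX };

typedef int (*op_handler)(const char *payload);

static int handle_resolve(const char *p)     { printf("resolve: %s\n", p); return 0; }
static int handle_set_timeout(const char *p) { printf("timeout: %s\n", p); return 0; }

static const op_handler handlers[OP_MAX] = {
	[OP_RESOLVE]     = handle_resolve,
	[OP_SET_TIMEOUT] = handle_set_timeout,
};

static int dispatch(int op, const char *payload)
{
	if (op < 0 || op >= OP_MAX || !handlers[op])
		return -1;                      /* unknown opcode */
	return handlers[op](payload);
}

int main(void)
{
	dispatch(OP_RESOLVE, "path record request");
	dispatch(OP_SET_TIMEOUT, "2000");
	return 0;
}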
841
384static void free_sm_ah(struct kref *kref) 842static void free_sm_ah(struct kref *kref)
385{ 843{
386 struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); 844 struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
502 mad_buf = query->mad_buf; 960 mad_buf = query->mad_buf;
503 spin_unlock_irqrestore(&idr_lock, flags); 961 spin_unlock_irqrestore(&idr_lock, flags);
504 962
505 ib_cancel_mad(agent, mad_buf); 963 /*
964 * If the query is still on the netlink request list, schedule
965 * it to be cancelled by the timeout routine. Otherwise, it has been
966 * sent to the MAD layer and has to be cancelled from there.
967 */
968 if (!ib_nl_cancel_request(query))
969 ib_cancel_mad(agent, mad_buf);
506} 970}
507EXPORT_SYMBOL(ib_sa_cancel_query); 971EXPORT_SYMBOL(ib_sa_cancel_query);
508 972
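The change to ib_sa_cancel_query() makes cancellation two-staged: if the query is still sitting on the netlink request list, ib_nl_cancel_request() marks it and the timeout worker finishes it; only when it is no longer on that list has it reached the MAD layer, so ib_cancel_mad() is the right tool. A sketch of that try-the-first-owner-then-fall-back shape (the function names below are made up for the example):

#include <stdio.h>
#include <stdbool.h>

/* Stand-ins for the two possible owners of an in-flight request. */
static bool cancel_from_pending_list(int id)
{
	/* Pretend only even ids are still queued locally. */
	return (id % 2) == 0;
}

static void cancel_at_lower_layer(int id)
{
	printf("request %d cancelled at the lower layer\n", id);
}

static void cancel_request(int id)
{
	if (!cancel_from_pending_list(id))      /* not ours any more ... */
		cancel_at_lower_layer(id);      /* ... so the lower layer owns it */
	else
		printf("request %d will be reaped by the timeout worker\n", id);
}

int main(void)
{
	cancel_request(4);
	cancel_request(7);
	return 0;
}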
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
639 query->mad_buf->context[0] = query; 1103 query->mad_buf->context[0] = query;
640 query->id = id; 1104 query->id = id;
641 1105
1106 if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
1107 if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
1108 if (!ib_nl_make_request(query))
1109 return id;
1110 }
1111 ib_sa_disable_local_svc(query);
1112 }
1113
642 ret = ib_post_send_mad(query->mad_buf, NULL); 1114 ret = ib_post_send_mad(query->mad_buf, NULL);
643 if (ret) { 1115 if (ret) {
644 spin_lock_irqsave(&idr_lock, flags); 1116 spin_lock_irqsave(&idr_lock, flags);
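In send_mad(), the query takes the netlink path only when local service is enabled for it and a userspace listener is registered on the local-service netlink group; if queuing the netlink request fails, or no listener exists, local service is disabled for the query and it falls through to the usual ib_post_send_mad() path. A compact userspace model of that gating decision (all names below are invented; the flag is per-query in the kernel, global here for brevity):

#include <stdio.h>
#include <stdbool.h>

static bool local_service_enabled = true;
static bool listener_present      = true;

static bool queue_to_userspace(int id)
{
	printf("queued %d to userspace\n", id);
	return true;
}

static void send_query(int id)
{
	if (local_service_enabled) {
		if (listener_present && queue_to_userspace(id))
			return;                          /* resolved later via netlink */
		local_service_enabled = false;           /* fall back for this attempt */
	}
	printf("posting %d directly\n", id);             /* the ib_post_send_mad() path */
}

int main(void)
{
	send_query(1);
	listener_present = false;
	local_service_enabled = true;
	send_query(2);
	return 0;
}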
@@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
740 port = &sa_dev->port[port_num - sa_dev->start_port]; 1212 port = &sa_dev->port[port_num - sa_dev->start_port];
741 agent = port->agent; 1213 agent = port->agent;
742 1214
743 query = kmalloc(sizeof *query, gfp_mask); 1215 query = kzalloc(sizeof(*query), gfp_mask);
744 if (!query) 1216 if (!query)
745 return -ENOMEM; 1217 return -ENOMEM;
746 1218
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
767 1239
768 *sa_query = &query->sa_query; 1240 *sa_query = &query->sa_query;
769 1241
1242 query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
1243 query->sa_query.mad_buf->context[1] = rec;
1244
770 ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); 1245 ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
771 if (ret < 0) 1246 if (ret < 0)
772 goto err2; 1247 goto err2;
@@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
862 method != IB_SA_METHOD_DELETE) 1337 method != IB_SA_METHOD_DELETE)
863 return -EINVAL; 1338 return -EINVAL;
864 1339
865 query = kmalloc(sizeof *query, gfp_mask); 1340 query = kzalloc(sizeof(*query), gfp_mask);
866 if (!query) 1341 if (!query)
867 return -ENOMEM; 1342 return -ENOMEM;
868 1343
@@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
954 port = &sa_dev->port[port_num - sa_dev->start_port]; 1429 port = &sa_dev->port[port_num - sa_dev->start_port];
955 agent = port->agent; 1430 agent = port->agent;
956 1431
957 query = kmalloc(sizeof *query, gfp_mask); 1432 query = kzalloc(sizeof(*query), gfp_mask);
958 if (!query) 1433 if (!query)
959 return -ENOMEM; 1434 return -ENOMEM;
960 1435
@@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
1051 port = &sa_dev->port[port_num - sa_dev->start_port]; 1526 port = &sa_dev->port[port_num - sa_dev->start_port];
1052 agent = port->agent; 1527 agent = port->agent;
1053 1528
1054 query = kmalloc(sizeof *query, gfp_mask); 1529 query = kzalloc(sizeof(*query), gfp_mask);
1055 if (!query) 1530 if (!query)
1056 return -ENOMEM; 1531 return -ENOMEM;
1057 1532
@@ -1221,9 +1696,9 @@ free:
1221 return; 1696 return;
1222} 1697}
1223 1698
1224static void ib_sa_remove_one(struct ib_device *device) 1699static void ib_sa_remove_one(struct ib_device *device, void *client_data)
1225{ 1700{
1226 struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); 1701 struct ib_sa_device *sa_dev = client_data;
1227 int i; 1702 int i;
1228 1703
1229 if (!sa_dev) 1704 if (!sa_dev)
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
1251 1726
1252 get_random_bytes(&tid, sizeof tid); 1727 get_random_bytes(&tid, sizeof tid);
1253 1728
1729 atomic_set(&ib_nl_sa_request_seq, 0);
1730
1254 ret = ib_register_client(&sa_client); 1731 ret = ib_register_client(&sa_client);
1255 if (ret) { 1732 if (ret) {
1256 printk(KERN_ERR "Couldn't register ib_sa client\n"); 1733 printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
1263 goto err2; 1740 goto err2;
1264 } 1741 }
1265 1742
1743 ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
1744 if (!ib_nl_wq) {
1745 ret = -ENOMEM;
1746 goto err3;
1747 }
1748
1749 if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
1750 ib_sa_cb_table)) {
1751 pr_err("Failed to add netlink callback\n");
1752 ret = -EINVAL;
1753 goto err4;
1754 }
1755 INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
1756
1266 return 0; 1757 return 0;
1758err4:
1759 destroy_workqueue(ib_nl_wq);
1760err3:
1761 mcast_cleanup();
1267err2: 1762err2:
1268 ib_unregister_client(&sa_client); 1763 ib_unregister_client(&sa_client);
1269err1: 1764err1:
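The new err4/err3 labels keep ib_sa_init() unwinding in exact reverse order of setup: a failed ibnl_add_client() destroys the workqueue, which then falls through to mcast_cleanup() and the earlier labels. The same goto-unwind idiom in a standalone sketch:

#include <stdio.h>
#include <stdlib.h>

static void *step(const char *name, int fail)
{
	if (fail) {
		printf("%s failed\n", name);
		return NULL;
	}
	printf("%s ok\n", name);
	return malloc(1);
}

int main(void)
{
	void *a = NULL, *b = NULL, *c = NULL;
	int ret = -1;

	a = step("register client", 0);
	if (!a)
		goto err1;
	b = step("create workqueue", 0);
	if (!b)
		goto err2;
	c = step("add netlink client", 1);       /* simulate the late failure */
	if (!c)
		goto err3;
	return 0;

err3:	free(b);                                 /* undo in reverse order of setup */
err2:	free(a);
err1:	return ret;
}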
@@ -1272,6 +1767,10 @@ err1:
1272 1767
1273static void __exit ib_sa_cleanup(void) 1768static void __exit ib_sa_cleanup(void)
1274{ 1769{
1770 ibnl_remove_client(RDMA_NL_LS);
1771 cancel_delayed_work(&ib_nl_timed_work);
1772 flush_workqueue(ib_nl_wq);
1773 destroy_workqueue(ib_nl_wq);
1275 mcast_cleanup(); 1774 mcast_cleanup();
1276 ib_unregister_client(&sa_client); 1775 ib_unregister_client(&sa_client);
1277 idr_destroy(&query_idr); 1776 idr_destroy(&query_idr);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 0b84a9cdfe5b..34cdd74b0a17 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
457 .default_attrs = port_default_attrs 457 .default_attrs = port_default_attrs
458}; 458};
459 459
460static void ib_device_release(struct device *device)
461{
462 struct ib_device *dev = container_of(device, struct ib_device, dev);
463
464 kfree(dev->port_immutable);
465 kfree(dev);
466}
467
468static int ib_device_uevent(struct device *device,
469 struct kobj_uevent_env *env)
470{
471 struct ib_device *dev = container_of(device, struct ib_device, dev);
472
473 if (add_uevent_var(env, "NAME=%s", dev->name))
474 return -ENOMEM;
475
476 /*
477 * It would be nice to pass the node GUID with the event...
478 */
479
480 return 0;
481}
482
483static struct attribute ** 460static struct attribute **
484alloc_group_attrs(ssize_t (*show)(struct ib_port *, 461alloc_group_attrs(ssize_t (*show)(struct ib_port *,
485 struct port_attribute *, char *buf), 462 struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
702 &dev_attr_node_desc 679 &dev_attr_node_desc
703}; 680};
704 681
705static struct class ib_class = {
706 .name = "infiniband",
707 .dev_release = ib_device_release,
708 .dev_uevent = ib_device_uevent,
709};
710
711/* Show a given an attribute in the statistics group */ 682/* Show a given an attribute in the statistics group */
712static ssize_t show_protocol_stat(const struct device *device, 683static ssize_t show_protocol_stat(const struct device *device,
713 struct device_attribute *attr, char *buf, 684 struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
846 int ret; 817 int ret;
847 int i; 818 int i;
848 819
849 class_dev->class = &ib_class; 820 device->dev.parent = device->dma_device;
850 class_dev->parent = device->dma_device; 821 ret = dev_set_name(class_dev, "%s", device->name);
851 dev_set_name(class_dev, "%s", device->name); 822 if (ret)
852 dev_set_drvdata(class_dev, device); 823 return ret;
853
854 INIT_LIST_HEAD(&device->port_list);
855 824
856 ret = device_register(class_dev); 825 ret = device_add(class_dev);
857 if (ret) 826 if (ret)
858 goto err; 827 goto err;
859 828
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
916 885
917 device_unregister(&device->dev); 886 device_unregister(&device->dev);
918} 887}
919
920int ib_sysfs_setup(void)
921{
922 return class_register(&ib_class);
923}
924
925void ib_sysfs_cleanup(void)
926{
927 class_unregister(&ib_class);
928}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 009481073644..6b4e8a008bc0 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
109#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) 109#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
110 110
111static void ib_ucm_add_one(struct ib_device *device); 111static void ib_ucm_add_one(struct ib_device *device);
112static void ib_ucm_remove_one(struct ib_device *device); 112static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
113 113
114static struct ib_client ucm_client = { 114static struct ib_client ucm_client = {
115 .name = "ucm", 115 .name = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
658 if (result) 658 if (result)
659 goto out; 659 goto out;
660 660
661 result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, 661 result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
662 NULL);
663out: 662out:
664 ib_ucm_ctx_put(ctx); 663 ib_ucm_ctx_put(ctx);
665 return result; 664 return result;
@@ -1310,9 +1309,9 @@ err:
1310 return; 1309 return;
1311} 1310}
1312 1311
1313static void ib_ucm_remove_one(struct ib_device *device) 1312static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
1314{ 1313{
1315 struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); 1314 struct ib_ucm_device *ucm_dev = client_data;
1316 1315
1317 if (!ucm_dev) 1316 if (!ucm_dev)
1318 return; 1317 return;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b21213ea75..a53fc9b01c69 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -74,6 +74,7 @@ struct ucma_file {
74 struct list_head ctx_list; 74 struct list_head ctx_list;
75 struct list_head event_list; 75 struct list_head event_list;
76 wait_queue_head_t poll_wait; 76 wait_queue_head_t poll_wait;
77 struct workqueue_struct *close_wq;
77}; 78};
78 79
79struct ucma_context { 80struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
89 90
90 struct list_head list; 91 struct list_head list;
91 struct list_head mc_list; 92 struct list_head mc_list;
 93	/* mark that the device is in the process of destroying its
 94	 * internal HW resources, protected by the global mut
95 */
96 int closing;
97 /* sync between removal event and id destroy, protected by file mut */
98 int destroying;
99 struct work_struct close_work;
92}; 100};
93 101
94struct ucma_multicast { 102struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
107 struct list_head list; 115 struct list_head list;
108 struct rdma_cm_id *cm_id; 116 struct rdma_cm_id *cm_id;
109 struct rdma_ucm_event_resp resp; 117 struct rdma_ucm_event_resp resp;
118 struct work_struct close_work;
110}; 119};
111 120
112static DEFINE_MUTEX(mut); 121static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
132 141
133 mutex_lock(&mut); 142 mutex_lock(&mut);
134 ctx = _ucma_find_context(id, file); 143 ctx = _ucma_find_context(id, file);
135 if (!IS_ERR(ctx)) 144 if (!IS_ERR(ctx)) {
136 atomic_inc(&ctx->ref); 145 if (ctx->closing)
146 ctx = ERR_PTR(-EIO);
147 else
148 atomic_inc(&ctx->ref);
149 }
137 mutex_unlock(&mut); 150 mutex_unlock(&mut);
138 return ctx; 151 return ctx;
139} 152}
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
144 complete(&ctx->comp); 157 complete(&ctx->comp);
145} 158}
146 159
160static void ucma_close_event_id(struct work_struct *work)
161{
162 struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
163
164 rdma_destroy_id(uevent_close->cm_id);
165 kfree(uevent_close);
166}
167
168static void ucma_close_id(struct work_struct *work)
169{
170 struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
171
172 /* once all inflight tasks are finished, we close all underlying
173	 * resources. The context is still alive until it is explicitly
174	 * destroyed by its creator.
175 */
176 ucma_put_ctx(ctx);
177 wait_for_completion(&ctx->comp);
178 /* No new events will be generated after destroying the id. */
179 rdma_destroy_id(ctx->cm_id);
180}
181
147static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) 182static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
148{ 183{
149 struct ucma_context *ctx; 184 struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
152 if (!ctx) 187 if (!ctx)
153 return NULL; 188 return NULL;
154 189
190 INIT_WORK(&ctx->close_work, ucma_close_id);
155 atomic_set(&ctx->ref, 1); 191 atomic_set(&ctx->ref, 1);
156 init_completion(&ctx->comp); 192 init_completion(&ctx->comp);
157 INIT_LIST_HEAD(&ctx->mc_list); 193 INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
242 } 278 }
243} 279}
244 280
281/* Called with file->mut locked for the relevant context. */
282static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
283{
284 struct ucma_context *ctx = cm_id->context;
285 struct ucma_event *con_req_eve;
286 int event_found = 0;
287
288 if (ctx->destroying)
289 return;
290
291	/* Only if the context owns this cm_id can it be queued to be
292	 * closed. Otherwise the cm_id belongs to an inflight connect
293	 * request on this context's event list, waiting to be detached
294	 * and reattached to its new context in ucma_get_event; that case
295	 * is handled separately below.
296 */
297 if (ctx->cm_id == cm_id) {
298 mutex_lock(&mut);
299 ctx->closing = 1;
300 mutex_unlock(&mut);
301 queue_work(ctx->file->close_wq, &ctx->close_work);
302 return;
303 }
304
305 list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
306 if (con_req_eve->cm_id == cm_id &&
307 con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
308 list_del(&con_req_eve->list);
309 INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
310 queue_work(ctx->file->close_wq, &con_req_eve->close_work);
311 event_found = 1;
312 break;
313 }
314 }
315 if (!event_found)
316 printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
317}
318
245static int ucma_event_handler(struct rdma_cm_id *cm_id, 319static int ucma_event_handler(struct rdma_cm_id *cm_id,
246 struct rdma_cm_event *event) 320 struct rdma_cm_event *event)
247{ 321{
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
276 * We ignore events for new connections until userspace has set 350 * We ignore events for new connections until userspace has set
277 * their context. This can only happen if an error occurs on a 351 * their context. This can only happen if an error occurs on a
278 * new connection before the user accepts it. This is okay, 352 * new connection before the user accepts it. This is okay,
279 * since the accept will just fail later. 353 * since the accept will just fail later. However, we do need
354 * to release the underlying HW resources in case of a device
355 * removal event.
280 */ 356 */
357 if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
358 ucma_removal_event_handler(cm_id);
359
281 kfree(uevent); 360 kfree(uevent);
282 goto out; 361 goto out;
283 } 362 }
284 363
285 list_add_tail(&uevent->list, &ctx->file->event_list); 364 list_add_tail(&uevent->list, &ctx->file->event_list);
286 wake_up_interruptible(&ctx->file->poll_wait); 365 wake_up_interruptible(&ctx->file->poll_wait);
366 if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
367 ucma_removal_event_handler(cm_id);
287out: 368out:
288 mutex_unlock(&ctx->file->mut); 369 mutex_unlock(&ctx->file->mut);
289 return ret; 370 return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
442} 523}
443 524
444/* 525/*
445 * We cannot hold file->mut when calling rdma_destroy_id() or we can 526 * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
446 * deadlock. We also acquire file->mut in ucma_event_handler(), and 527 * this point, no new events will be reported from the hardware. However, we
447 * rdma_destroy_id() will wait until all callbacks have completed. 528 * still need to cleanup the UCMA context for this ID. Specifically, there
529 * might be events that have not yet been consumed by the user space software.
530 * These might include pending connect requests which we have not completed
531 * processing. We cannot call rdma_destroy_id while holding the lock of the
532 * context (file->mut), as it might cause a deadlock. We therefore extract all
533 * relevant events from the context pending events list while holding the
534 * mutex. After that we release them as needed.
448 */ 535 */
449static int ucma_free_ctx(struct ucma_context *ctx) 536static int ucma_free_ctx(struct ucma_context *ctx)
450{ 537{
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
452 struct ucma_event *uevent, *tmp; 539 struct ucma_event *uevent, *tmp;
453 LIST_HEAD(list); 540 LIST_HEAD(list);
454 541
455 /* No new events will be generated after destroying the id. */
456 rdma_destroy_id(ctx->cm_id);
457 542
458 ucma_cleanup_multicast(ctx); 543 ucma_cleanup_multicast(ctx);
459 544
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
501 if (IS_ERR(ctx)) 586 if (IS_ERR(ctx))
502 return PTR_ERR(ctx); 587 return PTR_ERR(ctx);
503 588
504 ucma_put_ctx(ctx); 589 mutex_lock(&ctx->file->mut);
505 wait_for_completion(&ctx->comp); 590 ctx->destroying = 1;
506 resp.events_reported = ucma_free_ctx(ctx); 591 mutex_unlock(&ctx->file->mut);
507 592
593 flush_workqueue(ctx->file->close_wq);
594 /* At this point it's guaranteed that there is no inflight
595 * closing task */
596 mutex_lock(&mut);
597 if (!ctx->closing) {
598 mutex_unlock(&mut);
599 ucma_put_ctx(ctx);
600 wait_for_completion(&ctx->comp);
601 rdma_destroy_id(ctx->cm_id);
602 } else {
603 mutex_unlock(&mut);
604 }
605
606 resp.events_reported = ucma_free_ctx(ctx);
508 if (copy_to_user((void __user *)(unsigned long)cmd.response, 607 if (copy_to_user((void __user *)(unsigned long)cmd.response,
509 &resp, sizeof(resp))) 608 &resp, sizeof(resp)))
510 ret = -EFAULT; 609 ret = -EFAULT;
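The reworked ucma_destroy_id() first marks the context as destroying under file->mut, then flushes the per-file close workqueue so no close work is still in flight. Only if no worker claimed the context (ctx->closing is still clear under the global mutex) does it drop its reference, wait for outstanding users and destroy the CM ID itself; otherwise the worker already owns that teardown. A userspace model of the flush-then-check handshake (sequential and simplified, all names invented):

#include <stdio.h>
#include <stdbool.h>

struct ctx {
	bool destroying;   /* set by the destroy path before flushing workers */
	bool closing;      /* set by a removal-event worker that owns teardown */
};

static void destroy_ctx(struct ctx *ctx)
{
	ctx->destroying = true;
	/* flush_workqueue(...) would run here; afterwards no worker is in flight */

	if (!ctx->closing)
		printf("destroy path tears down the CM ID itself\n");
	else
		printf("a close worker already owns the teardown\n");
}

int main(void)
{
	struct ctx quiet   = { false, false };
	struct ctx removed = { false, true };

	destroy_ctx(&quiet);
	destroy_ctx(&removed);
	return 0;
}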
@@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
1321 mc = ERR_PTR(-ENOENT); 1420 mc = ERR_PTR(-ENOENT);
1322 else if (mc->ctx->file != file) 1421 else if (mc->ctx->file != file)
1323 mc = ERR_PTR(-EINVAL); 1422 mc = ERR_PTR(-EINVAL);
1324 else { 1423 else if (!atomic_inc_not_zero(&mc->ctx->ref))
1424 mc = ERR_PTR(-ENXIO);
1425 else
1325 idr_remove(&multicast_idr, mc->id); 1426 idr_remove(&multicast_idr, mc->id);
1326 atomic_inc(&mc->ctx->ref);
1327 }
1328 mutex_unlock(&mut); 1427 mutex_unlock(&mut);
1329 1428
1330 if (IS_ERR(mc)) { 1429 if (IS_ERR(mc)) {
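ucma_leave_multicast() now takes the context reference with atomic_inc_not_zero(), so a reference is acquired only while the context is still live; if the count has already hit zero the caller backs off with -ENXIO instead of resurrecting a dying object. The same take-a-reference-only-if-one-exists idiom with C11 atomics (a userspace sketch, not the kernel's atomic_t API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

/* Increment *ref unless it is already zero; returns true if a ref was taken. */
static bool get_ref_not_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;    /* successfully moved old -> old + 1 */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;                   /* object is already on its way out */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %d\n", get_ref_not_zero(&live));   /* 1 */
	printf("dying: %d\n", get_ref_not_zero(&dying));  /* 0 */
	return 0;
}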
@@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
1529 INIT_LIST_HEAD(&file->ctx_list); 1628 INIT_LIST_HEAD(&file->ctx_list);
1530 init_waitqueue_head(&file->poll_wait); 1629 init_waitqueue_head(&file->poll_wait);
1531 mutex_init(&file->mut); 1630 mutex_init(&file->mut);
1631 file->close_wq = create_singlethread_workqueue("ucma_close_id");
1532 1632
1533 filp->private_data = file; 1633 filp->private_data = file;
1534 file->filp = filp; 1634 file->filp = filp;
@@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
1543 1643
1544 mutex_lock(&file->mut); 1644 mutex_lock(&file->mut);
1545 list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { 1645 list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
1646 ctx->destroying = 1;
1546 mutex_unlock(&file->mut); 1647 mutex_unlock(&file->mut);
1547 1648
1548 mutex_lock(&mut); 1649 mutex_lock(&mut);
1549 idr_remove(&ctx_idr, ctx->id); 1650 idr_remove(&ctx_idr, ctx->id);
1550 mutex_unlock(&mut); 1651 mutex_unlock(&mut);
1551 1652
1653 flush_workqueue(file->close_wq);
1654		/* At this point the ctx has been marked as destroying and
1655		 * the workqueue has been flushed, so we are safe from any
1656		 * inflight handlers that might queue another closing task.
1657 */
1658 mutex_lock(&mut);
1659 if (!ctx->closing) {
1660 mutex_unlock(&mut);
1661 /* rdma_destroy_id ensures that no event handlers are
1662 * inflight for that id before releasing it.
1663 */
1664 rdma_destroy_id(ctx->cm_id);
1665 } else {
1666 mutex_unlock(&mut);
1667 }
1668
1552 ucma_free_ctx(ctx); 1669 ucma_free_ctx(ctx);
1553 mutex_lock(&file->mut); 1670 mutex_lock(&file->mut);
1554 } 1671 }
1555 mutex_unlock(&file->mut); 1672 mutex_unlock(&file->mut);
1673 destroy_workqueue(file->close_wq);
1556 kfree(file); 1674 kfree(file);
1557 return 0; 1675 return 0;
1558} 1676}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 35567fffaa4e..57f281f8d686 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
133static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); 133static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
134 134
135static void ib_umad_add_one(struct ib_device *device); 135static void ib_umad_add_one(struct ib_device *device);
136static void ib_umad_remove_one(struct ib_device *device); 136static void ib_umad_remove_one(struct ib_device *device, void *client_data);
137 137
138static void ib_umad_release_dev(struct kobject *kobj) 138static void ib_umad_release_dev(struct kobject *kobj)
139{ 139{
@@ -1322,9 +1322,9 @@ free:
1322 kobject_put(&umad_dev->kobj); 1322 kobject_put(&umad_dev->kobj);
1323} 1323}
1324 1324
1325static void ib_umad_remove_one(struct ib_device *device) 1325static void ib_umad_remove_one(struct ib_device *device, void *client_data)
1326{ 1326{
1327 struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); 1327 struct ib_umad_device *umad_dev = client_data;
1328 int i; 1328 int i;
1329 1329
1330 if (!umad_dev) 1330 if (!umad_dev)
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ba365b6d1e8d..3863d33c243d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -85,15 +85,20 @@
85 */ 85 */
86 86
87struct ib_uverbs_device { 87struct ib_uverbs_device {
88 struct kref ref; 88 atomic_t refcount;
89 int num_comp_vectors; 89 int num_comp_vectors;
90 struct completion comp; 90 struct completion comp;
91 struct device *dev; 91 struct device *dev;
92 struct ib_device *ib_dev; 92 struct ib_device __rcu *ib_dev;
93 int devnum; 93 int devnum;
94 struct cdev cdev; 94 struct cdev cdev;
95 struct rb_root xrcd_tree; 95 struct rb_root xrcd_tree;
96 struct mutex xrcd_tree_mutex; 96 struct mutex xrcd_tree_mutex;
97 struct kobject kobj;
98 struct srcu_struct disassociate_srcu;
99 struct mutex lists_mutex; /* protect lists */
100 struct list_head uverbs_file_list;
101 struct list_head uverbs_events_file_list;
97}; 102};
98 103
99struct ib_uverbs_event_file { 104struct ib_uverbs_event_file {
@@ -105,6 +110,7 @@ struct ib_uverbs_event_file {
105 wait_queue_head_t poll_wait; 110 wait_queue_head_t poll_wait;
106 struct fasync_struct *async_queue; 111 struct fasync_struct *async_queue;
107 struct list_head event_list; 112 struct list_head event_list;
113 struct list_head list;
108}; 114};
109 115
110struct ib_uverbs_file { 116struct ib_uverbs_file {
@@ -114,6 +120,8 @@ struct ib_uverbs_file {
114 struct ib_ucontext *ucontext; 120 struct ib_ucontext *ucontext;
115 struct ib_event_handler event_handler; 121 struct ib_event_handler event_handler;
116 struct ib_uverbs_event_file *async_file; 122 struct ib_uverbs_event_file *async_file;
123 struct list_head list;
124 int is_closed;
117}; 125};
118 126
119struct ib_uverbs_event { 127struct ib_uverbs_event {
@@ -177,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
177void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); 185void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
178 186
179struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, 187struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
188 struct ib_device *ib_dev,
180 int is_async); 189 int is_async);
190void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
181struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); 191struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
182 192
183void ib_uverbs_release_ucq(struct ib_uverbs_file *file, 193void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -212,6 +222,7 @@ struct ib_uverbs_flow_spec {
212 222
213#define IB_UVERBS_DECLARE_CMD(name) \ 223#define IB_UVERBS_DECLARE_CMD(name) \
214 ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ 224 ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
225 struct ib_device *ib_dev, \
215 const char __user *buf, int in_len, \ 226 const char __user *buf, int in_len, \
216 int out_len) 227 int out_len)
217 228
@@ -253,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
253 264
254#define IB_UVERBS_DECLARE_EX_CMD(name) \ 265#define IB_UVERBS_DECLARE_EX_CMD(name) \
255 int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ 266 int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
267 struct ib_device *ib_dev, \
256 struct ib_udata *ucore, \ 268 struct ib_udata *ucore, \
257 struct ib_udata *uhw) 269 struct ib_udata *uhw)
258 270
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bbb02ffe87df..be4cb9f04be3 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
282} 282}
283 283
284ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, 284ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
285 struct ib_device *ib_dev,
285 const char __user *buf, 286 const char __user *buf,
286 int in_len, int out_len) 287 int in_len, int out_len)
287{ 288{
288 struct ib_uverbs_get_context cmd; 289 struct ib_uverbs_get_context cmd;
289 struct ib_uverbs_get_context_resp resp; 290 struct ib_uverbs_get_context_resp resp;
290 struct ib_udata udata; 291 struct ib_udata udata;
291 struct ib_device *ibdev = file->device->ib_dev;
292#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 292#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
293 struct ib_device_attr dev_attr; 293 struct ib_device_attr dev_attr;
294#endif 294#endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
313 (unsigned long) cmd.response + sizeof resp, 313 (unsigned long) cmd.response + sizeof resp,
314 in_len - sizeof cmd, out_len - sizeof resp); 314 in_len - sizeof cmd, out_len - sizeof resp);
315 315
316 ucontext = ibdev->alloc_ucontext(ibdev, &udata); 316 ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
317 if (IS_ERR(ucontext)) { 317 if (IS_ERR(ucontext)) {
318 ret = PTR_ERR(ucontext); 318 ret = PTR_ERR(ucontext);
319 goto err; 319 goto err;
320 } 320 }
321 321
322 ucontext->device = ibdev; 322 ucontext->device = ib_dev;
323 INIT_LIST_HEAD(&ucontext->pd_list); 323 INIT_LIST_HEAD(&ucontext->pd_list);
324 INIT_LIST_HEAD(&ucontext->mr_list); 324 INIT_LIST_HEAD(&ucontext->mr_list);
325 INIT_LIST_HEAD(&ucontext->mw_list); 325 INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
340 ucontext->odp_mrs_count = 0; 340 ucontext->odp_mrs_count = 0;
341 INIT_LIST_HEAD(&ucontext->no_private_counters); 341 INIT_LIST_HEAD(&ucontext->no_private_counters);
342 342
343 ret = ib_query_device(ibdev, &dev_attr); 343 ret = ib_query_device(ib_dev, &dev_attr);
344 if (ret) 344 if (ret)
345 goto err_free; 345 goto err_free;
346 if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) 346 if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
355 goto err_free; 355 goto err_free;
356 resp.async_fd = ret; 356 resp.async_fd = ret;
357 357
358 filp = ib_uverbs_alloc_event_file(file, 1); 358 filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
359 if (IS_ERR(filp)) { 359 if (IS_ERR(filp)) {
360 ret = PTR_ERR(filp); 360 ret = PTR_ERR(filp);
361 goto err_fd; 361 goto err_fd;
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
367 goto err_file; 367 goto err_file;
368 } 368 }
369 369
370 file->async_file = filp->private_data;
371
372 INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
373 ib_uverbs_event_handler);
374 ret = ib_register_event_handler(&file->event_handler);
375 if (ret)
376 goto err_file;
377
378 kref_get(&file->async_file->ref);
379 kref_get(&file->ref);
380 file->ucontext = ucontext; 370 file->ucontext = ucontext;
381 371
382 fd_install(resp.async_fd, filp); 372 fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
386 return in_len; 376 return in_len;
387 377
388err_file: 378err_file:
379 ib_uverbs_free_async_event_file(file);
389 fput(filp); 380 fput(filp);
390 381
391err_fd: 382err_fd:
@@ -393,7 +384,7 @@ err_fd:
393 384
394err_free: 385err_free:
395 put_pid(ucontext->tgid); 386 put_pid(ucontext->tgid);
396 ibdev->dealloc_ucontext(ucontext); 387 ib_dev->dealloc_ucontext(ucontext);
397 388
398err: 389err:
399 mutex_unlock(&file->mutex); 390 mutex_unlock(&file->mutex);
@@ -401,11 +392,12 @@ err:
401} 392}
402 393
403static void copy_query_dev_fields(struct ib_uverbs_file *file, 394static void copy_query_dev_fields(struct ib_uverbs_file *file,
395 struct ib_device *ib_dev,
404 struct ib_uverbs_query_device_resp *resp, 396 struct ib_uverbs_query_device_resp *resp,
405 struct ib_device_attr *attr) 397 struct ib_device_attr *attr)
406{ 398{
407 resp->fw_ver = attr->fw_ver; 399 resp->fw_ver = attr->fw_ver;
408 resp->node_guid = file->device->ib_dev->node_guid; 400 resp->node_guid = ib_dev->node_guid;
409 resp->sys_image_guid = attr->sys_image_guid; 401 resp->sys_image_guid = attr->sys_image_guid;
410 resp->max_mr_size = attr->max_mr_size; 402 resp->max_mr_size = attr->max_mr_size;
411 resp->page_size_cap = attr->page_size_cap; 403 resp->page_size_cap = attr->page_size_cap;
@@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
443 resp->max_srq_sge = attr->max_srq_sge; 435 resp->max_srq_sge = attr->max_srq_sge;
444 resp->max_pkeys = attr->max_pkeys; 436 resp->max_pkeys = attr->max_pkeys;
445 resp->local_ca_ack_delay = attr->local_ca_ack_delay; 437 resp->local_ca_ack_delay = attr->local_ca_ack_delay;
446 resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; 438 resp->phys_port_cnt = ib_dev->phys_port_cnt;
447} 439}
448 440
449ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, 441ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
442 struct ib_device *ib_dev,
450 const char __user *buf, 443 const char __user *buf,
451 int in_len, int out_len) 444 int in_len, int out_len)
452{ 445{
@@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
461 if (copy_from_user(&cmd, buf, sizeof cmd)) 454 if (copy_from_user(&cmd, buf, sizeof cmd))
462 return -EFAULT; 455 return -EFAULT;
463 456
464 ret = ib_query_device(file->device->ib_dev, &attr); 457 ret = ib_query_device(ib_dev, &attr);
465 if (ret) 458 if (ret)
466 return ret; 459 return ret;
467 460
468 memset(&resp, 0, sizeof resp); 461 memset(&resp, 0, sizeof resp);
469 copy_query_dev_fields(file, &resp, &attr); 462 copy_query_dev_fields(file, ib_dev, &resp, &attr);
470 463
471 if (copy_to_user((void __user *) (unsigned long) cmd.response, 464 if (copy_to_user((void __user *) (unsigned long) cmd.response,
472 &resp, sizeof resp)) 465 &resp, sizeof resp))
@@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
476} 469}
477 470
478ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, 471ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
472 struct ib_device *ib_dev,
479 const char __user *buf, 473 const char __user *buf,
480 int in_len, int out_len) 474 int in_len, int out_len)
481{ 475{
@@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
490 if (copy_from_user(&cmd, buf, sizeof cmd)) 484 if (copy_from_user(&cmd, buf, sizeof cmd))
491 return -EFAULT; 485 return -EFAULT;
492 486
493 ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); 487 ret = ib_query_port(ib_dev, cmd.port_num, &attr);
494 if (ret) 488 if (ret)
495 return ret; 489 return ret;
496 490
@@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
515 resp.active_width = attr.active_width; 509 resp.active_width = attr.active_width;
516 resp.active_speed = attr.active_speed; 510 resp.active_speed = attr.active_speed;
517 resp.phys_state = attr.phys_state; 511 resp.phys_state = attr.phys_state;
518 resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, 512 resp.link_layer = rdma_port_get_link_layer(ib_dev,
519 cmd.port_num); 513 cmd.port_num);
520 514
521 if (copy_to_user((void __user *) (unsigned long) cmd.response, 515 if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
526} 520}
527 521
528ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, 522ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
523 struct ib_device *ib_dev,
529 const char __user *buf, 524 const char __user *buf,
530 int in_len, int out_len) 525 int in_len, int out_len)
531{ 526{
@@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
553 init_uobj(uobj, 0, file->ucontext, &pd_lock_class); 548 init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
554 down_write(&uobj->mutex); 549 down_write(&uobj->mutex);
555 550
556 pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, 551 pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
557 file->ucontext, &udata);
558 if (IS_ERR(pd)) { 552 if (IS_ERR(pd)) {
559 ret = PTR_ERR(pd); 553 ret = PTR_ERR(pd);
560 goto err; 554 goto err;
561 } 555 }
562 556
563 pd->device = file->device->ib_dev; 557 pd->device = ib_dev;
564 pd->uobject = uobj; 558 pd->uobject = uobj;
559 pd->local_mr = NULL;
565 atomic_set(&pd->usecnt, 0); 560 atomic_set(&pd->usecnt, 0);
566 561
567 uobj->object = pd; 562 uobj->object = pd;
@@ -600,11 +595,13 @@ err:
600} 595}
601 596
602ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, 597ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
598 struct ib_device *ib_dev,
603 const char __user *buf, 599 const char __user *buf,
604 int in_len, int out_len) 600 int in_len, int out_len)
605{ 601{
606 struct ib_uverbs_dealloc_pd cmd; 602 struct ib_uverbs_dealloc_pd cmd;
607 struct ib_uobject *uobj; 603 struct ib_uobject *uobj;
604 struct ib_pd *pd;
608 int ret; 605 int ret;
609 606
610 if (copy_from_user(&cmd, buf, sizeof cmd)) 607 if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
613 uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); 610 uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
614 if (!uobj) 611 if (!uobj)
615 return -EINVAL; 612 return -EINVAL;
613 pd = uobj->object;
616 614
617 ret = ib_dealloc_pd(uobj->object); 615 if (atomic_read(&pd->usecnt)) {
618 if (!ret) 616 ret = -EBUSY;
619 uobj->live = 0; 617 goto err_put;
620 618 }
621 put_uobj_write(uobj);
622 619
620 ret = pd->device->dealloc_pd(uobj->object);
621 WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
623 if (ret) 622 if (ret)
624 return ret; 623 goto err_put;
624
625 uobj->live = 0;
626 put_uobj_write(uobj);
625 627
626 idr_remove_uobj(&ib_uverbs_pd_idr, uobj); 628 idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
627 629
@@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
632 put_uobj(uobj); 634 put_uobj(uobj);
633 635
634 return in_len; 636 return in_len;
637
638err_put:
639 put_uobj_write(uobj);
640 return ret;
635} 641}
636 642
637struct xrcd_table_entry { 643struct xrcd_table_entry {
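ib_uverbs_dealloc_pd() now refuses to free a protection domain that still has users: it checks pd->usecnt under the uobject write lock and returns -EBUSY before ever reaching the driver's dealloc_pd hook, which is what allows ib_dealloc_pd() in the core to become void. A tiny sketch of the check-usecount-then-release ordering (plain C, illustrative types):

#include <stdio.h>

struct pd {
	int usecnt;        /* how many QPs/MRs/AHs still reference this PD */
	int live;
};

static int dealloc_pd(struct pd *pd)
{
	if (pd->usecnt)
		return -16;            /* -EBUSY: caller must destroy the users first */
	pd->live = 0;                  /* only now is it safe to release resources */
	return 0;
}

int main(void)
{
	struct pd busy = { 2, 1 }, idle = { 0, 1 };

	printf("busy: %d\n", dealloc_pd(&busy));   /* -16 */
	printf("idle: %d\n", dealloc_pd(&idle));   /* 0 */
	return 0;
}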
@@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
720} 726}
721 727
722ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, 728ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
729 struct ib_device *ib_dev,
723 const char __user *buf, int in_len, 730 const char __user *buf, int in_len,
724 int out_len) 731 int out_len)
725{ 732{
@@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
778 down_write(&obj->uobject.mutex); 785 down_write(&obj->uobject.mutex);
779 786
780 if (!xrcd) { 787 if (!xrcd) {
781 xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, 788 xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
782 file->ucontext, &udata);
783 if (IS_ERR(xrcd)) { 789 if (IS_ERR(xrcd)) {
784 ret = PTR_ERR(xrcd); 790 ret = PTR_ERR(xrcd);
785 goto err; 791 goto err;
786 } 792 }
787 793
788 xrcd->inode = inode; 794 xrcd->inode = inode;
789 xrcd->device = file->device->ib_dev; 795 xrcd->device = ib_dev;
790 atomic_set(&xrcd->usecnt, 0); 796 atomic_set(&xrcd->usecnt, 0);
791 mutex_init(&xrcd->tgt_qp_mutex); 797 mutex_init(&xrcd->tgt_qp_mutex);
792 INIT_LIST_HEAD(&xrcd->tgt_qp_list); 798 INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +863,7 @@ err_tree_mutex_unlock:
857} 863}
858 864
859ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, 865ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
866 struct ib_device *ib_dev,
860 const char __user *buf, int in_len, 867 const char __user *buf, int in_len,
861 int out_len) 868 int out_len)
862{ 869{
@@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
934} 941}
935 942
936ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, 943ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
944 struct ib_device *ib_dev,
937 const char __user *buf, int in_len, 945 const char __user *buf, int in_len,
938 int out_len) 946 int out_len)
939{ 947{
@@ -1043,6 +1051,7 @@ err_free:
1043} 1051}
1044 1052
1045ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, 1053ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
1054 struct ib_device *ib_dev,
1046 const char __user *buf, int in_len, 1055 const char __user *buf, int in_len,
1047 int out_len) 1056 int out_len)
1048{ 1057{
@@ -1136,6 +1145,7 @@ put_uobjs:
1136} 1145}
1137 1146
1138ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, 1147ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
1148 struct ib_device *ib_dev,
1139 const char __user *buf, int in_len, 1149 const char __user *buf, int in_len,
1140 int out_len) 1150 int out_len)
1141{ 1151{
@@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
1174} 1184}
1175 1185
1176ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, 1186ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
1177 const char __user *buf, int in_len, 1187 struct ib_device *ib_dev,
1178 int out_len) 1188 const char __user *buf, int in_len,
1189 int out_len)
1179{ 1190{
1180 struct ib_uverbs_alloc_mw cmd; 1191 struct ib_uverbs_alloc_mw cmd;
1181 struct ib_uverbs_alloc_mw_resp resp; 1192 struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1267,9 @@ err_free:
1256} 1267}
1257 1268
1258ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, 1269ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
1259 const char __user *buf, int in_len, 1270 struct ib_device *ib_dev,
1260 int out_len) 1271 const char __user *buf, int in_len,
1272 int out_len)
1261{ 1273{
1262 struct ib_uverbs_dealloc_mw cmd; 1274 struct ib_uverbs_dealloc_mw cmd;
1263 struct ib_mw *mw; 1275 struct ib_mw *mw;
@@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
1294} 1306}
1295 1307
1296ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, 1308ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
1309 struct ib_device *ib_dev,
1297 const char __user *buf, int in_len, 1310 const char __user *buf, int in_len,
1298 int out_len) 1311 int out_len)
1299{ 1312{
@@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
1313 return ret; 1326 return ret;
1314 resp.fd = ret; 1327 resp.fd = ret;
1315 1328
1316 filp = ib_uverbs_alloc_event_file(file, 0); 1329 filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
1317 if (IS_ERR(filp)) { 1330 if (IS_ERR(filp)) {
1318 put_unused_fd(resp.fd); 1331 put_unused_fd(resp.fd);
1319 return PTR_ERR(filp); 1332 return PTR_ERR(filp);
@@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
1331} 1344}
1332 1345
1333static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, 1346static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
1347 struct ib_device *ib_dev,
1334 struct ib_udata *ucore, 1348 struct ib_udata *ucore,
1335 struct ib_udata *uhw, 1349 struct ib_udata *uhw,
1336 struct ib_uverbs_ex_create_cq *cmd, 1350 struct ib_uverbs_ex_create_cq *cmd,
@@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
1379 if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) 1393 if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
1380 attr.flags = cmd->flags; 1394 attr.flags = cmd->flags;
1381 1395
1382 cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, 1396 cq = ib_dev->create_cq(ib_dev, &attr,
1383 file->ucontext, uhw); 1397 file->ucontext, uhw);
1384 if (IS_ERR(cq)) { 1398 if (IS_ERR(cq)) {
1385 ret = PTR_ERR(cq); 1399 ret = PTR_ERR(cq);
1386 goto err_file; 1400 goto err_file;
1387 } 1401 }
1388 1402
1389 cq->device = file->device->ib_dev; 1403 cq->device = ib_dev;
1390 cq->uobject = &obj->uobject; 1404 cq->uobject = &obj->uobject;
1391 cq->comp_handler = ib_uverbs_comp_handler; 1405 cq->comp_handler = ib_uverbs_comp_handler;
1392 cq->event_handler = ib_uverbs_cq_event_handler; 1406 cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
1447} 1461}
1448 1462
1449ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, 1463ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
1464 struct ib_device *ib_dev,
1450 const char __user *buf, int in_len, 1465 const char __user *buf, int in_len,
1451 int out_len) 1466 int out_len)
1452{ 1467{
@@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
1475 cmd_ex.comp_vector = cmd.comp_vector; 1490 cmd_ex.comp_vector = cmd.comp_vector;
1476 cmd_ex.comp_channel = cmd.comp_channel; 1491 cmd_ex.comp_channel = cmd.comp_channel;
1477 1492
1478 obj = create_cq(file, &ucore, &uhw, &cmd_ex, 1493 obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
1479 offsetof(typeof(cmd_ex), comp_channel) + 1494 offsetof(typeof(cmd_ex), comp_channel) +
1480 sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, 1495 sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
1481 NULL); 1496 NULL);
@@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
1498} 1513}
1499 1514
1500int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, 1515int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
1516 struct ib_device *ib_dev,
1501 struct ib_udata *ucore, 1517 struct ib_udata *ucore,
1502 struct ib_udata *uhw) 1518 struct ib_udata *uhw)
1503{ 1519{
@@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
1523 sizeof(resp.response_length))) 1539 sizeof(resp.response_length)))
1524 return -ENOSPC; 1540 return -ENOSPC;
1525 1541
1526 obj = create_cq(file, ucore, uhw, &cmd, 1542 obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
1527 min(ucore->inlen, sizeof(cmd)), 1543 min(ucore->inlen, sizeof(cmd)),
1528 ib_uverbs_ex_create_cq_cb, NULL); 1544 ib_uverbs_ex_create_cq_cb, NULL);
1529 1545
@@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
1534} 1550}
1535 1551
1536ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, 1552ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
1553 struct ib_device *ib_dev,
1537 const char __user *buf, int in_len, 1554 const char __user *buf, int in_len,
1538 int out_len) 1555 int out_len)
1539{ 1556{
@@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
1597} 1614}
1598 1615
1599ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, 1616ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
1617 struct ib_device *ib_dev,
1600 const char __user *buf, int in_len, 1618 const char __user *buf, int in_len,
1601 int out_len) 1619 int out_len)
1602{ 1620{
@@ -1648,6 +1666,7 @@ out_put:
1648} 1666}
1649 1667
1650ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, 1668ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
1669 struct ib_device *ib_dev,
1651 const char __user *buf, int in_len, 1670 const char __user *buf, int in_len,
1652 int out_len) 1671 int out_len)
1653{ 1672{
@@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
1670} 1689}
1671 1690
1672ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, 1691ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
1692 struct ib_device *ib_dev,
1673 const char __user *buf, int in_len, 1693 const char __user *buf, int in_len,
1674 int out_len) 1694 int out_len)
1675{ 1695{
@@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
1722} 1742}
1723 1743
1724ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, 1744ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
1745 struct ib_device *ib_dev,
1725 const char __user *buf, int in_len, 1746 const char __user *buf, int in_len,
1726 int out_len) 1747 int out_len)
1727{ 1748{
@@ -1917,6 +1938,7 @@ err_put:
1917} 1938}
1918 1939
1919ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, 1940ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
1941 struct ib_device *ib_dev,
1920 const char __user *buf, int in_len, int out_len) 1942 const char __user *buf, int in_len, int out_len)
1921{ 1943{
1922 struct ib_uverbs_open_qp cmd; 1944 struct ib_uverbs_open_qp cmd;
@@ -2011,6 +2033,7 @@ err_put:
2011} 2033}
2012 2034
2013ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, 2035ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
2036 struct ib_device *ib_dev,
2014 const char __user *buf, int in_len, 2037 const char __user *buf, int in_len,
2015 int out_len) 2038 int out_len)
2016{ 2039{
@@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
2125} 2148}
2126 2149
2127ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, 2150ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
2151 struct ib_device *ib_dev,
2128 const char __user *buf, int in_len, 2152 const char __user *buf, int in_len,
2129 int out_len) 2153 int out_len)
2130{ 2154{
@@ -2221,6 +2245,7 @@ out:
2221} 2245}
2222 2246
2223ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, 2247ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
2248 struct ib_device *ib_dev,
2224 const char __user *buf, int in_len, 2249 const char __user *buf, int in_len,
2225 int out_len) 2250 int out_len)
2226{ 2251{
@@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
2279} 2304}
2280 2305
2281ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, 2306ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
2307 struct ib_device *ib_dev,
2282 const char __user *buf, int in_len, 2308 const char __user *buf, int in_len,
2283 int out_len) 2309 int out_len)
2284{ 2310{
@@ -2346,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
2346 next->send_flags = user_wr->send_flags; 2372 next->send_flags = user_wr->send_flags;
2347 2373
2348 if (is_ud) { 2374 if (is_ud) {
2375 if (next->opcode != IB_WR_SEND &&
2376 next->opcode != IB_WR_SEND_WITH_IMM) {
2377 ret = -EINVAL;
2378 goto out_put;
2379 }
2380
2349 next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, 2381 next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
2350 file->ucontext); 2382 file->ucontext);
2351 if (!next->wr.ud.ah) { 2383 if (!next->wr.ud.ah) {
@@ -2385,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
2385 user_wr->wr.atomic.compare_add; 2417 user_wr->wr.atomic.compare_add;
2386 next->wr.atomic.swap = user_wr->wr.atomic.swap; 2418 next->wr.atomic.swap = user_wr->wr.atomic.swap;
2387 next->wr.atomic.rkey = user_wr->wr.atomic.rkey; 2419 next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
2420 case IB_WR_SEND:
2388 break; 2421 break;
2389 default: 2422 default:
2390 break; 2423 ret = -EINVAL;
2424 goto out_put;
2391 } 2425 }
2392 } 2426 }
2393 2427
@@ -2523,6 +2557,7 @@ err:
2523} 2557}
2524 2558
2525ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, 2559ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
2560 struct ib_device *ib_dev,
2526 const char __user *buf, int in_len, 2561 const char __user *buf, int in_len,
2527 int out_len) 2562 int out_len)
2528{ 2563{
@@ -2572,6 +2607,7 @@ out:
2572} 2607}
2573 2608
2574ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, 2609ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
2610 struct ib_device *ib_dev,
2575 const char __user *buf, int in_len, 2611 const char __user *buf, int in_len,
2576 int out_len) 2612 int out_len)
2577{ 2613{
@@ -2621,6 +2657,7 @@ out:
2621} 2657}
2622 2658
2623ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, 2659ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
2660 struct ib_device *ib_dev,
2624 const char __user *buf, int in_len, 2661 const char __user *buf, int in_len,
2625 int out_len) 2662 int out_len)
2626{ 2663{
@@ -2713,6 +2750,7 @@ err:
2713} 2750}
2714 2751
2715ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, 2752ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
2753 struct ib_device *ib_dev,
2716 const char __user *buf, int in_len, int out_len) 2754 const char __user *buf, int in_len, int out_len)
2717{ 2755{
2718 struct ib_uverbs_destroy_ah cmd; 2756 struct ib_uverbs_destroy_ah cmd;
@@ -2749,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
2749} 2787}
2750 2788
2751ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, 2789ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
2790 struct ib_device *ib_dev,
2752 const char __user *buf, int in_len, 2791 const char __user *buf, int in_len,
2753 int out_len) 2792 int out_len)
2754{ 2793{
@@ -2796,6 +2835,7 @@ out_put:
2796} 2835}
2797 2836
2798ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, 2837ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
2838 struct ib_device *ib_dev,
2799 const char __user *buf, int in_len, 2839 const char __user *buf, int in_len,
2800 int out_len) 2840 int out_len)
2801{ 2841{
@@ -2876,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
2876} 2916}
2877 2917
2878int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, 2918int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
2919 struct ib_device *ib_dev,
2879 struct ib_udata *ucore, 2920 struct ib_udata *ucore,
2880 struct ib_udata *uhw) 2921 struct ib_udata *uhw)
2881{ 2922{
@@ -3036,6 +3077,7 @@ err_free_attr:
3036} 3077}
3037 3078
3038int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, 3079int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
3080 struct ib_device *ib_dev,
3039 struct ib_udata *ucore, 3081 struct ib_udata *ucore,
3040 struct ib_udata *uhw) 3082 struct ib_udata *uhw)
3041{ 3083{
@@ -3078,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
3078} 3120}
3079 3121
3080static int __uverbs_create_xsrq(struct ib_uverbs_file *file, 3122static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
3123 struct ib_device *ib_dev,
3081 struct ib_uverbs_create_xsrq *cmd, 3124 struct ib_uverbs_create_xsrq *cmd,
3082 struct ib_udata *udata) 3125 struct ib_udata *udata)
3083{ 3126{
@@ -3211,6 +3254,7 @@ err:
3211} 3254}
3212 3255
3213ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, 3256ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
3257 struct ib_device *ib_dev,
3214 const char __user *buf, int in_len, 3258 const char __user *buf, int in_len,
3215 int out_len) 3259 int out_len)
3216{ 3260{
@@ -3238,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
3238 (unsigned long) cmd.response + sizeof resp, 3282 (unsigned long) cmd.response + sizeof resp,
3239 in_len - sizeof cmd, out_len - sizeof resp); 3283 in_len - sizeof cmd, out_len - sizeof resp);
3240 3284
3241 ret = __uverbs_create_xsrq(file, &xcmd, &udata); 3285 ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
3242 if (ret) 3286 if (ret)
3243 return ret; 3287 return ret;
3244 3288
@@ -3246,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
3246} 3290}
3247 3291
3248ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, 3292ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
3293 struct ib_device *ib_dev,
3249 const char __user *buf, int in_len, int out_len) 3294 const char __user *buf, int in_len, int out_len)
3250{ 3295{
3251 struct ib_uverbs_create_xsrq cmd; 3296 struct ib_uverbs_create_xsrq cmd;
@@ -3263,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
3263 (unsigned long) cmd.response + sizeof resp, 3308 (unsigned long) cmd.response + sizeof resp,
3264 in_len - sizeof cmd, out_len - sizeof resp); 3309 in_len - sizeof cmd, out_len - sizeof resp);
3265 3310
3266 ret = __uverbs_create_xsrq(file, &cmd, &udata); 3311 ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
3267 if (ret) 3312 if (ret)
3268 return ret; 3313 return ret;
3269 3314
@@ -3271,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
3271} 3316}
3272 3317
3273ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, 3318ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
3319 struct ib_device *ib_dev,
3274 const char __user *buf, int in_len, 3320 const char __user *buf, int in_len,
3275 int out_len) 3321 int out_len)
3276{ 3322{
@@ -3301,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
3301} 3347}
3302 3348
3303ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, 3349ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
3350 struct ib_device *ib_dev,
3304 const char __user *buf, 3351 const char __user *buf,
3305 int in_len, int out_len) 3352 int in_len, int out_len)
3306{ 3353{
@@ -3341,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
3341} 3388}
3342 3389
3343ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, 3390ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
3391 struct ib_device *ib_dev,
3344 const char __user *buf, int in_len, 3392 const char __user *buf, int in_len,
3345 int out_len) 3393 int out_len)
3346{ 3394{
@@ -3398,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
3398} 3446}
3399 3447
3400int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, 3448int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
3449 struct ib_device *ib_dev,
3401 struct ib_udata *ucore, 3450 struct ib_udata *ucore,
3402 struct ib_udata *uhw) 3451 struct ib_udata *uhw)
3403{ 3452{
3404 struct ib_uverbs_ex_query_device_resp resp; 3453 struct ib_uverbs_ex_query_device_resp resp;
3405 struct ib_uverbs_ex_query_device cmd; 3454 struct ib_uverbs_ex_query_device cmd;
3406 struct ib_device_attr attr; 3455 struct ib_device_attr attr;
3407 struct ib_device *device;
3408 int err; 3456 int err;
3409 3457
3410 device = file->device->ib_dev;
3411 if (ucore->inlen < sizeof(cmd)) 3458 if (ucore->inlen < sizeof(cmd))
3412 return -EINVAL; 3459 return -EINVAL;
3413 3460
@@ -3428,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
3428 3475
3429 memset(&attr, 0, sizeof(attr)); 3476 memset(&attr, 0, sizeof(attr));
3430 3477
3431 err = device->query_device(device, &attr, uhw); 3478 err = ib_dev->query_device(ib_dev, &attr, uhw);
3432 if (err) 3479 if (err)
3433 return err; 3480 return err;
3434 3481
3435 copy_query_dev_fields(file, &resp.base, &attr); 3482 copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
3436 resp.comp_mask = 0; 3483 resp.comp_mask = 0;
3437 3484
3438 if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) 3485 if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
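Editor's note: a minimal sketch of the new calling convention introduced above. Only the signature is taken from the patch; the handler name and body are hypothetical. Every uverbs command handler now receives the ib_device explicitly from the dispatcher instead of chasing file->device->ib_dev itself, which is what allows the SRCU-based hot-removal handling in uverbs_main.c below.

	ssize_t ib_uverbs_example_cmd(struct ib_uverbs_file *file,
				      struct ib_device *ib_dev, /* resolved once, under SRCU */
				      const char __user *buf,
				      int in_len, int out_len)
	{
		/* use ib_dev directly; never dereference file->device->ib_dev here */
		pr_debug("uverbs cmd on %s\n", ib_dev->name);
		return in_len;	/* handlers return in_len on success */
	}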
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index f6eef2da7097..c29a660c72fe 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
79static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); 79static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
80 80
81static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, 81static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
82 struct ib_device *ib_dev,
82 const char __user *buf, int in_len, 83 const char __user *buf, int in_len,
83 int out_len) = { 84 int out_len) = {
84 [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, 85 [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
119}; 120};
120 121
121static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, 122static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
123 struct ib_device *ib_dev,
122 struct ib_udata *ucore, 124 struct ib_udata *ucore,
123 struct ib_udata *uhw) = { 125 struct ib_udata *uhw) = {
124 [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, 126 [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
@@ -128,16 +130,21 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
128}; 130};
129 131
130static void ib_uverbs_add_one(struct ib_device *device); 132static void ib_uverbs_add_one(struct ib_device *device);
131static void ib_uverbs_remove_one(struct ib_device *device); 133static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
132 134
133static void ib_uverbs_release_dev(struct kref *ref) 135static void ib_uverbs_release_dev(struct kobject *kobj)
134{ 136{
135 struct ib_uverbs_device *dev = 137 struct ib_uverbs_device *dev =
136 container_of(ref, struct ib_uverbs_device, ref); 138 container_of(kobj, struct ib_uverbs_device, kobj);
137 139
138 complete(&dev->comp); 140 cleanup_srcu_struct(&dev->disassociate_srcu);
141 kfree(dev);
139} 142}
140 143
144static struct kobj_type ib_uverbs_dev_ktype = {
145 .release = ib_uverbs_release_dev,
146};
147
141static void ib_uverbs_release_event_file(struct kref *ref) 148static void ib_uverbs_release_event_file(struct kref *ref)
142{ 149{
143 struct ib_uverbs_event_file *file = 150 struct ib_uverbs_event_file *file =
@@ -201,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
201{ 208{
202 struct ib_uobject *uobj, *tmp; 209 struct ib_uobject *uobj, *tmp;
203 210
204 if (!context)
205 return 0;
206
207 context->closing = 1; 211 context->closing = 1;
208 212
209 list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { 213 list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -303,13 +307,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
303 return context->device->dealloc_ucontext(context); 307 return context->device->dealloc_ucontext(context);
304} 308}
305 309
310static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
311{
312 complete(&dev->comp);
313}
314
306static void ib_uverbs_release_file(struct kref *ref) 315static void ib_uverbs_release_file(struct kref *ref)
307{ 316{
308 struct ib_uverbs_file *file = 317 struct ib_uverbs_file *file =
309 container_of(ref, struct ib_uverbs_file, ref); 318 container_of(ref, struct ib_uverbs_file, ref);
319 struct ib_device *ib_dev;
320 int srcu_key;
321
322 srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
323 ib_dev = srcu_dereference(file->device->ib_dev,
324 &file->device->disassociate_srcu);
325 if (ib_dev && !ib_dev->disassociate_ucontext)
326 module_put(ib_dev->owner);
327 srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
310 328
311 module_put(file->device->ib_dev->owner); 329 if (atomic_dec_and_test(&file->device->refcount))
312 kref_put(&file->device->ref, ib_uverbs_release_dev); 330 ib_uverbs_comp_dev(file->device);
313 331
314 kfree(file); 332 kfree(file);
315} 333}
@@ -331,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
331 return -EAGAIN; 349 return -EAGAIN;
332 350
333 if (wait_event_interruptible(file->poll_wait, 351 if (wait_event_interruptible(file->poll_wait,
334 !list_empty(&file->event_list))) 352 (!list_empty(&file->event_list) ||
353 /* The barriers built into wait_event_interruptible()
354 * and wake_up() guarantee this will see the NULL ib_dev
355 * without using RCU
356 */
357 !file->uverbs_file->device->ib_dev)))
335 return -ERESTARTSYS; 358 return -ERESTARTSYS;
336 359
360 /* If the device was disassociated and no event exists, set an error */
361 if (list_empty(&file->event_list) &&
362 !file->uverbs_file->device->ib_dev)
363 return -EIO;
364
337 spin_lock_irq(&file->lock); 365 spin_lock_irq(&file->lock);
338 } 366 }
339 367
@@ -396,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
396{ 424{
397 struct ib_uverbs_event_file *file = filp->private_data; 425 struct ib_uverbs_event_file *file = filp->private_data;
398 struct ib_uverbs_event *entry, *tmp; 426 struct ib_uverbs_event *entry, *tmp;
427 int closed_already = 0;
399 428
429 mutex_lock(&file->uverbs_file->device->lists_mutex);
400 spin_lock_irq(&file->lock); 430 spin_lock_irq(&file->lock);
431 closed_already = file->is_closed;
401 file->is_closed = 1; 432 file->is_closed = 1;
402 list_for_each_entry_safe(entry, tmp, &file->event_list, list) { 433 list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
403 if (entry->counter) 434 if (entry->counter)
@@ -405,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
405 kfree(entry); 436 kfree(entry);
406 } 437 }
407 spin_unlock_irq(&file->lock); 438 spin_unlock_irq(&file->lock);
408 439 if (!closed_already) {
409 if (file->is_async) { 440 list_del(&file->list);
410 ib_unregister_event_handler(&file->uverbs_file->event_handler); 441 if (file->is_async)
411 kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); 442 ib_unregister_event_handler(&file->uverbs_file->
443 event_handler);
412 } 444 }
445 mutex_unlock(&file->uverbs_file->device->lists_mutex);
446
447 kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
413 kref_put(&file->ref, ib_uverbs_release_event_file); 448 kref_put(&file->ref, ib_uverbs_release_event_file);
414 449
415 return 0; 450 return 0;
@@ -541,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
541 NULL, NULL); 576 NULL, NULL);
542} 577}
543 578
579void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
580{
581 kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
582 file->async_file = NULL;
583}
584
544struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, 585struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
586 struct ib_device *ib_dev,
545 int is_async) 587 int is_async)
546{ 588{
547 struct ib_uverbs_event_file *ev_file; 589 struct ib_uverbs_event_file *ev_file;
548 struct file *filp; 590 struct file *filp;
591 int ret;
549 592
550 ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL); 593 ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
551 if (!ev_file) 594 if (!ev_file)
552 return ERR_PTR(-ENOMEM); 595 return ERR_PTR(-ENOMEM);
553 596
@@ -556,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
556 INIT_LIST_HEAD(&ev_file->event_list); 599 INIT_LIST_HEAD(&ev_file->event_list);
557 init_waitqueue_head(&ev_file->poll_wait); 600 init_waitqueue_head(&ev_file->poll_wait);
558 ev_file->uverbs_file = uverbs_file; 601 ev_file->uverbs_file = uverbs_file;
602 kref_get(&ev_file->uverbs_file->ref);
559 ev_file->async_queue = NULL; 603 ev_file->async_queue = NULL;
560 ev_file->is_async = is_async;
561 ev_file->is_closed = 0; 604 ev_file->is_closed = 0;
562 605
563 filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops, 606 filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
564 ev_file, O_RDONLY); 607 ev_file, O_RDONLY);
565 if (IS_ERR(filp)) 608 if (IS_ERR(filp))
566 kfree(ev_file); 609 goto err_put_refs;
610
611 mutex_lock(&uverbs_file->device->lists_mutex);
612 list_add_tail(&ev_file->list,
613 &uverbs_file->device->uverbs_events_file_list);
614 mutex_unlock(&uverbs_file->device->lists_mutex);
615
616 if (is_async) {
617 WARN_ON(uverbs_file->async_file);
618 uverbs_file->async_file = ev_file;
619 kref_get(&uverbs_file->async_file->ref);
620 INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
621 ib_dev,
622 ib_uverbs_event_handler);
623 ret = ib_register_event_handler(&uverbs_file->event_handler);
624 if (ret)
625 goto err_put_file;
626
627 /* At this point the async file is fully initialized */
628 ev_file->is_async = 1;
629 }
567 630
568 return filp; 631 return filp;
632
633err_put_file:
634 fput(filp);
635 kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
636 uverbs_file->async_file = NULL;
637 return ERR_PTR(ret);
638
639err_put_refs:
640 kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
641 kref_put(&ev_file->ref, ib_uverbs_release_event_file);
642 return filp;
569} 643}
570 644
571/* 645/*
@@ -601,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
601 size_t count, loff_t *pos) 675 size_t count, loff_t *pos)
602{ 676{
603 struct ib_uverbs_file *file = filp->private_data; 677 struct ib_uverbs_file *file = filp->private_data;
678 struct ib_device *ib_dev;
604 struct ib_uverbs_cmd_hdr hdr; 679 struct ib_uverbs_cmd_hdr hdr;
605 __u32 flags; 680 __u32 flags;
681 int srcu_key;
682 ssize_t ret;
606 683
607 if (count < sizeof hdr) 684 if (count < sizeof hdr)
608 return -EINVAL; 685 return -EINVAL;
@@ -610,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
610 if (copy_from_user(&hdr, buf, sizeof hdr)) 687 if (copy_from_user(&hdr, buf, sizeof hdr))
611 return -EFAULT; 688 return -EFAULT;
612 689
690 srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
691 ib_dev = srcu_dereference(file->device->ib_dev,
692 &file->device->disassociate_srcu);
693 if (!ib_dev) {
694 ret = -EIO;
695 goto out;
696 }
697
613 flags = (hdr.command & 698 flags = (hdr.command &
614 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; 699 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
615 700
@@ -617,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
617 __u32 command; 702 __u32 command;
618 703
619 if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | 704 if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
620 IB_USER_VERBS_CMD_COMMAND_MASK)) 705 IB_USER_VERBS_CMD_COMMAND_MASK)) {
621 return -EINVAL; 706 ret = -EINVAL;
707 goto out;
708 }
622 709
623 command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; 710 command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
624 711
625 if (command >= ARRAY_SIZE(uverbs_cmd_table) || 712 if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
626 !uverbs_cmd_table[command]) 713 !uverbs_cmd_table[command]) {
627 return -EINVAL; 714 ret = -EINVAL;
715 goto out;
716 }
628 717
629 if (!file->ucontext && 718 if (!file->ucontext &&
630 command != IB_USER_VERBS_CMD_GET_CONTEXT) 719 command != IB_USER_VERBS_CMD_GET_CONTEXT) {
631 return -EINVAL; 720 ret = -EINVAL;
721 goto out;
722 }
632 723
633 if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) 724 if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
634 return -ENOSYS; 725 ret = -ENOSYS;
726 goto out;
727 }
635 728
636 if (hdr.in_words * 4 != count) 729 if (hdr.in_words * 4 != count) {
637 return -EINVAL; 730 ret = -EINVAL;
731 goto out;
732 }
638 733
639 return uverbs_cmd_table[command](file, 734 ret = uverbs_cmd_table[command](file, ib_dev,
640 buf + sizeof(hdr), 735 buf + sizeof(hdr),
641 hdr.in_words * 4, 736 hdr.in_words * 4,
642 hdr.out_words * 4); 737 hdr.out_words * 4);
@@ -647,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
647 struct ib_uverbs_ex_cmd_hdr ex_hdr; 742 struct ib_uverbs_ex_cmd_hdr ex_hdr;
648 struct ib_udata ucore; 743 struct ib_udata ucore;
649 struct ib_udata uhw; 744 struct ib_udata uhw;
650 int err;
651 size_t written_count = count; 745 size_t written_count = count;
652 746
653 if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | 747 if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
654 IB_USER_VERBS_CMD_COMMAND_MASK)) 748 IB_USER_VERBS_CMD_COMMAND_MASK)) {
655 return -EINVAL; 749 ret = -EINVAL;
750 goto out;
751 }
656 752
657 command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; 753 command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
658 754
659 if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || 755 if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
660 !uverbs_ex_cmd_table[command]) 756 !uverbs_ex_cmd_table[command]) {
661 return -ENOSYS; 757 ret = -ENOSYS;
758 goto out;
759 }
662 760
663 if (!file->ucontext) 761 if (!file->ucontext) {
664 return -EINVAL; 762 ret = -EINVAL;
763 goto out;
764 }
665 765
666 if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) 766 if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
667 return -ENOSYS; 767 ret = -ENOSYS;
768 goto out;
769 }
668 770
669 if (count < (sizeof(hdr) + sizeof(ex_hdr))) 771 if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
670 return -EINVAL; 772 ret = -EINVAL;
773 goto out;
774 }
671 775
672 if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) 776 if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
673 return -EFAULT; 777 ret = -EFAULT;
778 goto out;
779 }
674 780
675 count -= sizeof(hdr) + sizeof(ex_hdr); 781 count -= sizeof(hdr) + sizeof(ex_hdr);
676 buf += sizeof(hdr) + sizeof(ex_hdr); 782 buf += sizeof(hdr) + sizeof(ex_hdr);
677 783
678 if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) 784 if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
679 return -EINVAL; 785 ret = -EINVAL;
786 goto out;
787 }
680 788
681 if (ex_hdr.cmd_hdr_reserved) 789 if (ex_hdr.cmd_hdr_reserved) {
682 return -EINVAL; 790 ret = -EINVAL;
791 goto out;
792 }
683 793
684 if (ex_hdr.response) { 794 if (ex_hdr.response) {
685 if (!hdr.out_words && !ex_hdr.provider_out_words) 795 if (!hdr.out_words && !ex_hdr.provider_out_words) {
686 return -EINVAL; 796 ret = -EINVAL;
797 goto out;
798 }
687 799
688 if (!access_ok(VERIFY_WRITE, 800 if (!access_ok(VERIFY_WRITE,
689 (void __user *) (unsigned long) ex_hdr.response, 801 (void __user *) (unsigned long) ex_hdr.response,
690 (hdr.out_words + ex_hdr.provider_out_words) * 8)) 802 (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
691 return -EFAULT; 803 ret = -EFAULT;
804 goto out;
805 }
692 } else { 806 } else {
693 if (hdr.out_words || ex_hdr.provider_out_words) 807 if (hdr.out_words || ex_hdr.provider_out_words) {
694 return -EINVAL; 808 ret = -EINVAL;
809 goto out;
810 }
695 } 811 }
696 812
697 INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, 813 INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -703,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
703 ex_hdr.provider_in_words * 8, 819 ex_hdr.provider_in_words * 8,
704 ex_hdr.provider_out_words * 8); 820 ex_hdr.provider_out_words * 8);
705 821
706 err = uverbs_ex_cmd_table[command](file, 822 ret = uverbs_ex_cmd_table[command](file,
823 ib_dev,
707 &ucore, 824 &ucore,
708 &uhw); 825 &uhw);
709 826 if (!ret)
710 if (err) 827 ret = written_count;
711 return err; 828 } else {
712 829 ret = -ENOSYS;
713 return written_count;
714 } 830 }
715 831
716 return -ENOSYS; 832out:
833 srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
834 return ret;
717} 835}
718 836
719static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) 837static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
720{ 838{
721 struct ib_uverbs_file *file = filp->private_data; 839 struct ib_uverbs_file *file = filp->private_data;
840 struct ib_device *ib_dev;
841 int ret = 0;
842 int srcu_key;
843
844 srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
845 ib_dev = srcu_dereference(file->device->ib_dev,
846 &file->device->disassociate_srcu);
847 if (!ib_dev) {
848 ret = -EIO;
849 goto out;
850 }
722 851
723 if (!file->ucontext) 852 if (!file->ucontext)
724 return -ENODEV; 853 ret = -ENODEV;
725 else 854 else
726 return file->device->ib_dev->mmap(file->ucontext, vma); 855 ret = ib_dev->mmap(file->ucontext, vma);
856out:
857 srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
858 return ret;
727} 859}
728 860
729/* 861/*
@@ -740,23 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
740{ 872{
741 struct ib_uverbs_device *dev; 873 struct ib_uverbs_device *dev;
742 struct ib_uverbs_file *file; 874 struct ib_uverbs_file *file;
875 struct ib_device *ib_dev;
743 int ret; 876 int ret;
877 int module_dependent;
878 int srcu_key;
744 879
745 dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); 880 dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
746 if (dev) 881 if (!atomic_inc_not_zero(&dev->refcount))
747 kref_get(&dev->ref);
748 else
749 return -ENXIO; 882 return -ENXIO;
750 883
751 if (!try_module_get(dev->ib_dev->owner)) { 884 srcu_key = srcu_read_lock(&dev->disassociate_srcu);
752 ret = -ENODEV; 885 mutex_lock(&dev->lists_mutex);
886 ib_dev = srcu_dereference(dev->ib_dev,
887 &dev->disassociate_srcu);
888 if (!ib_dev) {
889 ret = -EIO;
753 goto err; 890 goto err;
754 } 891 }
755 892
756 file = kmalloc(sizeof *file, GFP_KERNEL); 893 /* If the IB device supports disassociating ucontexts, there is no hard
894 * dependency between the uverbs device and its low-level device.
895 */
896 module_dependent = !(ib_dev->disassociate_ucontext);
897
898 if (module_dependent) {
899 if (!try_module_get(ib_dev->owner)) {
900 ret = -ENODEV;
901 goto err;
902 }
903 }
904
905 file = kzalloc(sizeof(*file), GFP_KERNEL);
757 if (!file) { 906 if (!file) {
758 ret = -ENOMEM; 907 ret = -ENOMEM;
759 goto err_module; 908 if (module_dependent)
909 goto err_module;
910
911 goto err;
760 } 912 }
761 913
762 file->device = dev; 914 file->device = dev;
@@ -766,27 +918,47 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
766 mutex_init(&file->mutex); 918 mutex_init(&file->mutex);
767 919
768 filp->private_data = file; 920 filp->private_data = file;
921 kobject_get(&dev->kobj);
922 list_add_tail(&file->list, &dev->uverbs_file_list);
923 mutex_unlock(&dev->lists_mutex);
924 srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
769 925
770 return nonseekable_open(inode, filp); 926 return nonseekable_open(inode, filp);
771 927
772err_module: 928err_module:
773 module_put(dev->ib_dev->owner); 929 module_put(ib_dev->owner);
774 930
775err: 931err:
776 kref_put(&dev->ref, ib_uverbs_release_dev); 932 mutex_unlock(&dev->lists_mutex);
933 srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
934 if (atomic_dec_and_test(&dev->refcount))
935 ib_uverbs_comp_dev(dev);
936
777 return ret; 937 return ret;
778} 938}
779 939
780static int ib_uverbs_close(struct inode *inode, struct file *filp) 940static int ib_uverbs_close(struct inode *inode, struct file *filp)
781{ 941{
782 struct ib_uverbs_file *file = filp->private_data; 942 struct ib_uverbs_file *file = filp->private_data;
783 943 struct ib_uverbs_device *dev = file->device;
784 ib_uverbs_cleanup_ucontext(file, file->ucontext); 944 struct ib_ucontext *ucontext = NULL;
945
946 mutex_lock(&file->device->lists_mutex);
947 ucontext = file->ucontext;
948 file->ucontext = NULL;
949 if (!file->is_closed) {
950 list_del(&file->list);
951 file->is_closed = 1;
952 }
953 mutex_unlock(&file->device->lists_mutex);
954 if (ucontext)
955 ib_uverbs_cleanup_ucontext(file, ucontext);
785 956
786 if (file->async_file) 957 if (file->async_file)
787 kref_put(&file->async_file->ref, ib_uverbs_release_event_file); 958 kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
788 959
789 kref_put(&file->ref, ib_uverbs_release_file); 960 kref_put(&file->ref, ib_uverbs_release_file);
961 kobject_put(&dev->kobj);
790 962
791 return 0; 963 return 0;
792} 964}
@@ -817,12 +989,21 @@ static struct ib_client uverbs_client = {
817static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, 989static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
818 char *buf) 990 char *buf)
819{ 991{
992 int ret = -ENODEV;
993 int srcu_key;
820 struct ib_uverbs_device *dev = dev_get_drvdata(device); 994 struct ib_uverbs_device *dev = dev_get_drvdata(device);
995 struct ib_device *ib_dev;
821 996
822 if (!dev) 997 if (!dev)
823 return -ENODEV; 998 return -ENODEV;
824 999
825 return sprintf(buf, "%s\n", dev->ib_dev->name); 1000 srcu_key = srcu_read_lock(&dev->disassociate_srcu);
1001 ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
1002 if (ib_dev)
1003 ret = sprintf(buf, "%s\n", ib_dev->name);
1004 srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
1005
1006 return ret;
826} 1007}
827static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); 1008static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
828 1009
@@ -830,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
830 struct device_attribute *attr, char *buf) 1011 struct device_attribute *attr, char *buf)
831{ 1012{
832 struct ib_uverbs_device *dev = dev_get_drvdata(device); 1013 struct ib_uverbs_device *dev = dev_get_drvdata(device);
1014 int ret = -ENODEV;
1015 int srcu_key;
1016 struct ib_device *ib_dev;
833 1017
834 if (!dev) 1018 if (!dev)
835 return -ENODEV; 1019 return -ENODEV;
1020 srcu_key = srcu_read_lock(&dev->disassociate_srcu);
1021 ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
1022 if (ib_dev)
1023 ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
1024 srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
836 1025
837 return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); 1026 return ret;
838} 1027}
839static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); 1028static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
840 1029
@@ -874,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
874 int devnum; 1063 int devnum;
875 dev_t base; 1064 dev_t base;
876 struct ib_uverbs_device *uverbs_dev; 1065 struct ib_uverbs_device *uverbs_dev;
1066 int ret;
877 1067
878 if (!device->alloc_ucontext) 1068 if (!device->alloc_ucontext)
879 return; 1069 return;
@@ -882,10 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
882 if (!uverbs_dev) 1072 if (!uverbs_dev)
883 return; 1073 return;
884 1074
885 kref_init(&uverbs_dev->ref); 1075 ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
1076 if (ret) {
1077 kfree(uverbs_dev);
1078 return;
1079 }
1080
1081 atomic_set(&uverbs_dev->refcount, 1);
886 init_completion(&uverbs_dev->comp); 1082 init_completion(&uverbs_dev->comp);
887 uverbs_dev->xrcd_tree = RB_ROOT; 1083 uverbs_dev->xrcd_tree = RB_ROOT;
888 mutex_init(&uverbs_dev->xrcd_tree_mutex); 1084 mutex_init(&uverbs_dev->xrcd_tree_mutex);
1085 kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
1086 mutex_init(&uverbs_dev->lists_mutex);
1087 INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
1088 INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
889 1089
890 spin_lock(&map_lock); 1090 spin_lock(&map_lock);
891 devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); 1091 devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -906,12 +1106,13 @@ static void ib_uverbs_add_one(struct ib_device *device)
906 } 1106 }
907 spin_unlock(&map_lock); 1107 spin_unlock(&map_lock);
908 1108
909 uverbs_dev->ib_dev = device; 1109 rcu_assign_pointer(uverbs_dev->ib_dev, device);
910 uverbs_dev->num_comp_vectors = device->num_comp_vectors; 1110 uverbs_dev->num_comp_vectors = device->num_comp_vectors;
911 1111
912 cdev_init(&uverbs_dev->cdev, NULL); 1112 cdev_init(&uverbs_dev->cdev, NULL);
913 uverbs_dev->cdev.owner = THIS_MODULE; 1113 uverbs_dev->cdev.owner = THIS_MODULE;
914 uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; 1114 uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
1115 uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
915 kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); 1116 kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
916 if (cdev_add(&uverbs_dev->cdev, base, 1)) 1117 if (cdev_add(&uverbs_dev->cdev, base, 1))
917 goto err_cdev; 1118 goto err_cdev;
@@ -942,15 +1143,79 @@ err_cdev:
942 clear_bit(devnum, overflow_map); 1143 clear_bit(devnum, overflow_map);
943 1144
944err: 1145err:
945 kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); 1146 if (atomic_dec_and_test(&uverbs_dev->refcount))
1147 ib_uverbs_comp_dev(uverbs_dev);
946 wait_for_completion(&uverbs_dev->comp); 1148 wait_for_completion(&uverbs_dev->comp);
947 kfree(uverbs_dev); 1149 kobject_put(&uverbs_dev->kobj);
948 return; 1150 return;
949} 1151}
950 1152
951static void ib_uverbs_remove_one(struct ib_device *device) 1153static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
1154 struct ib_device *ib_dev)
952{ 1155{
953 struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); 1156 struct ib_uverbs_file *file;
1157 struct ib_uverbs_event_file *event_file;
1158 struct ib_event event;
1159
1160 /* Pending running commands to terminate */
1161 synchronize_srcu(&uverbs_dev->disassociate_srcu);
1162 event.event = IB_EVENT_DEVICE_FATAL;
1163 event.element.port_num = 0;
1164 event.device = ib_dev;
1165
1166 mutex_lock(&uverbs_dev->lists_mutex);
1167 while (!list_empty(&uverbs_dev->uverbs_file_list)) {
1168 struct ib_ucontext *ucontext;
1169
1170 file = list_first_entry(&uverbs_dev->uverbs_file_list,
1171 struct ib_uverbs_file, list);
1172 file->is_closed = 1;
1173 ucontext = file->ucontext;
1174 list_del(&file->list);
1175 file->ucontext = NULL;
1176 kref_get(&file->ref);
1177 mutex_unlock(&uverbs_dev->lists_mutex);
1178 /* We must release the mutex before going ahead and calling
1179 * disassociate_ucontext. disassociate_ucontext might end up
1180 * indirectly calling uverbs_close, for example due to freeing
1181 * the resources (e.g mmput).
1182 */
1183 ib_uverbs_event_handler(&file->event_handler, &event);
1184 if (ucontext) {
1185 ib_dev->disassociate_ucontext(ucontext);
1186 ib_uverbs_cleanup_ucontext(file, ucontext);
1187 }
1188
1189 mutex_lock(&uverbs_dev->lists_mutex);
1190 kref_put(&file->ref, ib_uverbs_release_file);
1191 }
1192
1193 while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
1194 event_file = list_first_entry(&uverbs_dev->
1195 uverbs_events_file_list,
1196 struct ib_uverbs_event_file,
1197 list);
1198 spin_lock_irq(&event_file->lock);
1199 event_file->is_closed = 1;
1200 spin_unlock_irq(&event_file->lock);
1201
1202 list_del(&event_file->list);
1203 if (event_file->is_async) {
1204 ib_unregister_event_handler(&event_file->uverbs_file->
1205 event_handler);
1206 event_file->uverbs_file->event_handler.device = NULL;
1207 }
1208
1209 wake_up_interruptible(&event_file->poll_wait);
1210 kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
1211 }
1212 mutex_unlock(&uverbs_dev->lists_mutex);
1213}
1214
1215static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
1216{
1217 struct ib_uverbs_device *uverbs_dev = client_data;
1218 int wait_clients = 1;
954 1219
955 if (!uverbs_dev) 1220 if (!uverbs_dev)
956 return; 1221 return;
@@ -964,9 +1229,28 @@ static void ib_uverbs_remove_one(struct ib_device *device)
964 else 1229 else
965 clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); 1230 clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
966 1231
967 kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); 1232 if (device->disassociate_ucontext) {
968 wait_for_completion(&uverbs_dev->comp); 1233 /* We disassociate HW resources and immediately return.
969 kfree(uverbs_dev); 1234 * Userspace will see an EIO errno for all future access.
1235 * Upon returning, ib_device may be freed internally and is not
1236 * valid any more.
1237 * uverbs_device is still available until all clients close
1238 * their files, then the uverbs device ref count will be zero
1239 * and its resources will be freed.
1240 * Note: At this point no more files can be opened since the
1241 * cdev was deleted; however, active clients can still issue
1242 * commands and close their open files.
1243 */
1244 rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
1245 ib_uverbs_free_hw_resources(uverbs_dev, device);
1246 wait_clients = 0;
1247 }
1248
1249 if (atomic_dec_and_test(&uverbs_dev->refcount))
1250 ib_uverbs_comp_dev(uverbs_dev);
1251 if (wait_clients)
1252 wait_for_completion(&uverbs_dev->comp);
1253 kobject_put(&uverbs_dev->kobj);
970} 1254}
971 1255
972static char *uverbs_devnode(struct device *dev, umode_t *mode) 1256static char *uverbs_devnode(struct device *dev, umode_t *mode)
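Editor's note: the hot-removal support above reduces to one guard pattern that every file operation (write, mmap, the sysfs show functions) now repeats. A condensed sketch using only names introduced by the patch; the wrapper function itself is hypothetical.

	static int example_guarded_op(struct ib_uverbs_file *file)
	{
		struct ib_device *ib_dev;
		int srcu_key;
		int ret = 0;

		srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
		ib_dev = srcu_dereference(file->device->ib_dev,
					  &file->device->disassociate_srcu);
		if (!ib_dev) {
			ret = -EIO;	/* device was disassociated */
			goto out;
		}
		/* ... use ib_dev only inside this read-side section ... */
	out:
		srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
		return ret;
	}

On removal, ib_uverbs_remove_one() clears uverbs_dev->ib_dev and synchronize_srcu() waits for all such read-side sections to drain before the hardware resources are torn down.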
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 30eb2457000c..e1f2c9887f3f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
213 213
214/* Protection domains */ 214/* Protection domains */
215 215
216/**
217 * ib_alloc_pd - Allocates an unused protection domain.
218 * @device: The device on which to allocate the protection domain.
219 *
220 * A protection domain object provides an association between QPs, shared
221 * receive queues, address handles, memory regions, and memory windows.
222 *
223 * Every PD has a local_dma_lkey which can be used as the lkey value for local
224 * memory operations.
225 */
216struct ib_pd *ib_alloc_pd(struct ib_device *device) 226struct ib_pd *ib_alloc_pd(struct ib_device *device)
217{ 227{
218 struct ib_pd *pd; 228 struct ib_pd *pd;
229 struct ib_device_attr devattr;
230 int rc;
231
232 rc = ib_query_device(device, &devattr);
233 if (rc)
234 return ERR_PTR(rc);
219 235
220 pd = device->alloc_pd(device, NULL, NULL); 236 pd = device->alloc_pd(device, NULL, NULL);
237 if (IS_ERR(pd))
238 return pd;
239
240 pd->device = device;
241 pd->uobject = NULL;
242 pd->local_mr = NULL;
243 atomic_set(&pd->usecnt, 0);
244
245 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
246 pd->local_dma_lkey = device->local_dma_lkey;
247 else {
248 struct ib_mr *mr;
249
250 mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
251 if (IS_ERR(mr)) {
252 ib_dealloc_pd(pd);
253 return (struct ib_pd *)mr;
254 }
221 255
222 if (!IS_ERR(pd)) { 256 pd->local_mr = mr;
223 pd->device = device; 257 pd->local_dma_lkey = pd->local_mr->lkey;
224 pd->uobject = NULL;
225 atomic_set(&pd->usecnt, 0);
226 } 258 }
227
228 return pd; 259 return pd;
229} 260}
230EXPORT_SYMBOL(ib_alloc_pd); 261EXPORT_SYMBOL(ib_alloc_pd);
231 262
232int ib_dealloc_pd(struct ib_pd *pd) 263/**
264 * ib_dealloc_pd - Deallocates a protection domain.
265 * @pd: The protection domain to deallocate.
266 *
267 * It is an error to call this function while any resources in the pd still
268 * exist. The caller is responsible for destroying them synchronously and
269 * guaranteeing that no new allocations will happen.
270 */
271void ib_dealloc_pd(struct ib_pd *pd)
233{ 272{
234 if (atomic_read(&pd->usecnt)) 273 int ret;
235 return -EBUSY; 274
275 if (pd->local_mr) {
276 ret = ib_dereg_mr(pd->local_mr);
277 WARN_ON(ret);
278 pd->local_mr = NULL;
279 }
236 280
237 return pd->device->dealloc_pd(pd); 281 /* uverbs manipulates usecnt with proper locking, while the kabi
282 requires the caller to guarantee we can't race here. */
283 WARN_ON(atomic_read(&pd->usecnt));
284
285 /* Making dealloc_pd a void return is a WIP, no driver should return
286 an error here. */
287 ret = pd->device->dealloc_pd(pd);
288 WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
238} 289}
239EXPORT_SYMBOL(ib_dealloc_pd); 290EXPORT_SYMBOL(ib_dealloc_pd);
240 291
@@ -1168,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)
1168} 1219}
1169EXPORT_SYMBOL(ib_dereg_mr); 1220EXPORT_SYMBOL(ib_dereg_mr);
1170 1221
1171struct ib_mr *ib_create_mr(struct ib_pd *pd, 1222/**
1172 struct ib_mr_init_attr *mr_init_attr) 1223 * ib_alloc_mr() - Allocates a memory region
1173{ 1224 * @pd: protection domain associated with the region
1174 struct ib_mr *mr; 1225 * @mr_type: memory region type
1175 1226 * @max_num_sg: maximum sg entries available for registration.
1176 if (!pd->device->create_mr) 1227 *
1177 return ERR_PTR(-ENOSYS); 1228 * Notes:
1178 1229 * Memory registration page/sg lists must not exceed max_num_sg.
1179 mr = pd->device->create_mr(pd, mr_init_attr); 1230 * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
1180 1231 * max_num_sg * used_page_size.
1181 if (!IS_ERR(mr)) { 1232 *
1182 mr->device = pd->device; 1233 */
1183 mr->pd = pd; 1234struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
1184 mr->uobject = NULL; 1235 enum ib_mr_type mr_type,
1185 atomic_inc(&pd->usecnt); 1236 u32 max_num_sg)
1186 atomic_set(&mr->usecnt, 0);
1187 }
1188
1189 return mr;
1190}
1191EXPORT_SYMBOL(ib_create_mr);
1192
1193int ib_destroy_mr(struct ib_mr *mr)
1194{
1195 struct ib_pd *pd;
1196 int ret;
1197
1198 if (atomic_read(&mr->usecnt))
1199 return -EBUSY;
1200
1201 pd = mr->pd;
1202 ret = mr->device->destroy_mr(mr);
1203 if (!ret)
1204 atomic_dec(&pd->usecnt);
1205
1206 return ret;
1207}
1208EXPORT_SYMBOL(ib_destroy_mr);
1209
1210struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
1211{ 1237{
1212 struct ib_mr *mr; 1238 struct ib_mr *mr;
1213 1239
1214 if (!pd->device->alloc_fast_reg_mr) 1240 if (!pd->device->alloc_mr)
1215 return ERR_PTR(-ENOSYS); 1241 return ERR_PTR(-ENOSYS);
1216 1242
1217 mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); 1243 mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
1218
1219 if (!IS_ERR(mr)) { 1244 if (!IS_ERR(mr)) {
1220 mr->device = pd->device; 1245 mr->device = pd->device;
1221 mr->pd = pd; 1246 mr->pd = pd;
@@ -1226,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
1226 1251
1227 return mr; 1252 return mr;
1228} 1253}
1229EXPORT_SYMBOL(ib_alloc_fast_reg_mr); 1254EXPORT_SYMBOL(ib_alloc_mr);
1230 1255
1231struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, 1256struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
1232 int max_page_list_len) 1257 int max_page_list_len)
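Editor's note: the ib_create_mr()/ib_alloc_fast_reg_mr() pair is folded into a single ib_alloc_mr() verb above. A minimal caller sketch; the function name and the max_num_sg value of 32 are arbitrary choices for illustration.

	static int example_alloc_mr(struct ib_pd *pd)
	{
		struct ib_mr *mr;

		/* replaces ib_alloc_fast_reg_mr(pd, 32) */
		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 32);
		if (IS_ERR(mr))
			return PTR_ERR(mr);

		/* ... use mr in fast-registration work requests ... */

		return ib_dereg_mr(mr);
	}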
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index e900b03531a9..1bdb9996d371 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -1,8 +1,6 @@
1obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ 1obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/
2obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
3obj-$(CONFIG_INFINIBAND_QIB) += qib/ 2obj-$(CONFIG_INFINIBAND_QIB) += qib/
4obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ 3obj-$(CONFIG_INFINIBAND_EHCA) += ehca/
5obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/
6obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ 4obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/
7obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ 5obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/
8obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ 6obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index bbbe0184e592..93308c45f298 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -800,7 +800,9 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
800 return 0; 800 return 0;
801} 801}
802 802
803static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) 803static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
804 enum ib_mr_type mr_type,
805 u32 max_num_sg)
804{ 806{
805 struct iwch_dev *rhp; 807 struct iwch_dev *rhp;
806 struct iwch_pd *php; 808 struct iwch_pd *php;
@@ -809,6 +811,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
809 u32 stag = 0; 811 u32 stag = 0;
810 int ret = 0; 812 int ret = 0;
811 813
814 if (mr_type != IB_MR_TYPE_MEM_REG ||
815 max_num_sg > T3_MAX_FASTREG_DEPTH)
816 return ERR_PTR(-EINVAL);
817
812 php = to_iwch_pd(pd); 818 php = to_iwch_pd(pd);
813 rhp = php->rhp; 819 rhp = php->rhp;
814 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); 820 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -816,10 +822,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
816 goto err; 822 goto err;
817 823
818 mhp->rhp = rhp; 824 mhp->rhp = rhp;
819 ret = iwch_alloc_pbl(mhp, pbl_depth); 825 ret = iwch_alloc_pbl(mhp, max_num_sg);
820 if (ret) 826 if (ret)
821 goto err1; 827 goto err1;
822 mhp->attr.pbl_size = pbl_depth; 828 mhp->attr.pbl_size = max_num_sg;
823 ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid, 829 ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
824 mhp->attr.pbl_size, mhp->attr.pbl_addr); 830 mhp->attr.pbl_size, mhp->attr.pbl_addr);
825 if (ret) 831 if (ret)
@@ -1443,7 +1449,7 @@ int iwch_register_device(struct iwch_dev *dev)
1443 dev->ibdev.alloc_mw = iwch_alloc_mw; 1449 dev->ibdev.alloc_mw = iwch_alloc_mw;
1444 dev->ibdev.bind_mw = iwch_bind_mw; 1450 dev->ibdev.bind_mw = iwch_bind_mw;
1445 dev->ibdev.dealloc_mw = iwch_dealloc_mw; 1451 dev->ibdev.dealloc_mw = iwch_dealloc_mw;
1446 dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr; 1452 dev->ibdev.alloc_mr = iwch_alloc_mr;
1447 dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl; 1453 dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
1448 dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl; 1454 dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
1449 dev->ibdev.attach_mcast = iwch_multicast_attach; 1455 dev->ibdev.attach_mcast = iwch_multicast_attach;
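Editor's note: providers converting to the new hook follow the same shape as the cxgb3 change above: validate mr_type and max_num_sg up front, then reuse the old fast-reg allocation path. A generic sketch; the function name and the driver limit macro are hypothetical.

	static struct ib_mr *example_alloc_mr(struct ib_pd *pd,
					      enum ib_mr_type mr_type,
					      u32 max_num_sg)
	{
		if (mr_type != IB_MR_TYPE_MEM_REG ||
		    max_num_sg > EXAMPLE_MAX_FASTREG_DEPTH)	/* per-driver limit */
			return ERR_PTR(-EINVAL);

		/* ... allocate a PBL with max_num_sg entries and a STag,
		 * exactly as the old alloc_fast_reg_mr path did ...
		 */
		return ERR_PTR(-ENOSYS);	/* body elided in this sketch */
	}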
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 3ad8dc798f52..debc39d2cbc2 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -50,6 +50,7 @@
50#include <rdma/ib_addr.h> 50#include <rdma/ib_addr.h>
51 51
52#include "iw_cxgb4.h" 52#include "iw_cxgb4.h"
53#include "clip_tbl.h"
53 54
54static char *states[] = { 55static char *states[] = {
55 "idle", 56 "idle",
@@ -115,11 +116,11 @@ module_param(ep_timeout_secs, int, 0644);
115MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " 116MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
116 "in seconds (default=60)"); 117 "in seconds (default=60)");
117 118
118static int mpa_rev = 1; 119static int mpa_rev = 2;
119module_param(mpa_rev, int, 0644); 120module_param(mpa_rev, int, 0644);
120MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " 121MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
121 "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft" 122 "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
122 " compliant (default=1)"); 123 " compliant (default=2)");
123 124
124static int markers_enabled; 125static int markers_enabled;
125module_param(markers_enabled, int, 0644); 126module_param(markers_enabled, int, 0644);
@@ -298,6 +299,16 @@ void _c4iw_free_ep(struct kref *kref)
298 if (test_bit(QP_REFERENCED, &ep->com.flags)) 299 if (test_bit(QP_REFERENCED, &ep->com.flags))
299 deref_qp(ep); 300 deref_qp(ep);
300 if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { 301 if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
302 if (ep->com.remote_addr.ss_family == AF_INET6) {
303 struct sockaddr_in6 *sin6 =
304 (struct sockaddr_in6 *)
305 &ep->com.mapped_local_addr;
306
307 cxgb4_clip_release(
308 ep->com.dev->rdev.lldi.ports[0],
309 (const u32 *)&sin6->sin6_addr.s6_addr,
310 1);
311 }
301 remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); 312 remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
302 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); 313 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
303 dst_release(ep->dst); 314 dst_release(ep->dst);
@@ -442,6 +453,12 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
442 kfree_skb(skb); 453 kfree_skb(skb);
443 connect_reply_upcall(ep, -EHOSTUNREACH); 454 connect_reply_upcall(ep, -EHOSTUNREACH);
444 state_set(&ep->com, DEAD); 455 state_set(&ep->com, DEAD);
456 if (ep->com.remote_addr.ss_family == AF_INET6) {
457 struct sockaddr_in6 *sin6 =
458 (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
459 cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
460 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
461 }
445 remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); 462 remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
446 cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); 463 cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
447 dst_release(ep->dst); 464 dst_release(ep->dst);
@@ -640,6 +657,7 @@ static int send_connect(struct c4iw_ep *ep)
640 struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) 657 struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)
641 &ep->com.mapped_remote_addr; 658 &ep->com.mapped_remote_addr;
642 int win; 659 int win;
660 int ret;
643 661
644 wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? 662 wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
645 roundup(sizev4, 16) : 663 roundup(sizev4, 16) :
@@ -693,6 +711,11 @@ static int send_connect(struct c4iw_ep *ep)
693 opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); 711 opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE);
694 opt2 |= T5_ISS_F; 712 opt2 |= T5_ISS_F;
695 } 713 }
714
715 if (ep->com.remote_addr.ss_family == AF_INET6)
716 cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
717 (const u32 *)&la6->sin6_addr.s6_addr, 1);
718
696 t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); 719 t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);
697 720
698 if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { 721 if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {
@@ -790,7 +813,11 @@ static int send_connect(struct c4iw_ep *ep)
790 } 813 }
791 814
792 set_bit(ACT_OPEN_REQ, &ep->com.history); 815 set_bit(ACT_OPEN_REQ, &ep->com.history);
793 return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); 816 ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
817 if (ret && ep->com.remote_addr.ss_family == AF_INET6)
818 cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
819 (const u32 *)&la6->sin6_addr.s6_addr, 1);
820 return ret;
794} 821}
795 822
796static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, 823static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
@@ -2091,6 +2118,15 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
2091 case CPL_ERR_CONN_EXIST: 2118 case CPL_ERR_CONN_EXIST:
2092 if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { 2119 if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
2093 set_bit(ACT_RETRY_INUSE, &ep->com.history); 2120 set_bit(ACT_RETRY_INUSE, &ep->com.history);
2121 if (ep->com.remote_addr.ss_family == AF_INET6) {
2122 struct sockaddr_in6 *sin6 =
2123 (struct sockaddr_in6 *)
2124 &ep->com.mapped_local_addr;
2125 cxgb4_clip_release(
2126 ep->com.dev->rdev.lldi.ports[0],
2127 (const u32 *)
2128 &sin6->sin6_addr.s6_addr, 1);
2129 }
2094 remove_handle(ep->com.dev, &ep->com.dev->atid_idr, 2130 remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
2095 atid); 2131 atid);
2096 cxgb4_free_atid(t, atid); 2132 cxgb4_free_atid(t, atid);
@@ -2118,6 +2154,12 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
2118 connect_reply_upcall(ep, status2errno(status)); 2154 connect_reply_upcall(ep, status2errno(status));
2119 state_set(&ep->com, DEAD); 2155 state_set(&ep->com, DEAD);
2120 2156
2157 if (ep->com.remote_addr.ss_family == AF_INET6) {
2158 struct sockaddr_in6 *sin6 =
2159 (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
2160 cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
2161 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
2162 }
2121 if (status && act_open_has_tid(status)) 2163 if (status && act_open_has_tid(status))
2122 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl)); 2164 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
2123 2165
@@ -2302,6 +2344,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
2302 struct dst_entry *dst; 2344 struct dst_entry *dst;
2303 __u8 local_ip[16], peer_ip[16]; 2345 __u8 local_ip[16], peer_ip[16];
2304 __be16 local_port, peer_port; 2346 __be16 local_port, peer_port;
2347 struct sockaddr_in6 *sin6;
2305 int err; 2348 int err;
2306 u16 peer_mss = ntohs(req->tcpopt.mss); 2349 u16 peer_mss = ntohs(req->tcpopt.mss);
2307 int iptype; 2350 int iptype;
@@ -2400,9 +2443,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
2400 sin->sin_port = peer_port; 2443 sin->sin_port = peer_port;
2401 sin->sin_addr.s_addr = *(__be32 *)peer_ip; 2444 sin->sin_addr.s_addr = *(__be32 *)peer_ip;
2402 } else { 2445 } else {
2403 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) 2446 sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
2404 &child_ep->com.mapped_local_addr;
2405
2406 sin6->sin6_family = PF_INET6; 2447 sin6->sin6_family = PF_INET6;
2407 sin6->sin6_port = local_port; 2448 sin6->sin6_port = local_port;
2408 memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); 2449 memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
@@ -2436,6 +2477,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
2436 insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); 2477 insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
2437 accept_cr(child_ep, skb, req); 2478 accept_cr(child_ep, skb, req);
2438 set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); 2479 set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
2480 if (iptype == 6) {
2481 sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
2482 cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
2483 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
2484 }
2439 goto out; 2485 goto out;
2440reject: 2486reject:
2441 reject_cr(dev, hwtid, skb); 2487 reject_cr(dev, hwtid, skb);
@@ -2672,6 +2718,15 @@ out:
2672 if (release) 2718 if (release)
2673 release_ep_resources(ep); 2719 release_ep_resources(ep);
2674 else if (ep->retry_with_mpa_v1) { 2720 else if (ep->retry_with_mpa_v1) {
2721 if (ep->com.remote_addr.ss_family == AF_INET6) {
2722 struct sockaddr_in6 *sin6 =
2723 (struct sockaddr_in6 *)
2724 &ep->com.mapped_local_addr;
2725 cxgb4_clip_release(
2726 ep->com.dev->rdev.lldi.ports[0],
2727 (const u32 *)&sin6->sin6_addr.s6_addr,
2728 1);
2729 }
2675 remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); 2730 remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
2676 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); 2731 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
2677 dst_release(ep->dst); 2732 dst_release(ep->dst);
@@ -2976,7 +3031,7 @@ static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
2976 struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr; 3031 struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr;
2977 struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr; 3032 struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr;
2978 3033
2979 if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { 3034 if (!get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
2980 memcpy(la6->sin6_addr.s6_addr, &addr, 16); 3035 memcpy(la6->sin6_addr.s6_addr, &addr, 16);
2981 memcpy(ra6->sin6_addr.s6_addr, &addr, 16); 3036 memcpy(ra6->sin6_addr.s6_addr, &addr, 16);
2982 return 0; 3037 return 0;
@@ -3186,6 +3241,9 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
3186 pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", 3241 pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
3187 err, ep->stid, 3242 err, ep->stid,
3188 sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); 3243 sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
3244 else
3245 cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
3246 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
3189 return err; 3247 return err;
3190} 3248}
3191 3249
@@ -3334,6 +3392,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
3334 ep->com.dev->rdev.lldi.ports[0], ep->stid, 3392 ep->com.dev->rdev.lldi.ports[0], ep->stid,
3335 ep->com.dev->rdev.lldi.rxq_ids[0], 0); 3393 ep->com.dev->rdev.lldi.rxq_ids[0], 0);
3336 } else { 3394 } else {
3395 struct sockaddr_in6 *sin6;
3337 c4iw_init_wr_wait(&ep->com.wr_wait); 3396 c4iw_init_wr_wait(&ep->com.wr_wait);
3338 err = cxgb4_remove_server( 3397 err = cxgb4_remove_server(
3339 ep->com.dev->rdev.lldi.ports[0], ep->stid, 3398 ep->com.dev->rdev.lldi.ports[0], ep->stid,
@@ -3342,6 +3401,9 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
3342 goto done; 3401 goto done;
3343 err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 3402 err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
3344 0, 0, __func__); 3403 0, 0, __func__);
3404 sin6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
3405 cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
3406 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
3345 } 3407 }
3346 remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); 3408 remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
3347 cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, 3409 cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
@@ -3461,6 +3523,12 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
3461 mutex_unlock(&dev->rdev.stats.lock); 3523 mutex_unlock(&dev->rdev.stats.lock);
3462 connect_reply_upcall(ep, status2errno(req->retval)); 3524 connect_reply_upcall(ep, status2errno(req->retval));
3463 state_set(&ep->com, DEAD); 3525 state_set(&ep->com, DEAD);
3526 if (ep->com.remote_addr.ss_family == AF_INET6) {
3527 struct sockaddr_in6 *sin6 =
3528 (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
3529 cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
3530 (const u32 *)&sin6->sin6_addr.s6_addr, 1);
3531 }
3464 remove_handle(dev, &dev->atid_idr, atid); 3532 remove_handle(dev, &dev->atid_idr, atid);
3465 cxgb4_free_atid(dev->rdev.lldi.tids, atid); 3533 cxgb4_free_atid(dev->rdev.lldi.tids, atid);
3466 dst_release(ep->dst); 3534 dst_release(ep->dst);
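Editor's note: the IPv6 additions to cm.c above all implement one invariant: take a CLIP table reference whenever a local IPv6 address becomes live on an endpoint or listener, and release it on every teardown and error path. A condensed sketch of the pairing; the wrapper helpers are hypothetical, while the cxgb4_clip_get()/cxgb4_clip_release() calls are taken from the patch.

	static void example_clip_get(struct c4iw_ep *ep)
	{
		struct sockaddr_in6 *sin6 =
			(struct sockaddr_in6 *)&ep->com.mapped_local_addr;

		if (ep->com.remote_addr.ss_family == AF_INET6)
			cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
				       (const u32 *)&sin6->sin6_addr.s6_addr, 1);
	}

	static void example_clip_release(struct c4iw_ep *ep)
	{
		struct sockaddr_in6 *sin6 =
			(struct sockaddr_in6 *)&ep->com.mapped_local_addr;

		if (ep->com.remote_addr.ss_family == AF_INET6)
			cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
					   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
	}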
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index cc77844fada3..c7bb38c931a5 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -970,7 +970,9 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
970struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl( 970struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
971 struct ib_device *device, 971 struct ib_device *device,
972 int page_list_len); 972 int page_list_len);
973struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth); 973struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
974 enum ib_mr_type mr_type,
975 u32 max_num_sg);
974int c4iw_dealloc_mw(struct ib_mw *mw); 976int c4iw_dealloc_mw(struct ib_mw *mw);
975struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); 977struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
976struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, 978struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index cff815b91707..026b91ebd5e2 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -853,7 +853,9 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
853 return 0; 853 return 0;
854} 854}
855 855
856struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) 856struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
857 enum ib_mr_type mr_type,
858 u32 max_num_sg)
857{ 859{
858 struct c4iw_dev *rhp; 860 struct c4iw_dev *rhp;
859 struct c4iw_pd *php; 861 struct c4iw_pd *php;
@@ -862,6 +864,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
862 u32 stag = 0; 864 u32 stag = 0;
863 int ret = 0; 865 int ret = 0;
864 866
867 if (mr_type != IB_MR_TYPE_MEM_REG ||
868 max_num_sg > t4_max_fr_depth(use_dsgl))
869 return ERR_PTR(-EINVAL);
870
865 php = to_c4iw_pd(pd); 871 php = to_c4iw_pd(pd);
866 rhp = php->rhp; 872 rhp = php->rhp;
867 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); 873 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -871,10 +877,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
871 } 877 }
872 878
873 mhp->rhp = rhp; 879 mhp->rhp = rhp;
874 ret = alloc_pbl(mhp, pbl_depth); 880 ret = alloc_pbl(mhp, max_num_sg);
875 if (ret) 881 if (ret)
876 goto err1; 882 goto err1;
877 mhp->attr.pbl_size = pbl_depth; 883 mhp->attr.pbl_size = max_num_sg;
878 ret = allocate_stag(&rhp->rdev, &stag, php->pdid, 884 ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
879 mhp->attr.pbl_size, mhp->attr.pbl_addr); 885 mhp->attr.pbl_size, mhp->attr.pbl_addr);
880 if (ret) 886 if (ret)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 6eee3d385541..7746113552e7 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -556,7 +556,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
556 dev->ibdev.alloc_mw = c4iw_alloc_mw; 556 dev->ibdev.alloc_mw = c4iw_alloc_mw;
557 dev->ibdev.bind_mw = c4iw_bind_mw; 557 dev->ibdev.bind_mw = c4iw_bind_mw;
558 dev->ibdev.dealloc_mw = c4iw_dealloc_mw; 558 dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
559 dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr; 559 dev->ibdev.alloc_mr = c4iw_alloc_mr;
560 dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl; 560 dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
561 dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl; 561 dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
562 dev->ibdev.attach_mcast = c4iw_multicast_attach; 562 dev->ibdev.attach_mcast = c4iw_multicast_attach;
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index f50a546224ad..1688a17de4fe 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -89,7 +89,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
89 if (vlan_tag < 0x1000) 89 if (vlan_tag < 0x1000)
90 vlan_tag |= (ah_attr->sl & 7) << 13; 90 vlan_tag |= (ah_attr->sl & 7) << 13;
91 ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); 91 ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
92 ah->av.eth.gid_index = ah_attr->grh.sgid_index; 92 ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index);
93 ah->av.eth.vlan = cpu_to_be16(vlan_tag); 93 ah->av.eth.vlan = cpu_to_be16(vlan_tag);
94 if (ah_attr->static_rate) { 94 if (ah_attr->static_rate) {
95 ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; 95 ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
@@ -148,9 +148,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
148 enum rdma_link_layer ll; 148 enum rdma_link_layer ll;
149 149
150 memset(ah_attr, 0, sizeof *ah_attr); 150 memset(ah_attr, 0, sizeof *ah_attr);
151 ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
152 ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; 151 ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
153 ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); 152 ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
153 if (ll == IB_LINK_LAYER_ETHERNET)
154 ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29;
155 else
156 ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
157
154 ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; 158 ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
155 if (ah->av.ib.stat_rate) 159 if (ah->av.ib.stat_rate)
156 ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; 160 ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 180a8f7ec82d..5fd49f9435f9 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -638,7 +638,7 @@ static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
638 * simulated FLUSH_ERR completions 638 * simulated FLUSH_ERR completions
639 */ 639 */
640 list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { 640 list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
641 mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1); 641 mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
642 if (*npolled >= num_entries) 642 if (*npolled >= num_entries)
643 goto out; 643 goto out;
644 } 644 }
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 68b3dfa922bf..1cd75ff02251 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -580,7 +580,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
580 580
581 list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; 581 list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
582 list.length = sizeof (struct mlx4_rcv_tunnel_mad); 582 list.length = sizeof (struct mlx4_rcv_tunnel_mad);
583 list.lkey = tun_ctx->mr->lkey; 583 list.lkey = tun_ctx->pd->local_dma_lkey;
584 584
585 wr.wr.ud.ah = ah; 585 wr.wr.ud.ah = ah;
586 wr.wr.ud.port_num = port; 586 wr.wr.ud.port_num = port;
@@ -1133,7 +1133,7 @@ static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
1133 1133
1134 sg_list.addr = tun_qp->ring[index].map; 1134 sg_list.addr = tun_qp->ring[index].map;
1135 sg_list.length = size; 1135 sg_list.length = size;
1136 sg_list.lkey = ctx->mr->lkey; 1136 sg_list.lkey = ctx->pd->local_dma_lkey;
1137 1137
1138 recv_wr.next = NULL; 1138 recv_wr.next = NULL;
1139 recv_wr.sg_list = &sg_list; 1139 recv_wr.sg_list = &sg_list;
@@ -1244,7 +1244,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
1244 1244
1245 list.addr = sqp->tx_ring[wire_tx_ix].buf.map; 1245 list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
1246 list.length = sizeof (struct mlx4_mad_snd_buf); 1246 list.length = sizeof (struct mlx4_mad_snd_buf);
1247 list.lkey = sqp_ctx->mr->lkey; 1247 list.lkey = sqp_ctx->pd->local_dma_lkey;
1248 1248
1249 wr.wr.ud.ah = ah; 1249 wr.wr.ud.ah = ah;
1250 wr.wr.ud.port_num = port; 1250 wr.wr.ud.port_num = port;
@@ -1827,19 +1827,12 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
1827 goto err_cq; 1827 goto err_cq;
1828 } 1828 }
1829 1829
1830 ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
1831 if (IS_ERR(ctx->mr)) {
1832 ret = PTR_ERR(ctx->mr);
1833 pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
1834 goto err_pd;
1835 }
1836
1837 if (ctx->has_smi) { 1830 if (ctx->has_smi) {
1838 ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); 1831 ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
1839 if (ret) { 1832 if (ret) {
1840 pr_err("Couldn't create %s QP0 (%d)\n", 1833 pr_err("Couldn't create %s QP0 (%d)\n",
1841 create_tun ? "tunnel for" : "", ret); 1834 create_tun ? "tunnel for" : "", ret);
1842 goto err_mr; 1835 goto err_pd;
1843 } 1836 }
1844 } 1837 }
1845 1838
@@ -1876,10 +1869,6 @@ err_qp0:
1876 ib_destroy_qp(ctx->qp[0].qp); 1869 ib_destroy_qp(ctx->qp[0].qp);
1877 ctx->qp[0].qp = NULL; 1870 ctx->qp[0].qp = NULL;
1878 1871
1879err_mr:
1880 ib_dereg_mr(ctx->mr);
1881 ctx->mr = NULL;
1882
1883err_pd: 1872err_pd:
1884 ib_dealloc_pd(ctx->pd); 1873 ib_dealloc_pd(ctx->pd);
1885 ctx->pd = NULL; 1874 ctx->pd = NULL;
@@ -1916,8 +1905,6 @@ static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
1916 ib_destroy_qp(ctx->qp[1].qp); 1905 ib_destroy_qp(ctx->qp[1].qp);
1917 ctx->qp[1].qp = NULL; 1906 ctx->qp[1].qp = NULL;
1918 mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); 1907 mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
1919 ib_dereg_mr(ctx->mr);
1920 ctx->mr = NULL;
1921 ib_dealloc_pd(ctx->pd); 1908 ib_dealloc_pd(ctx->pd);
1922 ctx->pd = NULL; 1909 ctx->pd = NULL;
1923 ib_destroy_cq(ctx->cq); 1910 ib_destroy_cq(ctx->cq);
@@ -2050,8 +2037,6 @@ static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
2050 ib_destroy_qp(sqp_ctx->qp[1].qp); 2037 ib_destroy_qp(sqp_ctx->qp[1].qp);
2051 sqp_ctx->qp[1].qp = NULL; 2038 sqp_ctx->qp[1].qp = NULL;
2052 mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); 2039 mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
2053 ib_dereg_mr(sqp_ctx->mr);
2054 sqp_ctx->mr = NULL;
2055 ib_dealloc_pd(sqp_ctx->pd); 2040 ib_dealloc_pd(sqp_ctx->pd);
2056 sqp_ctx->pd = NULL; 2041 sqp_ctx->pd = NULL;
2057 ib_destroy_cq(sqp_ctx->cq); 2042 ib_destroy_cq(sqp_ctx->cq);
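The mad.c hunks above drop the per-context ib_get_dma_mr() registrations and instead take the lkey for DMA-mapped buffers from the protection domain. As a minimal sketch (fill_sge_sketch is an assumed helper name, not part of the patch), describing such a buffer now looks like:

#include <rdma/ib_verbs.h>

/* Sketch: fill an SGE for an already DMA-mapped buffer using the PD's reserved lkey. */
static void fill_sge_sketch(struct ib_sge *sge, struct ib_pd *pd,
			    u64 dma_addr, u32 len)
{
	sge->addr   = dma_addr;
	sge->length = len;
	sge->lkey   = pd->local_dma_lkey;	/* replaces ctx->mr->lkey */
}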
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 8be6db816460..efecdf0216d8 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -45,6 +45,9 @@
45#include <rdma/ib_smi.h> 45#include <rdma/ib_smi.h>
46#include <rdma/ib_user_verbs.h> 46#include <rdma/ib_user_verbs.h>
47#include <rdma/ib_addr.h> 47#include <rdma/ib_addr.h>
48#include <rdma/ib_cache.h>
49
50#include <net/bonding.h>
48 51
49#include <linux/mlx4/driver.h> 52#include <linux/mlx4/driver.h>
50#include <linux/mlx4/cmd.h> 53#include <linux/mlx4/cmd.h>
@@ -74,13 +77,6 @@ static const char mlx4_ib_version[] =
74 DRV_NAME ": Mellanox ConnectX InfiniBand driver v" 77 DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
75 DRV_VERSION " (" DRV_RELDATE ")\n"; 78 DRV_VERSION " (" DRV_RELDATE ")\n";
76 79
77struct update_gid_work {
78 struct work_struct work;
79 union ib_gid gids[128];
80 struct mlx4_ib_dev *dev;
81 int port;
82};
83
84static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); 80static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
85 81
86static struct workqueue_struct *wq; 82static struct workqueue_struct *wq;
@@ -93,8 +89,6 @@ static void init_query_mad(struct ib_smp *mad)
93 mad->method = IB_MGMT_METHOD_GET; 89 mad->method = IB_MGMT_METHOD_GET;
94} 90}
95 91
96static union ib_gid zgid;
97
98static int check_flow_steering_support(struct mlx4_dev *dev) 92static int check_flow_steering_support(struct mlx4_dev *dev)
99{ 93{
100 int eth_num_ports = 0; 94 int eth_num_ports = 0;
@@ -131,6 +125,237 @@ static int num_ib_ports(struct mlx4_dev *dev)
131 return ib_ports; 125 return ib_ports;
132} 126}
133 127
128static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
129{
130 struct mlx4_ib_dev *ibdev = to_mdev(device);
131 struct net_device *dev;
132
133 rcu_read_lock();
134 dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
135
136 if (dev) {
137 if (mlx4_is_bonded(ibdev->dev)) {
138 struct net_device *upper = NULL;
139
140 upper = netdev_master_upper_dev_get_rcu(dev);
141 if (upper) {
142 struct net_device *active;
143
144 active = bond_option_active_slave_get_rcu(netdev_priv(upper));
145 if (active)
146 dev = active;
147 }
148 }
149 }
150 if (dev)
151 dev_hold(dev);
152
153 rcu_read_unlock();
154 return dev;
155}
156
157static int mlx4_ib_update_gids(struct gid_entry *gids,
158 struct mlx4_ib_dev *ibdev,
159 u8 port_num)
160{
161 struct mlx4_cmd_mailbox *mailbox;
162 int err;
163 struct mlx4_dev *dev = ibdev->dev;
164 int i;
165 union ib_gid *gid_tbl;
166
167 mailbox = mlx4_alloc_cmd_mailbox(dev);
168 if (IS_ERR(mailbox))
169 return -ENOMEM;
170
171 gid_tbl = mailbox->buf;
172
173 for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
174 memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
175
176 err = mlx4_cmd(dev, mailbox->dma,
177 MLX4_SET_PORT_GID_TABLE << 8 | port_num,
178 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
179 MLX4_CMD_WRAPPED);
180 if (mlx4_is_bonded(dev))
181 err += mlx4_cmd(dev, mailbox->dma,
182 MLX4_SET_PORT_GID_TABLE << 8 | 2,
183 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
184 MLX4_CMD_WRAPPED);
185
186 mlx4_free_cmd_mailbox(dev, mailbox);
187 return err;
188}
189
190static int mlx4_ib_add_gid(struct ib_device *device,
191 u8 port_num,
192 unsigned int index,
193 const union ib_gid *gid,
194 const struct ib_gid_attr *attr,
195 void **context)
196{
197 struct mlx4_ib_dev *ibdev = to_mdev(device);
198 struct mlx4_ib_iboe *iboe = &ibdev->iboe;
199 struct mlx4_port_gid_table *port_gid_table;
200 int free = -1, found = -1;
201 int ret = 0;
202 int hw_update = 0;
203 int i;
204 struct gid_entry *gids = NULL;
205
206 if (!rdma_cap_roce_gid_table(device, port_num))
207 return -EINVAL;
208
209 if (port_num > MLX4_MAX_PORTS)
210 return -EINVAL;
211
212 if (!context)
213 return -EINVAL;
214
215 port_gid_table = &iboe->gids[port_num - 1];
216 spin_lock_bh(&iboe->lock);
217 for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
218 if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
219 found = i;
220 break;
221 }
222 if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
223 free = i; /* HW has space */
224 }
225
226 if (found < 0) {
227 if (free < 0) {
228 ret = -ENOSPC;
229 } else {
230 port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC);
231 if (!port_gid_table->gids[free].ctx) {
232 ret = -ENOMEM;
233 } else {
234 *context = port_gid_table->gids[free].ctx;
235 memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
236 port_gid_table->gids[free].ctx->real_index = free;
237 port_gid_table->gids[free].ctx->refcount = 1;
238 hw_update = 1;
239 }
240 }
241 } else {
242 struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
243 *context = ctx;
244 ctx->refcount++;
245 }
246 if (!ret && hw_update) {
247 gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
248 if (!gids) {
249 ret = -ENOMEM;
250 } else {
251 for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
252 memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
253 }
254 }
255 spin_unlock_bh(&iboe->lock);
256
257 if (!ret && hw_update) {
258 ret = mlx4_ib_update_gids(gids, ibdev, port_num);
259 kfree(gids);
260 }
261
262 return ret;
263}
264
265static int mlx4_ib_del_gid(struct ib_device *device,
266 u8 port_num,
267 unsigned int index,
268 void **context)
269{
270 struct gid_cache_context *ctx = *context;
271 struct mlx4_ib_dev *ibdev = to_mdev(device);
272 struct mlx4_ib_iboe *iboe = &ibdev->iboe;
273 struct mlx4_port_gid_table *port_gid_table;
274 int ret = 0;
275 int hw_update = 0;
276 struct gid_entry *gids = NULL;
277
278 if (!rdma_cap_roce_gid_table(device, port_num))
279 return -EINVAL;
280
281 if (port_num > MLX4_MAX_PORTS)
282 return -EINVAL;
283
284 port_gid_table = &iboe->gids[port_num - 1];
285 spin_lock_bh(&iboe->lock);
286 if (ctx) {
287 ctx->refcount--;
288 if (!ctx->refcount) {
289 unsigned int real_index = ctx->real_index;
290
291 memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid));
292 kfree(port_gid_table->gids[real_index].ctx);
293 port_gid_table->gids[real_index].ctx = NULL;
294 hw_update = 1;
295 }
296 }
297 if (!ret && hw_update) {
298 int i;
299
300 gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
301 if (!gids) {
302 ret = -ENOMEM;
303 } else {
304 for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
305 memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
306 }
307 }
308 spin_unlock_bh(&iboe->lock);
309
310 if (!ret && hw_update) {
311 ret = mlx4_ib_update_gids(gids, ibdev, port_num);
312 kfree(gids);
313 }
314 return ret;
315}
316
317int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
318 u8 port_num, int index)
319{
320 struct mlx4_ib_iboe *iboe = &ibdev->iboe;
321 struct gid_cache_context *ctx = NULL;
322 union ib_gid gid;
323 struct mlx4_port_gid_table *port_gid_table;
324 int real_index = -EINVAL;
325 int i;
326 int ret;
327 unsigned long flags;
328
329 if (port_num > MLX4_MAX_PORTS)
330 return -EINVAL;
331
332 if (mlx4_is_bonded(ibdev->dev))
333 port_num = 1;
334
335 if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
336 return index;
337
338 ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid);
339 if (ret)
340 return ret;
341
342 if (!memcmp(&gid, &zgid, sizeof(gid)))
343 return -EINVAL;
344
345 spin_lock_irqsave(&iboe->lock, flags);
346 port_gid_table = &iboe->gids[port_num - 1];
347
348 for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
349 if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
350 ctx = port_gid_table->gids[i].ctx;
351 break;
352 }
353 if (ctx)
354 real_index = ctx->real_index;
355 spin_unlock_irqrestore(&iboe->lock, flags);
356 return real_index;
357}
358
134static int mlx4_ib_query_device(struct ib_device *ibdev, 359static int mlx4_ib_query_device(struct ib_device *ibdev,
135 struct ib_device_attr *props, 360 struct ib_device_attr *props,
136 struct ib_udata *uhw) 361 struct ib_udata *uhw)
@@ -229,6 +454,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
229 props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; 454 props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
230 props->max_sge = min(dev->dev->caps.max_sq_sg, 455 props->max_sge = min(dev->dev->caps.max_sq_sg,
231 dev->dev->caps.max_rq_sg); 456 dev->dev->caps.max_rq_sg);
457 props->max_sge_rd = props->max_sge;
232 props->max_cq = dev->dev->quotas.cq; 458 props->max_cq = dev->dev->quotas.cq;
233 props->max_cqe = dev->dev->caps.max_cqes; 459 props->max_cqe = dev->dev->caps.max_cqes;
234 props->max_mr = dev->dev->quotas.mpt; 460 props->max_mr = dev->dev->quotas.mpt;
@@ -414,12 +640,13 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
414 props->state = IB_PORT_DOWN; 640 props->state = IB_PORT_DOWN;
415 props->phys_state = state_to_phys_state(props->state); 641 props->phys_state = state_to_phys_state(props->state);
416 props->active_mtu = IB_MTU_256; 642 props->active_mtu = IB_MTU_256;
417 if (is_bonded)
418 rtnl_lock(); /* required to get upper dev */
419 spin_lock_bh(&iboe->lock); 643 spin_lock_bh(&iboe->lock);
420 ndev = iboe->netdevs[port - 1]; 644 ndev = iboe->netdevs[port - 1];
421 if (ndev && is_bonded) 645 if (ndev && is_bonded) {
422 ndev = netdev_master_upper_dev_get(ndev); 646 rcu_read_lock(); /* required to get upper dev */
647 ndev = netdev_master_upper_dev_get_rcu(ndev);
648 rcu_read_unlock();
649 }
423 if (!ndev) 650 if (!ndev)
424 goto out_unlock; 651 goto out_unlock;
425 652
@@ -431,8 +658,6 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
431 props->phys_state = state_to_phys_state(props->state); 658 props->phys_state = state_to_phys_state(props->state);
432out_unlock: 659out_unlock:
433 spin_unlock_bh(&iboe->lock); 660 spin_unlock_bh(&iboe->lock);
434 if (is_bonded)
435 rtnl_unlock();
436out: 661out:
437 mlx4_free_cmd_mailbox(mdev->dev, mailbox); 662 mlx4_free_cmd_mailbox(mdev->dev, mailbox);
438 return err; 663 return err;
@@ -515,23 +740,27 @@ out:
515 return err; 740 return err;
516} 741}
517 742
518static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
519 union ib_gid *gid)
520{
521 struct mlx4_ib_dev *dev = to_mdev(ibdev);
522
523 *gid = dev->iboe.gid_table[port - 1][index];
524
525 return 0;
526}
527
528static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, 743static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
529 union ib_gid *gid) 744 union ib_gid *gid)
530{ 745{
531 if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) 746 int ret;
747
748 if (rdma_protocol_ib(ibdev, port))
532 return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); 749 return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
533 else 750
534 return iboe_query_gid(ibdev, port, index, gid); 751 if (!rdma_protocol_roce(ibdev, port))
752 return -ENODEV;
753
754 if (!rdma_cap_roce_gid_table(ibdev, port))
755 return -ENODEV;
756
757 ret = ib_get_cached_gid(ibdev, port, index, gid);
758 if (ret == -EAGAIN) {
759 memcpy(gid, &zgid, sizeof(*gid));
760 return 0;
761 }
762
763 return ret;
535} 764}
536 765
537int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, 766int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -692,7 +921,7 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
692 resp.cqe_size = dev->dev->caps.cqe_size; 921 resp.cqe_size = dev->dev->caps.cqe_size;
693 } 922 }
694 923
695 context = kmalloc(sizeof *context, GFP_KERNEL); 924 context = kzalloc(sizeof(*context), GFP_KERNEL);
696 if (!context) 925 if (!context)
697 return ERR_PTR(-ENOMEM); 926 return ERR_PTR(-ENOMEM);
698 927
@@ -729,21 +958,143 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
729 return 0; 958 return 0;
730} 959}
731 960
961static void mlx4_ib_vma_open(struct vm_area_struct *area)
962{
963 /* vma_open is called when a new VMA is created on top of our VMA.
964 * This is done through either mremap flow or split_vma (usually due
965 * to mlock, madvise, munmap, etc.). We do not support a clone of the
966 * vma, as this VMA is strongly hardware related. Therefore we set the
967 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
968 * calling us again and trying to do incorrect actions. We assume that
 969 * the original vma size is exactly a single page, and that there will
 970 * be no "splitting" operations on it.
971 */
972 area->vm_ops = NULL;
973}
974
975static void mlx4_ib_vma_close(struct vm_area_struct *area)
976{
977 struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
978
979 /* It's guaranteed that all VMAs opened on a FD are closed before the
980 * file itself is closed, therefore no sync is needed with the regular
 981 * closing flow (e.g. mlx4_ib_dealloc_ucontext). However, a sync is needed
 982 * with mlx4_ib_disassociate_ucontext, which also accesses the vma.
 983 * The close operation is usually called under mm->mmap_sem, except when
 984 * the process is exiting. The exiting case is handled explicitly as part
985 * of mlx4_ib_disassociate_ucontext.
986 */
987 mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
988 area->vm_private_data;
989
990 /* set the vma context pointer to null in the mlx4_ib driver's private
 991 * data to protect against a race condition in mlx4_ib_disassociate_ucontext().
992 */
993 mlx4_ib_vma_priv_data->vma = NULL;
994}
995
996static const struct vm_operations_struct mlx4_ib_vm_ops = {
997 .open = mlx4_ib_vma_open,
998 .close = mlx4_ib_vma_close
999};
1000
1001static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1002{
1003 int i;
1004 int ret = 0;
1005 struct vm_area_struct *vma;
1006 struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
1007 struct task_struct *owning_process = NULL;
1008 struct mm_struct *owning_mm = NULL;
1009
1010 owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
1011 if (!owning_process)
1012 return;
1013
1014 owning_mm = get_task_mm(owning_process);
1015 if (!owning_mm) {
1016 pr_info("no mm, disassociate ucontext is pending task termination\n");
1017 while (1) {
 1018 /* make sure that the task is dead before returning; this may prevent
 1019 * a rare case of the module going down in parallel with a call
 1020 * to mlx4_ib_vma_close.
1021 */
1022 put_task_struct(owning_process);
1023 msleep(1);
1024 owning_process = get_pid_task(ibcontext->tgid,
1025 PIDTYPE_PID);
1026 if (!owning_process ||
1027 owning_process->state == TASK_DEAD) {
1028 pr_info("disassociate ucontext done, task was terminated\n");
1029 /* in case task was dead need to release the task struct */
1030 if (owning_process)
1031 put_task_struct(owning_process);
1032 return;
1033 }
1034 }
1035 }
1036
1037 /* need to protect from a race on closing the vma as part of
1038 * mlx4_ib_vma_close().
1039 */
1040 down_read(&owning_mm->mmap_sem);
1041 for (i = 0; i < HW_BAR_COUNT; i++) {
1042 vma = context->hw_bar_info[i].vma;
1043 if (!vma)
1044 continue;
1045
1046 ret = zap_vma_ptes(context->hw_bar_info[i].vma,
1047 context->hw_bar_info[i].vma->vm_start,
1048 PAGE_SIZE);
1049 if (ret) {
1050 pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret);
1051 BUG_ON(1);
1052 }
1053
 1054 /* the context is about to be destroyed; do not access its vm_ops any more */
1055 context->hw_bar_info[i].vma->vm_ops = NULL;
1056 }
1057
1058 up_read(&owning_mm->mmap_sem);
1059 mmput(owning_mm);
1060 put_task_struct(owning_process);
1061}
1062
1063static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
1064 struct mlx4_ib_vma_private_data *vma_private_data)
1065{
1066 vma_private_data->vma = vma;
1067 vma->vm_private_data = vma_private_data;
1068 vma->vm_ops = &mlx4_ib_vm_ops;
1069}
1070
732static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) 1071static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
733{ 1072{
734 struct mlx4_ib_dev *dev = to_mdev(context->device); 1073 struct mlx4_ib_dev *dev = to_mdev(context->device);
1074 struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
735 1075
736 if (vma->vm_end - vma->vm_start != PAGE_SIZE) 1076 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
737 return -EINVAL; 1077 return -EINVAL;
738 1078
739 if (vma->vm_pgoff == 0) { 1079 if (vma->vm_pgoff == 0) {
 1080 /* We prevent double mmapping on the same context */
1081 if (mucontext->hw_bar_info[HW_BAR_DB].vma)
1082 return -EINVAL;
1083
740 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1084 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
741 1085
742 if (io_remap_pfn_range(vma, vma->vm_start, 1086 if (io_remap_pfn_range(vma, vma->vm_start,
743 to_mucontext(context)->uar.pfn, 1087 to_mucontext(context)->uar.pfn,
744 PAGE_SIZE, vma->vm_page_prot)) 1088 PAGE_SIZE, vma->vm_page_prot))
745 return -EAGAIN; 1089 return -EAGAIN;
1090
1091 mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
1092
746 } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { 1093 } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
 1094 /* We prevent double mmapping on the same context */
1095 if (mucontext->hw_bar_info[HW_BAR_BF].vma)
1096 return -EINVAL;
1097
747 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 1098 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
748 1099
749 if (io_remap_pfn_range(vma, vma->vm_start, 1100 if (io_remap_pfn_range(vma, vma->vm_start,
@@ -751,9 +1102,18 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
751 dev->dev->caps.num_uars, 1102 dev->dev->caps.num_uars,
752 PAGE_SIZE, vma->vm_page_prot)) 1103 PAGE_SIZE, vma->vm_page_prot))
753 return -EAGAIN; 1104 return -EAGAIN;
1105
1106 mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
1107
754 } else if (vma->vm_pgoff == 3) { 1108 } else if (vma->vm_pgoff == 3) {
755 struct mlx4_clock_params params; 1109 struct mlx4_clock_params params;
756 int ret = mlx4_get_internal_clock_params(dev->dev, &params); 1110 int ret;
1111
 1112 /* We prevent double mmapping on the same context */
1113 if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma)
1114 return -EINVAL;
1115
1116 ret = mlx4_get_internal_clock_params(dev->dev, &params);
757 1117
758 if (ret) 1118 if (ret)
759 return ret; 1119 return ret;
@@ -766,6 +1126,9 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
766 >> PAGE_SHIFT, 1126 >> PAGE_SHIFT,
767 PAGE_SIZE, vma->vm_page_prot)) 1127 PAGE_SIZE, vma->vm_page_prot))
768 return -EAGAIN; 1128 return -EAGAIN;
1129
1130 mlx4_ib_set_vma_data(vma,
1131 &mucontext->hw_bar_info[HW_BAR_CLOCK]);
769 } else { 1132 } else {
770 return -EINVAL; 1133 return -EINVAL;
771 } 1134 }
@@ -1547,272 +1910,6 @@ static struct device_attribute *mlx4_class_attributes[] = {
1547 &dev_attr_board_id 1910 &dev_attr_board_id
1548}; 1911};
1549 1912
1550static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
1551 struct net_device *dev)
1552{
1553 memcpy(eui, dev->dev_addr, 3);
1554 memcpy(eui + 5, dev->dev_addr + 3, 3);
1555 if (vlan_id < 0x1000) {
1556 eui[3] = vlan_id >> 8;
1557 eui[4] = vlan_id & 0xff;
1558 } else {
1559 eui[3] = 0xff;
1560 eui[4] = 0xfe;
1561 }
1562 eui[0] ^= 2;
1563}
1564
1565static void update_gids_task(struct work_struct *work)
1566{
1567 struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
1568 struct mlx4_cmd_mailbox *mailbox;
1569 union ib_gid *gids;
1570 int err;
1571 struct mlx4_dev *dev = gw->dev->dev;
1572 int is_bonded = mlx4_is_bonded(dev);
1573
1574 if (!gw->dev->ib_active)
1575 return;
1576
1577 mailbox = mlx4_alloc_cmd_mailbox(dev);
1578 if (IS_ERR(mailbox)) {
1579 pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
1580 return;
1581 }
1582
1583 gids = mailbox->buf;
1584 memcpy(gids, gw->gids, sizeof gw->gids);
1585
1586 err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
1587 MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
1588 MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
1589 if (err)
1590 pr_warn("set port command failed\n");
1591 else
1592 if ((gw->port == 1) || !is_bonded)
1593 mlx4_ib_dispatch_event(gw->dev,
1594 is_bonded ? 1 : gw->port,
1595 IB_EVENT_GID_CHANGE);
1596
1597 mlx4_free_cmd_mailbox(dev, mailbox);
1598 kfree(gw);
1599}
1600
1601static void reset_gids_task(struct work_struct *work)
1602{
1603 struct update_gid_work *gw =
1604 container_of(work, struct update_gid_work, work);
1605 struct mlx4_cmd_mailbox *mailbox;
1606 union ib_gid *gids;
1607 int err;
1608 struct mlx4_dev *dev = gw->dev->dev;
1609
1610 if (!gw->dev->ib_active)
1611 return;
1612
1613 mailbox = mlx4_alloc_cmd_mailbox(dev);
1614 if (IS_ERR(mailbox)) {
1615 pr_warn("reset gid table failed\n");
1616 goto free;
1617 }
1618
1619 gids = mailbox->buf;
1620 memcpy(gids, gw->gids, sizeof(gw->gids));
1621
1622 if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
1623 IB_LINK_LAYER_ETHERNET) {
1624 err = mlx4_cmd(dev, mailbox->dma,
1625 MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
1626 MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
1627 MLX4_CMD_TIME_CLASS_B,
1628 MLX4_CMD_WRAPPED);
1629 if (err)
1630 pr_warn("set port %d command failed\n", gw->port);
1631 }
1632
1633 mlx4_free_cmd_mailbox(dev, mailbox);
1634free:
1635 kfree(gw);
1636}
1637
1638static int update_gid_table(struct mlx4_ib_dev *dev, int port,
1639 union ib_gid *gid, int clear,
1640 int default_gid)
1641{
1642 struct update_gid_work *work;
1643 int i;
1644 int need_update = 0;
1645 int free = -1;
1646 int found = -1;
1647 int max_gids;
1648
1649 if (default_gid) {
1650 free = 0;
1651 } else {
1652 max_gids = dev->dev->caps.gid_table_len[port];
1653 for (i = 1; i < max_gids; ++i) {
1654 if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
1655 sizeof(*gid)))
1656 found = i;
1657
1658 if (clear) {
1659 if (found >= 0) {
1660 need_update = 1;
1661 dev->iboe.gid_table[port - 1][found] =
1662 zgid;
1663 break;
1664 }
1665 } else {
1666 if (found >= 0)
1667 break;
1668
1669 if (free < 0 &&
1670 !memcmp(&dev->iboe.gid_table[port - 1][i],
1671 &zgid, sizeof(*gid)))
1672 free = i;
1673 }
1674 }
1675 }
1676
1677 if (found == -1 && !clear && free >= 0) {
1678 dev->iboe.gid_table[port - 1][free] = *gid;
1679 need_update = 1;
1680 }
1681
1682 if (!need_update)
1683 return 0;
1684
1685 work = kzalloc(sizeof(*work), GFP_ATOMIC);
1686 if (!work)
1687 return -ENOMEM;
1688
1689 memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
1690 INIT_WORK(&work->work, update_gids_task);
1691 work->port = port;
1692 work->dev = dev;
1693 queue_work(wq, &work->work);
1694
1695 return 0;
1696}
1697
1698static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid)
1699{
1700 gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
1701 mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
1702}
1703
1704
1705static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
1706{
1707 struct update_gid_work *work;
1708
1709 work = kzalloc(sizeof(*work), GFP_ATOMIC);
1710 if (!work)
1711 return -ENOMEM;
1712
1713 memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
1714 memset(work->gids, 0, sizeof(work->gids));
1715 INIT_WORK(&work->work, reset_gids_task);
1716 work->dev = dev;
1717 work->port = port;
1718 queue_work(wq, &work->work);
1719 return 0;
1720}
1721
1722static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
1723 struct mlx4_ib_dev *ibdev, union ib_gid *gid)
1724{
1725 struct mlx4_ib_iboe *iboe;
1726 int port = 0;
1727 struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
1728 rdma_vlan_dev_real_dev(event_netdev) :
1729 event_netdev;
1730 union ib_gid default_gid;
1731
1732 mlx4_make_default_gid(real_dev, &default_gid);
1733
1734 if (!memcmp(gid, &default_gid, sizeof(*gid)))
1735 return 0;
1736
1737 if (event != NETDEV_DOWN && event != NETDEV_UP)
1738 return 0;
1739
1740 if ((real_dev != event_netdev) &&
1741 (event == NETDEV_DOWN) &&
1742 rdma_link_local_addr((struct in6_addr *)gid))
1743 return 0;
1744
1745 iboe = &ibdev->iboe;
1746 spin_lock_bh(&iboe->lock);
1747
1748 for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
1749 if ((netif_is_bond_master(real_dev) &&
1750 (real_dev == iboe->masters[port - 1])) ||
1751 (!netif_is_bond_master(real_dev) &&
1752 (real_dev == iboe->netdevs[port - 1])))
1753 update_gid_table(ibdev, port, gid,
1754 event == NETDEV_DOWN, 0);
1755
1756 spin_unlock_bh(&iboe->lock);
1757 return 0;
1758
1759}
1760
1761static u8 mlx4_ib_get_dev_port(struct net_device *dev,
1762 struct mlx4_ib_dev *ibdev)
1763{
1764 u8 port = 0;
1765 struct mlx4_ib_iboe *iboe;
1766 struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
1767 rdma_vlan_dev_real_dev(dev) : dev;
1768
1769 iboe = &ibdev->iboe;
1770
1771 for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
1772 if ((netif_is_bond_master(real_dev) &&
1773 (real_dev == iboe->masters[port - 1])) ||
1774 (!netif_is_bond_master(real_dev) &&
1775 (real_dev == iboe->netdevs[port - 1])))
1776 break;
1777
1778 if ((port == 0) || (port > ibdev->dev->caps.num_ports))
1779 return 0;
1780 else
1781 return port;
1782}
1783
1784static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
1785 void *ptr)
1786{
1787 struct mlx4_ib_dev *ibdev;
1788 struct in_ifaddr *ifa = ptr;
1789 union ib_gid gid;
1790 struct net_device *event_netdev = ifa->ifa_dev->dev;
1791
1792 ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
1793
1794 ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
1795
1796 mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
1797 return NOTIFY_DONE;
1798}
1799
1800#if IS_ENABLED(CONFIG_IPV6)
1801static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
1802 void *ptr)
1803{
1804 struct mlx4_ib_dev *ibdev;
1805 struct inet6_ifaddr *ifa = ptr;
1806 union ib_gid *gid = (union ib_gid *)&ifa->addr;
1807 struct net_device *event_netdev = ifa->idev->dev;
1808
1809 ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
1810
1811 mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
1812 return NOTIFY_DONE;
1813}
1814#endif
1815
1816#define MLX4_IB_INVALID_MAC ((u64)-1) 1913#define MLX4_IB_INVALID_MAC ((u64)-1)
1817static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, 1914static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
1818 struct net_device *dev, 1915 struct net_device *dev,
@@ -1871,94 +1968,6 @@ unlock:
1871 mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); 1968 mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
1872} 1969}
1873 1970
1874static void mlx4_ib_get_dev_addr(struct net_device *dev,
1875 struct mlx4_ib_dev *ibdev, u8 port)
1876{
1877 struct in_device *in_dev;
1878#if IS_ENABLED(CONFIG_IPV6)
1879 struct inet6_dev *in6_dev;
1880 union ib_gid *pgid;
1881 struct inet6_ifaddr *ifp;
1882 union ib_gid default_gid;
1883#endif
1884 union ib_gid gid;
1885
1886
1887 if ((port == 0) || (port > ibdev->dev->caps.num_ports))
1888 return;
1889
1890 /* IPv4 gids */
1891 in_dev = in_dev_get(dev);
1892 if (in_dev) {
1893 for_ifa(in_dev) {
1894 /*ifa->ifa_address;*/
1895 ipv6_addr_set_v4mapped(ifa->ifa_address,
1896 (struct in6_addr *)&gid);
1897 update_gid_table(ibdev, port, &gid, 0, 0);
1898 }
1899 endfor_ifa(in_dev);
1900 in_dev_put(in_dev);
1901 }
1902#if IS_ENABLED(CONFIG_IPV6)
1903 mlx4_make_default_gid(dev, &default_gid);
1904 /* IPv6 gids */
1905 in6_dev = in6_dev_get(dev);
1906 if (in6_dev) {
1907 read_lock_bh(&in6_dev->lock);
1908 list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
1909 pgid = (union ib_gid *)&ifp->addr;
1910 if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
1911 continue;
1912 update_gid_table(ibdev, port, pgid, 0, 0);
1913 }
1914 read_unlock_bh(&in6_dev->lock);
1915 in6_dev_put(in6_dev);
1916 }
1917#endif
1918}
1919
1920static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
1921 struct net_device *dev, u8 port)
1922{
1923 union ib_gid gid;
1924 mlx4_make_default_gid(dev, &gid);
1925 update_gid_table(ibdev, port, &gid, 0, 1);
1926}
1927
1928static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
1929{
1930 struct net_device *dev;
1931 struct mlx4_ib_iboe *iboe = &ibdev->iboe;
1932 int i;
1933 int err = 0;
1934
1935 for (i = 1; i <= ibdev->num_ports; ++i) {
1936 if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
1937 IB_LINK_LAYER_ETHERNET) {
1938 err = reset_gid_table(ibdev, i);
1939 if (err)
1940 goto out;
1941 }
1942 }
1943
1944 read_lock(&dev_base_lock);
1945 spin_lock_bh(&iboe->lock);
1946
1947 for_each_netdev(&init_net, dev) {
1948 u8 port = mlx4_ib_get_dev_port(dev, ibdev);
1949 /* port will be non-zero only for ETH ports */
1950 if (port) {
1951 mlx4_ib_set_default_gid(ibdev, dev, port);
1952 mlx4_ib_get_dev_addr(dev, ibdev, port);
1953 }
1954 }
1955
1956 spin_unlock_bh(&iboe->lock);
1957 read_unlock(&dev_base_lock);
1958out:
1959 return err;
1960}
1961
1962static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, 1971static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
1963 struct net_device *dev, 1972 struct net_device *dev,
1964 unsigned long event) 1973 unsigned long event)
@@ -1968,81 +1977,22 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
1968 int update_qps_port = -1; 1977 int update_qps_port = -1;
1969 int port; 1978 int port;
1970 1979
1980 ASSERT_RTNL();
1981
1971 iboe = &ibdev->iboe; 1982 iboe = &ibdev->iboe;
1972 1983
1973 spin_lock_bh(&iboe->lock); 1984 spin_lock_bh(&iboe->lock);
1974 mlx4_foreach_ib_transport_port(port, ibdev->dev) { 1985 mlx4_foreach_ib_transport_port(port, ibdev->dev) {
1975 enum ib_port_state port_state = IB_PORT_NOP;
1976 struct net_device *old_master = iboe->masters[port - 1];
1977 struct net_device *curr_netdev;
1978 struct net_device *curr_master;
1979 1986
1980 iboe->netdevs[port - 1] = 1987 iboe->netdevs[port - 1] =
1981 mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); 1988 mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
1982 if (iboe->netdevs[port - 1])
1983 mlx4_ib_set_default_gid(ibdev,
1984 iboe->netdevs[port - 1], port);
1985 curr_netdev = iboe->netdevs[port - 1];
1986
1987 if (iboe->netdevs[port - 1] &&
1988 netif_is_bond_slave(iboe->netdevs[port - 1])) {
1989 iboe->masters[port - 1] = netdev_master_upper_dev_get(
1990 iboe->netdevs[port - 1]);
1991 } else {
1992 iboe->masters[port - 1] = NULL;
1993 }
1994 curr_master = iboe->masters[port - 1];
1995 1989
1996 if (dev == iboe->netdevs[port - 1] && 1990 if (dev == iboe->netdevs[port - 1] &&
1997 (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || 1991 (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
1998 event == NETDEV_UP || event == NETDEV_CHANGE)) 1992 event == NETDEV_UP || event == NETDEV_CHANGE))
1999 update_qps_port = port; 1993 update_qps_port = port;
2000 1994
2001 if (curr_netdev) {
2002 port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
2003 IB_PORT_ACTIVE : IB_PORT_DOWN;
2004 mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
2005 if (curr_master) {
2006 /* if using bonding/team and a slave port is down, we
2007 * don't want the bond IP based gids in the table since
2008 * flows that select port by gid may get the down port.
2009 */
2010 if (port_state == IB_PORT_DOWN &&
2011 !mlx4_is_bonded(ibdev->dev)) {
2012 reset_gid_table(ibdev, port);
2013 mlx4_ib_set_default_gid(ibdev,
2014 curr_netdev,
2015 port);
2016 } else {
2017 /* gids from the upper dev (bond/team)
2018 * should appear in port's gid table
2019 */
2020 mlx4_ib_get_dev_addr(curr_master,
2021 ibdev, port);
2022 }
2023 }
2024 /* if bonding is used it is possible that we add it to
2025 * masters only after IP address is assigned to the
2026 * net bonding interface.
2027 */
2028 if (curr_master && (old_master != curr_master)) {
2029 reset_gid_table(ibdev, port);
2030 mlx4_ib_set_default_gid(ibdev,
2031 curr_netdev, port);
2032 mlx4_ib_get_dev_addr(curr_master, ibdev, port);
2033 }
2034
2035 if (!curr_master && (old_master != curr_master)) {
2036 reset_gid_table(ibdev, port);
2037 mlx4_ib_set_default_gid(ibdev,
2038 curr_netdev, port);
2039 mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
2040 }
2041 } else {
2042 reset_gid_table(ibdev, port);
2043 }
2044 } 1995 }
2045
2046 spin_unlock_bh(&iboe->lock); 1996 spin_unlock_bh(&iboe->lock);
2047 1997
2048 if (update_qps_port > 0) 1998 if (update_qps_port > 0)
@@ -2225,6 +2175,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
2225 1 : ibdev->num_ports; 2175 1 : ibdev->num_ports;
2226 ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; 2176 ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
2227 ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; 2177 ibdev->ib_dev.dma_device = &dev->persist->pdev->dev;
2178 ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev;
2179 ibdev->ib_dev.add_gid = mlx4_ib_add_gid;
2180 ibdev->ib_dev.del_gid = mlx4_ib_del_gid;
2228 2181
2229 if (dev->caps.userspace_caps) 2182 if (dev->caps.userspace_caps)
2230 ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; 2183 ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
@@ -2293,13 +2246,14 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
2293 ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; 2246 ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
2294 ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; 2247 ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr;
2295 ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; 2248 ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
2296 ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; 2249 ibdev->ib_dev.alloc_mr = mlx4_ib_alloc_mr;
2297 ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; 2250 ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
2298 ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; 2251 ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list;
2299 ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; 2252 ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
2300 ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; 2253 ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
2301 ibdev->ib_dev.process_mad = mlx4_ib_process_mad; 2254 ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
2302 ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; 2255 ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
2256 ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
2303 2257
2304 if (!mlx4_is_slave(ibdev->dev)) { 2258 if (!mlx4_is_slave(ibdev->dev)) {
2305 ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; 2259 ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
@@ -2435,26 +2389,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
2435 goto err_notif; 2389 goto err_notif;
2436 } 2390 }
2437 } 2391 }
2438 if (!iboe->nb_inet.notifier_call) {
2439 iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
2440 err = register_inetaddr_notifier(&iboe->nb_inet);
2441 if (err) {
2442 iboe->nb_inet.notifier_call = NULL;
2443 goto err_notif;
2444 }
2445 }
2446#if IS_ENABLED(CONFIG_IPV6)
2447 if (!iboe->nb_inet6.notifier_call) {
2448 iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
2449 err = register_inet6addr_notifier(&iboe->nb_inet6);
2450 if (err) {
2451 iboe->nb_inet6.notifier_call = NULL;
2452 goto err_notif;
2453 }
2454 }
2455#endif
2456 if (mlx4_ib_init_gid_table(ibdev))
2457 goto err_notif;
2458 } 2392 }
2459 2393
2460 for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { 2394 for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
@@ -2485,18 +2419,6 @@ err_notif:
2485 pr_warn("failure unregistering notifier\n"); 2419 pr_warn("failure unregistering notifier\n");
2486 ibdev->iboe.nb.notifier_call = NULL; 2420 ibdev->iboe.nb.notifier_call = NULL;
2487 } 2421 }
2488 if (ibdev->iboe.nb_inet.notifier_call) {
2489 if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
2490 pr_warn("failure unregistering notifier\n");
2491 ibdev->iboe.nb_inet.notifier_call = NULL;
2492 }
2493#if IS_ENABLED(CONFIG_IPV6)
2494 if (ibdev->iboe.nb_inet6.notifier_call) {
2495 if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
2496 pr_warn("failure unregistering notifier\n");
2497 ibdev->iboe.nb_inet6.notifier_call = NULL;
2498 }
2499#endif
2500 flush_workqueue(wq); 2422 flush_workqueue(wq);
2501 2423
2502 mlx4_ib_close_sriov(ibdev); 2424 mlx4_ib_close_sriov(ibdev);
@@ -2622,19 +2544,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
2622 kfree(ibdev->ib_uc_qpns_bitmap); 2544 kfree(ibdev->ib_uc_qpns_bitmap);
2623 } 2545 }
2624 2546
2625 if (ibdev->iboe.nb_inet.notifier_call) {
2626 if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
2627 pr_warn("failure unregistering notifier\n");
2628 ibdev->iboe.nb_inet.notifier_call = NULL;
2629 }
2630#if IS_ENABLED(CONFIG_IPV6)
2631 if (ibdev->iboe.nb_inet6.notifier_call) {
2632 if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
2633 pr_warn("failure unregistering notifier\n");
2634 ibdev->iboe.nb_inet6.notifier_call = NULL;
2635 }
2636#endif
2637
2638 iounmap(ibdev->uar_map); 2547 iounmap(ibdev->uar_map);
2639 for (p = 0; p < ibdev->num_ports; ++p) 2548 for (p = 0; p < ibdev->num_ports; ++p)
2640 if (ibdev->counters[p].index != -1 && 2549 if (ibdev->counters[p].index != -1 &&
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index ed327e6c8fdc..2d5bccd71fc6 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -51,6 +51,10 @@
51 pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ 51 pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
52 (group)->name, group->demux->port, ## arg) 52 (group)->name, group->demux->port, ## arg)
53 53
54#define mcg_debug_group(group, format, arg...) \
55 pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
56 (group)->name, (group)->demux->port, ## arg)
57
54#define mcg_error_group(group, format, arg...) \ 58#define mcg_error_group(group, format, arg...) \
55 pr_err(" %16s: " format, (group)->name, ## arg) 59 pr_err(" %16s: " format, (group)->name, ## arg)
56 60
@@ -206,15 +210,16 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
206{ 210{
207 struct mlx4_ib_dev *dev = ctx->dev; 211 struct mlx4_ib_dev *dev = ctx->dev;
208 struct ib_ah_attr ah_attr; 212 struct ib_ah_attr ah_attr;
213 unsigned long flags;
209 214
210 spin_lock(&dev->sm_lock); 215 spin_lock_irqsave(&dev->sm_lock, flags);
211 if (!dev->sm_ah[ctx->port - 1]) { 216 if (!dev->sm_ah[ctx->port - 1]) {
212 /* port is not yet Active, sm_ah not ready */ 217 /* port is not yet Active, sm_ah not ready */
213 spin_unlock(&dev->sm_lock); 218 spin_unlock_irqrestore(&dev->sm_lock, flags);
214 return -EAGAIN; 219 return -EAGAIN;
215 } 220 }
216 mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); 221 mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
217 spin_unlock(&dev->sm_lock); 222 spin_unlock_irqrestore(&dev->sm_lock, flags);
218 return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), 223 return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
219 ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, 224 ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
220 &ah_attr, NULL, mad); 225 &ah_attr, NULL, mad);
@@ -961,8 +966,8 @@ int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
961 mutex_lock(&group->lock); 966 mutex_lock(&group->lock);
962 if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { 967 if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
963 mutex_unlock(&group->lock); 968 mutex_unlock(&group->lock);
964 mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", 969 mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
965 port, slave, MAX_PEND_REQS_PER_FUNC); 970 port, slave, MAX_PEND_REQS_PER_FUNC);
966 release_group(group, 0); 971 release_group(group, 0);
967 kfree(req); 972 kfree(req);
968 return -ENOMEM; 973 return -ENOMEM;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 334387f63358..1e7b23bb2eb0 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -70,11 +70,24 @@ extern int mlx4_ib_sm_guid_assign;
70 70
71#define MLX4_IB_UC_STEER_QPN_ALIGN 1 71#define MLX4_IB_UC_STEER_QPN_ALIGN 1
72#define MLX4_IB_UC_MAX_NUM_QPS 256 72#define MLX4_IB_UC_MAX_NUM_QPS 256
73
74enum hw_bar_type {
75 HW_BAR_BF,
76 HW_BAR_DB,
77 HW_BAR_CLOCK,
78 HW_BAR_COUNT
79};
80
81struct mlx4_ib_vma_private_data {
82 struct vm_area_struct *vma;
83};
84
73struct mlx4_ib_ucontext { 85struct mlx4_ib_ucontext {
74 struct ib_ucontext ibucontext; 86 struct ib_ucontext ibucontext;
75 struct mlx4_uar uar; 87 struct mlx4_uar uar;
76 struct list_head db_page_list; 88 struct list_head db_page_list;
77 struct mutex db_page_mutex; 89 struct mutex db_page_mutex;
90 struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
78}; 91};
79 92
80struct mlx4_ib_pd { 93struct mlx4_ib_pd {
@@ -415,7 +428,6 @@ struct mlx4_ib_demux_pv_ctx {
415 struct ib_device *ib_dev; 428 struct ib_device *ib_dev;
416 struct ib_cq *cq; 429 struct ib_cq *cq;
417 struct ib_pd *pd; 430 struct ib_pd *pd;
418 struct ib_mr *mr;
419 struct work_struct work; 431 struct work_struct work;
420 struct workqueue_struct *wq; 432 struct workqueue_struct *wq;
421 struct mlx4_ib_demux_pv_qp qp[2]; 433 struct mlx4_ib_demux_pv_qp qp[2];
@@ -457,15 +469,26 @@ struct mlx4_ib_sriov {
457 struct idr pv_id_table; 469 struct idr pv_id_table;
458}; 470};
459 471
472struct gid_cache_context {
473 int real_index;
474 int refcount;
475};
476
477struct gid_entry {
478 union ib_gid gid;
479 struct gid_cache_context *ctx;
480};
481
482struct mlx4_port_gid_table {
483 struct gid_entry gids[MLX4_MAX_PORT_GIDS];
484};
485
460struct mlx4_ib_iboe { 486struct mlx4_ib_iboe {
461 spinlock_t lock; 487 spinlock_t lock;
462 struct net_device *netdevs[MLX4_MAX_PORTS]; 488 struct net_device *netdevs[MLX4_MAX_PORTS];
463 struct net_device *masters[MLX4_MAX_PORTS];
464 atomic64_t mac[MLX4_MAX_PORTS]; 489 atomic64_t mac[MLX4_MAX_PORTS];
465 struct notifier_block nb; 490 struct notifier_block nb;
466 struct notifier_block nb_inet; 491 struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
467 struct notifier_block nb_inet6;
468 union ib_gid gid_table[MLX4_MAX_PORTS][128];
469}; 492};
470 493
471struct pkey_mgt { 494struct pkey_mgt {
@@ -680,8 +703,9 @@ struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
680int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, 703int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
681 struct ib_mw_bind *mw_bind); 704 struct ib_mw_bind *mw_bind);
682int mlx4_ib_dealloc_mw(struct ib_mw *mw); 705int mlx4_ib_dealloc_mw(struct ib_mw *mw);
683struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, 706struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
684 int max_page_list_len); 707 enum ib_mr_type mr_type,
708 u32 max_num_sg);
685struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, 709struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
686 int page_list_len); 710 int page_list_len);
687void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); 711void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
@@ -838,5 +862,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
838 u64 start, u64 length, u64 virt_addr, 862 u64 start, u64 length, u64 virt_addr,
839 int mr_access_flags, struct ib_pd *pd, 863 int mr_access_flags, struct ib_pd *pd,
840 struct ib_udata *udata); 864 struct ib_udata *udata);
865int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
866 u8 port_num, int index);
841 867
842#endif /* MLX4_IB_H */ 868#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index e0d271782d0a..2542fd3c1a49 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -350,19 +350,24 @@ int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
350 return 0; 350 return 0;
351} 351}
352 352
353struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, 353struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
354 int max_page_list_len) 354 enum ib_mr_type mr_type,
355 u32 max_num_sg)
355{ 356{
356 struct mlx4_ib_dev *dev = to_mdev(pd->device); 357 struct mlx4_ib_dev *dev = to_mdev(pd->device);
357 struct mlx4_ib_mr *mr; 358 struct mlx4_ib_mr *mr;
358 int err; 359 int err;
359 360
361 if (mr_type != IB_MR_TYPE_MEM_REG ||
362 max_num_sg > MLX4_MAX_FAST_REG_PAGES)
363 return ERR_PTR(-EINVAL);
364
360 mr = kmalloc(sizeof *mr, GFP_KERNEL); 365 mr = kmalloc(sizeof *mr, GFP_KERNEL);
361 if (!mr) 366 if (!mr)
362 return ERR_PTR(-ENOMEM); 367 return ERR_PTR(-ENOMEM);
363 368
364 err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, 369 err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
365 max_page_list_len, 0, &mr->mmr); 370 max_num_sg, 0, &mr->mmr);
366 if (err) 371 if (err)
367 goto err_free; 372 goto err_free;
368 373
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index c5a3a5f0de41..4ad9be3ad61c 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1292,14 +1292,18 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
1292 path->static_rate = 0; 1292 path->static_rate = 0;
1293 1293
1294 if (ah->ah_flags & IB_AH_GRH) { 1294 if (ah->ah_flags & IB_AH_GRH) {
1295 if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { 1295 int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev,
1296 port,
1297 ah->grh.sgid_index);
1298
1299 if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
1296 pr_err("sgid_index (%u) too large. max is %d\n", 1300 pr_err("sgid_index (%u) too large. max is %d\n",
1297 ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); 1301 real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
1298 return -1; 1302 return -1;
1299 } 1303 }
1300 1304
1301 path->grh_mylmc |= 1 << 7; 1305 path->grh_mylmc |= 1 << 7;
1302 path->mgid_index = ah->grh.sgid_index; 1306 path->mgid_index = real_sgid_index;
1303 path->hop_limit = ah->grh.hop_limit; 1307 path->hop_limit = ah->grh.hop_limit;
1304 path->tclass_flowlabel = 1308 path->tclass_flowlabel =
1305 cpu_to_be32((ah->grh.traffic_class << 20) | 1309 cpu_to_be32((ah->grh.traffic_class << 20) |
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
index 6797108ce873..69fb5ba94d0f 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -640,6 +640,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
640 struct mlx4_port *p; 640 struct mlx4_port *p;
641 int i; 641 int i;
642 int ret; 642 int ret;
643 int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
644 IB_LINK_LAYER_ETHERNET;
643 645
644 p = kzalloc(sizeof *p, GFP_KERNEL); 646 p = kzalloc(sizeof *p, GFP_KERNEL);
645 if (!p) 647 if (!p)
@@ -657,7 +659,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
657 659
658 p->pkey_group.name = "pkey_idx"; 660 p->pkey_group.name = "pkey_idx";
659 p->pkey_group.attrs = 661 p->pkey_group.attrs =
660 alloc_group_attrs(show_port_pkey, store_port_pkey, 662 alloc_group_attrs(show_port_pkey,
663 is_eth ? NULL : store_port_pkey,
661 dev->dev->caps.pkey_table_len[port_num]); 664 dev->dev->caps.pkey_table_len[port_num]);
662 if (!p->pkey_group.attrs) { 665 if (!p->pkey_group.attrs) {
663 ret = -ENOMEM; 666 ret = -ENOMEM;
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 5c9eeea62805..2d0dbbf38ceb 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -33,6 +33,7 @@
33#include <linux/kref.h> 33#include <linux/kref.h>
34#include <rdma/ib_umem.h> 34#include <rdma/ib_umem.h>
35#include <rdma/ib_user_verbs.h> 35#include <rdma/ib_user_verbs.h>
36#include <rdma/ib_cache.h>
36#include "mlx5_ib.h" 37#include "mlx5_ib.h"
37#include "user.h" 38#include "user.h"
38 39
@@ -227,7 +228,14 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
227 wc->dlid_path_bits = cqe->ml_path; 228 wc->dlid_path_bits = cqe->ml_path;
228 g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; 229 g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
229 wc->wc_flags |= g ? IB_WC_GRH : 0; 230 wc->wc_flags |= g ? IB_WC_GRH : 0;
230 wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; 231 if (unlikely(is_qp1(qp->ibqp.qp_type))) {
232 u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
233
234 ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey,
235 &wc->pkey_index);
236 } else {
237 wc->pkey_index = 0;
238 }
231} 239}
232 240
233static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) 241static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
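The handle_responder() change above stops copying the wire P_Key of GSI (QP1) completions straight into wc->pkey_index and instead resolves it to an index in the local P_Key table. A minimal sketch of that lookup (illustrative helper name; the fallback to index 0 on a failed lookup is a sketch choice, the driver hunk ignores the return value):

#include <rdma/ib_cache.h>
#include <rdma/ib_verbs.h>

/* Sketch: map a received P_Key value to the local table index for QP1 traffic. */
static void set_gsi_pkey_index_sketch(struct ib_device *ibdev, u8 port,
				      u16 wire_pkey, struct ib_wc *wc)
{
	if (ib_find_cached_pkey(ibdev, port, wire_pkey, &wc->pkey_index))
		wc->pkey_index = 0;	/* not found: fall back to index 0 */
}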
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 085c24b4b603..41d6911e244e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -212,6 +212,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
212 int err = -ENOMEM; 212 int err = -ENOMEM;
213 int max_rq_sg; 213 int max_rq_sg;
214 int max_sq_sg; 214 int max_sq_sg;
215 u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
215 216
216 if (uhw->inlen || uhw->outlen) 217 if (uhw->inlen || uhw->outlen)
217 return -EINVAL; 218 return -EINVAL;
@@ -264,7 +265,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
264 props->hw_ver = mdev->pdev->revision; 265 props->hw_ver = mdev->pdev->revision;
265 266
266 props->max_mr_size = ~0ull; 267 props->max_mr_size = ~0ull;
267 props->page_size_cap = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); 268 props->page_size_cap = ~(min_page_size - 1);
268 props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); 269 props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
269 props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); 270 props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
270 max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / 271 max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
@@ -273,6 +274,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
273 sizeof(struct mlx5_wqe_ctrl_seg)) / 274 sizeof(struct mlx5_wqe_ctrl_seg)) /
274 sizeof(struct mlx5_wqe_data_seg); 275 sizeof(struct mlx5_wqe_data_seg);
275 props->max_sge = min(max_rq_sg, max_sq_sg); 276 props->max_sge = min(max_rq_sg, max_sq_sg);
277 props->max_sge_rd = props->max_sge;
276 props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); 278 props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
277 props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1; 279 props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
278 props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); 280 props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
@@ -1121,7 +1123,6 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1121 1123
1122 mlx5_ib_destroy_qp(dev->umrc.qp); 1124 mlx5_ib_destroy_qp(dev->umrc.qp);
1123 ib_destroy_cq(dev->umrc.cq); 1125 ib_destroy_cq(dev->umrc.cq);
1124 ib_dereg_mr(dev->umrc.mr);
1125 ib_dealloc_pd(dev->umrc.pd); 1126 ib_dealloc_pd(dev->umrc.pd);
1126} 1127}
1127 1128
@@ -1136,7 +1137,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
1136 struct ib_pd *pd; 1137 struct ib_pd *pd;
1137 struct ib_cq *cq; 1138 struct ib_cq *cq;
1138 struct ib_qp *qp; 1139 struct ib_qp *qp;
1139 struct ib_mr *mr;
1140 struct ib_cq_init_attr cq_attr = {}; 1140 struct ib_cq_init_attr cq_attr = {};
1141 int ret; 1141 int ret;
1142 1142
@@ -1154,13 +1154,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
1154 goto error_0; 1154 goto error_0;
1155 } 1155 }
1156 1156
1157 mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
1158 if (IS_ERR(mr)) {
1159 mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
1160 ret = PTR_ERR(mr);
1161 goto error_1;
1162 }
1163
1164 cq_attr.cqe = 128; 1157 cq_attr.cqe = 128;
1165 cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 1158 cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL,
1166 &cq_attr); 1159 &cq_attr);
@@ -1218,7 +1211,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
1218 1211
1219 dev->umrc.qp = qp; 1212 dev->umrc.qp = qp;
1220 dev->umrc.cq = cq; 1213 dev->umrc.cq = cq;
1221 dev->umrc.mr = mr;
1222 dev->umrc.pd = pd; 1214 dev->umrc.pd = pd;
1223 1215
1224 sema_init(&dev->umrc.sem, MAX_UMR_WR); 1216 sema_init(&dev->umrc.sem, MAX_UMR_WR);
@@ -1240,9 +1232,6 @@ error_3:
1240 ib_destroy_cq(cq); 1232 ib_destroy_cq(cq);
1241 1233
1242error_2: 1234error_2:
1243 ib_dereg_mr(mr);
1244
1245error_1:
1246 ib_dealloc_pd(pd); 1235 ib_dealloc_pd(pd);
1247 1236
1248error_0: 1237error_0:
@@ -1256,10 +1245,18 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
1256 struct ib_srq_init_attr attr; 1245 struct ib_srq_init_attr attr;
1257 struct mlx5_ib_dev *dev; 1246 struct mlx5_ib_dev *dev;
1258 struct ib_cq_init_attr cq_attr = {.cqe = 1}; 1247 struct ib_cq_init_attr cq_attr = {.cqe = 1};
1248 u32 rsvd_lkey;
1259 int ret = 0; 1249 int ret = 0;
1260 1250
1261 dev = container_of(devr, struct mlx5_ib_dev, devr); 1251 dev = container_of(devr, struct mlx5_ib_dev, devr);
1262 1252
1253 ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey);
1254 if (ret) {
1255 pr_err("Failed to query special context %d\n", ret);
1256 return ret;
1257 }
1258 dev->ib_dev.local_dma_lkey = rsvd_lkey;
1259
1263 devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); 1260 devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
1264 if (IS_ERR(devr->p0)) { 1261 if (IS_ERR(devr->p0)) {
1265 ret = PTR_ERR(devr->p0); 1262 ret = PTR_ERR(devr->p0);
@@ -1421,7 +1418,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
1421 strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); 1418 strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
1422 dev->ib_dev.owner = THIS_MODULE; 1419 dev->ib_dev.owner = THIS_MODULE;
1423 dev->ib_dev.node_type = RDMA_NODE_IB_CA; 1420 dev->ib_dev.node_type = RDMA_NODE_IB_CA;
1424 dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
1425 dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); 1421 dev->num_ports = MLX5_CAP_GEN(mdev, num_ports);
1426 dev->ib_dev.phys_port_cnt = dev->num_ports; 1422 dev->ib_dev.phys_port_cnt = dev->num_ports;
1427 dev->ib_dev.num_comp_vectors = 1423 dev->ib_dev.num_comp_vectors =
@@ -1490,12 +1486,10 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
1490 dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; 1486 dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr;
1491 dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; 1487 dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr;
1492 dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; 1488 dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr;
1493 dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr;
1494 dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; 1489 dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach;
1495 dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; 1490 dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach;
1496 dev->ib_dev.process_mad = mlx5_ib_process_mad; 1491 dev->ib_dev.process_mad = mlx5_ib_process_mad;
1497 dev->ib_dev.create_mr = mlx5_ib_create_mr;
1492 dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr;
1498 dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr;
1499 dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; 1493 dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
1500 dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; 1494 dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list;
1501 dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; 1495 dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
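Two of the changes above are easy to miss: page_size_cap now advertises every power-of-two page size at or above the device minimum rather than only the minimum, and the UMR control path drops its private ib_get_dma_mr() now that the firmware's reserved lkey is exported as the device local_dma_lkey (via mlx5_core_query_special_context()). A sketch of the new capability encoding, assuming only what the hunk shows; log_pg_sz stands in for MLX5_CAP_GEN(mdev, log_pg_sz):

#include <linux/types.h>

/* Every power-of-two page size >= the device minimum is supported, so
 * the capability mask has all bits set from the minimum size upward. */
static u64 page_size_cap_from_log(int log_pg_sz)
{
	u64 min_page_size = 1ULL << log_pg_sz;

	return ~(min_page_size - 1);	/* log_pg_sz == 12 -> 0xfffffffffffff000 */
}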
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7cae09836481..bb8cda79e881 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -349,7 +349,6 @@ struct umr_common {
349 struct ib_pd *pd; 349 struct ib_pd *pd;
350 struct ib_cq *cq; 350 struct ib_cq *cq;
351 struct ib_qp *qp; 351 struct ib_qp *qp;
352 struct ib_mr *mr;
353 /* control access to UMR QP 352 /* control access to UMR QP
354 */ 353 */
355 struct semaphore sem; 354 struct semaphore sem;
@@ -573,11 +572,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
573int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, 572int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
574 int npages, int zap); 573 int npages, int zap);
575int mlx5_ib_dereg_mr(struct ib_mr *ibmr); 574int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
576int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
577struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
578 struct ib_mr_init_attr *mr_init_attr);
575struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
576 enum ib_mr_type mr_type,
577 u32 max_num_sg);
579struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
580 int max_page_list_len);
581struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, 578struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
582 int page_list_len); 579 int page_list_len);
583void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); 580void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
@@ -683,6 +680,11 @@ static inline u8 convert_access(int acc)
683 MLX5_PERM_LOCAL_READ; 680 MLX5_PERM_LOCAL_READ;
684} 681}
685 682
683static inline int is_qp1(enum ib_qp_type qp_type)
684{
685 return qp_type == IB_QPT_GSI;
686}
687
686#define MLX5_MAX_UMR_SHIFT 16 688#define MLX5_MAX_UMR_SHIFT 16
687#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) 689#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
688 690
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index bc9a0de897cb..54a15b5d336d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -441,9 +441,6 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
441 spin_unlock_irq(&ent->lock); 441 spin_unlock_irq(&ent->lock);
442 442
443 queue_work(cache->wq, &ent->work); 443 queue_work(cache->wq, &ent->work);
444
445 if (mr)
446 break;
447 } 444 }
448 445
449 if (!mr) 446 if (!mr)
@@ -690,12 +687,11 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
690 int access_flags) 687 int access_flags)
691{ 688{
692 struct mlx5_ib_dev *dev = to_mdev(pd->device); 689 struct mlx5_ib_dev *dev = to_mdev(pd->device);
693 struct ib_mr *mr = dev->umrc.mr;
694 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; 690 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
695 691
696 sg->addr = dma; 692 sg->addr = dma;
697 sg->length = ALIGN(sizeof(u64) * n, 64); 693 sg->length = ALIGN(sizeof(u64) * n, 64);
698 sg->lkey = mr->lkey;
694 sg->lkey = dev->umrc.pd->local_dma_lkey;
699 695
700 wr->next = NULL; 696 wr->next = NULL;
701 wr->send_flags = 0; 697 wr->send_flags = 0;
@@ -926,7 +922,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
926 sg.addr = dma; 922 sg.addr = dma;
927 sg.length = ALIGN(npages * sizeof(u64), 923 sg.length = ALIGN(npages * sizeof(u64),
928 MLX5_UMR_MTT_ALIGNMENT); 924 MLX5_UMR_MTT_ALIGNMENT);
929 sg.lkey = dev->umrc.mr->lkey;
925 sg.lkey = dev->umrc.pd->local_dma_lkey;
930 926
931 wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | 927 wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
932 MLX5_IB_SEND_UMR_UPDATE_MTT; 928 MLX5_IB_SEND_UMR_UPDATE_MTT;
@@ -1118,19 +1114,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1118 return &mr->ibmr; 1114 return &mr->ibmr;
1119 1115
1120error: 1116error:
1121 /*
1122 * Destroy the umem *before* destroying the MR, to ensure we
1123 * will not have any in-flight notifiers when destroying the
1124 * MR.
1125 *
1126 * As the MR is completely invalid to begin with, and this
1127 * error path is only taken if we can't push the mr entry into
1128 * the pagefault tree, this is safe.
1129 */
1130
1131 ib_umem_release(umem); 1117 ib_umem_release(umem);
1132 /* Kill the MR, and return an error code. */
1133 clean_mr(mr);
1134 return ERR_PTR(err); 1118 return ERR_PTR(err);
1135} 1119}
1136 1120
@@ -1173,6 +1157,19 @@ static int clean_mr(struct mlx5_ib_mr *mr)
1173 int umred = mr->umred; 1157 int umred = mr->umred;
1174 int err; 1158 int err;
1175 1159
1160 if (mr->sig) {
1161 if (mlx5_core_destroy_psv(dev->mdev,
1162 mr->sig->psv_memory.psv_idx))
1163 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1164 mr->sig->psv_memory.psv_idx);
1165 if (mlx5_core_destroy_psv(dev->mdev,
1166 mr->sig->psv_wire.psv_idx))
1167 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1168 mr->sig->psv_wire.psv_idx);
1169 kfree(mr->sig);
1170 mr->sig = NULL;
1171 }
1172
1176 if (!umred) { 1173 if (!umred) {
1177 err = destroy_mkey(dev, mr); 1174 err = destroy_mkey(dev, mr);
1178 if (err) { 1175 if (err) {
@@ -1234,14 +1231,15 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1234 return 0; 1231 return 0;
1235} 1232}
1236 1233
1237struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
1238 struct ib_mr_init_attr *mr_init_attr)
1234struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1235 enum ib_mr_type mr_type,
1236 u32 max_num_sg)
1239{ 1237{
1240 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1238 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1241 struct mlx5_create_mkey_mbox_in *in; 1239 struct mlx5_create_mkey_mbox_in *in;
1242 struct mlx5_ib_mr *mr; 1240 struct mlx5_ib_mr *mr;
1243 int access_mode, err; 1241 int access_mode, err;
1244 int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4);
1242 int ndescs = roundup(max_num_sg, 4);
1245 1243
1246 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1244 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1247 if (!mr) 1245 if (!mr)
@@ -1257,9 +1255,11 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
1257 in->seg.xlt_oct_size = cpu_to_be32(ndescs); 1255 in->seg.xlt_oct_size = cpu_to_be32(ndescs);
1258 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); 1256 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1259 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); 1257 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
1260 access_mode = MLX5_ACCESS_MODE_MTT;
1261 1258
1262 if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) {
1259 if (mr_type == IB_MR_TYPE_MEM_REG) {
1260 access_mode = MLX5_ACCESS_MODE_MTT;
1261 in->seg.log2_page_size = PAGE_SHIFT;
1262 } else if (mr_type == IB_MR_TYPE_SIGNATURE) {
1263 u32 psv_index[2]; 1263 u32 psv_index[2];
1264 1264
1265 in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | 1265 in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
@@ -1285,6 +1285,10 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
1285 mr->sig->sig_err_exists = false; 1285 mr->sig->sig_err_exists = false;
1286 /* Next UMR, Arm SIGERR */ 1286 /* Next UMR, Arm SIGERR */
1287 ++mr->sig->sigerr_count; 1287 ++mr->sig->sigerr_count;
1288 } else {
1289 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1290 err = -EINVAL;
1291 goto err_free_in;
1288 } 1292 }
1289 1293
1290 in->seg.flags = MLX5_PERM_UMR_EN | access_mode; 1294 in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
@@ -1320,80 +1324,6 @@ err_free:
1320 return ERR_PTR(err); 1324 return ERR_PTR(err);
1321} 1325}
1322 1326
1323int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
1324{
1325 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1326 struct mlx5_ib_mr *mr = to_mmr(ibmr);
1327 int err;
1328
1329 if (mr->sig) {
1330 if (mlx5_core_destroy_psv(dev->mdev,
1331 mr->sig->psv_memory.psv_idx))
1332 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1333 mr->sig->psv_memory.psv_idx);
1334 if (mlx5_core_destroy_psv(dev->mdev,
1335 mr->sig->psv_wire.psv_idx))
1336 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1337 mr->sig->psv_wire.psv_idx);
1338 kfree(mr->sig);
1339 }
1340
1341 err = destroy_mkey(dev, mr);
1342 if (err) {
1343 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1344 mr->mmr.key, err);
1345 return err;
1346 }
1347
1348 kfree(mr);
1349
1350 return err;
1351}
1352
1353struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
1354 int max_page_list_len)
1355{
1356 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1357 struct mlx5_create_mkey_mbox_in *in;
1358 struct mlx5_ib_mr *mr;
1359 int err;
1360
1361 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1362 if (!mr)
1363 return ERR_PTR(-ENOMEM);
1364
1365 in = kzalloc(sizeof(*in), GFP_KERNEL);
1366 if (!in) {
1367 err = -ENOMEM;
1368 goto err_free;
1369 }
1370
1371 in->seg.status = MLX5_MKEY_STATUS_FREE;
1372 in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
1373 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1374 in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
1375 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
1376 /*
1377 * TBD not needed - issue 197292 */
1378 in->seg.log2_page_size = PAGE_SHIFT;
1379
1380 err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
1381 NULL, NULL);
1382 kfree(in);
1383 if (err)
1384 goto err_free;
1385
1386 mr->ibmr.lkey = mr->mmr.key;
1387 mr->ibmr.rkey = mr->mmr.key;
1388 mr->umem = NULL;
1389
1390 return &mr->ibmr;
1391
1392err_free:
1393 kfree(mr);
1394 return ERR_PTR(err);
1395}
1396
1397struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, 1327struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
1398 int page_list_len) 1328 int page_list_len)
1399{ 1329{
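The mlx5 conversion above is one instance of the mr_alloc verb called out in the merge summary: a single ib_alloc_mr(pd, mr_type, max_num_sg) replaces the separate create_mr and alloc_fast_reg_mr entry points. A consumer-side sketch against the in-kernel verbs API of this series; the helper name and the surrounding error handling are illustrative:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Allocate a fast-registration MR able to map up to max_num_sg pages;
 * IB_MR_TYPE_SIGNATURE would request a signature-enabled MR instead. */
static struct ib_mr *alloc_fr_mr(struct ib_pd *pd, u32 max_num_sg)
{
	struct ib_mr *mr;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_num_sg);
	if (IS_ERR(mr))
		return mr;

	/* ... use mr->lkey / mr->rkey in work requests, ib_dereg_mr() when done ... */
	return mr;
}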
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 203c8a45e095..c745c6c5e10d 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -76,11 +76,6 @@ static int is_qp0(enum ib_qp_type qp_type)
76 return qp_type == IB_QPT_SMI; 76 return qp_type == IB_QPT_SMI;
77} 77}
78 78
79static int is_qp1(enum ib_qp_type qp_type)
80{
81 return qp_type == IB_QPT_GSI;
82}
83
84static int is_sqp(enum ib_qp_type qp_type) 79static int is_sqp(enum ib_qp_type qp_type)
85{ 80{
86 return is_qp0(qp_type) || is_qp1(qp_type); 81 return is_qp0(qp_type) || is_qp1(qp_type);
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 93ae51dcf2ff..dc2d48c59e62 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -97,6 +97,7 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
97 props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; 97 props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps;
98 props->max_qp_wr = mdev->limits.max_wqes; 98 props->max_qp_wr = mdev->limits.max_wqes;
99 props->max_sge = mdev->limits.max_sg; 99 props->max_sge = mdev->limits.max_sg;
100 props->max_sge_rd = props->max_sge;
100 props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; 101 props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
101 props->max_cqe = mdev->limits.max_cqes; 102 props->max_cqe = mdev->limits.max_cqes;
102 props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; 103 props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index fbc43e5f717b..44cb513f9a87 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -375,9 +375,11 @@ static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
375} 375}
376 376
377/* 377/*
378 * nes_alloc_fast_reg_mr
378 * nes_alloc_mr
379 */ 379 */
380static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
380static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
381 enum ib_mr_type mr_type,
382 u32 max_num_sg)
381{ 383{
382 struct nes_pd *nespd = to_nespd(ibpd); 384 struct nes_pd *nespd = to_nespd(ibpd);
383 struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); 385 struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
@@ -393,11 +395,18 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
393 u32 stag; 395 u32 stag;
394 int ret; 396 int ret;
395 struct ib_mr *ibmr; 397 struct ib_mr *ibmr;
398
399 if (mr_type != IB_MR_TYPE_MEM_REG)
400 return ERR_PTR(-EINVAL);
401
402 if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
403 return ERR_PTR(-E2BIG);
404
396/* 405/*
397 * Note: Set to always use a fixed length single page entry PBL. This is to allow 406 * Note: Set to always use a fixed length single page entry PBL. This is to allow
398 * for the fast_reg_mr operation to always know the size of the PBL. 407 * for the fast_reg_mr operation to always know the size of the PBL.
399 */ 408 */
400 if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
409 if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
401 return ERR_PTR(-E2BIG); 410 return ERR_PTR(-E2BIG);
402 411
403 get_random_bytes(&next_stag_index, sizeof(next_stag_index)); 412 get_random_bytes(&next_stag_index, sizeof(next_stag_index));
@@ -424,7 +433,7 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
424 nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", 433 nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
425 stag, stag_index); 434 stag, stag_index);
426 435
427 ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len);
436 ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg);
428 437
429 if (ret == 0) { 438 if (ret == 0) {
430 nesmr->ibmr.rkey = stag; 439 nesmr->ibmr.rkey = stag;
@@ -3929,7 +3938,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
3929 nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; 3938 nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
3930 nesibdev->ibdev.bind_mw = nes_bind_mw; 3939 nesibdev->ibdev.bind_mw = nes_bind_mw;
3931 3940
3932 nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr;
3941 nesibdev->ibdev.alloc_mr = nes_alloc_mr;
3933 nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list; 3942 nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list;
3934 nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list; 3943 nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list;
3935 3944
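The nes change follows the same provider-side shape as the other drivers in this series: the .alloc_mr handler validates the requested type and scatter/gather limit, then falls through to the existing fast-registration path. A schematic sketch; DRV_MAX_PAGES and drv_alloc_fr_mr() are hypothetical placeholders, not nes symbols:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

#define DRV_MAX_PAGES 512			/* hypothetical device limit */

static struct ib_mr *drv_alloc_fr_mr(struct ib_pd *pd, u32 pages);	/* existing helper */

static struct ib_mr *drv_alloc_mr(struct ib_pd *pd,
				  enum ib_mr_type mr_type,
				  u32 max_num_sg)
{
	if (mr_type != IB_MR_TYPE_MEM_REG)	/* only fast-reg MRs supported */
		return ERR_PTR(-EINVAL);
	if (max_num_sg > DRV_MAX_PAGES)
		return ERR_PTR(-E2BIG);

	return drv_alloc_fr_mr(pd, max_num_sg);
}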
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index 6a36338593cd..b4091ab48db0 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -246,7 +246,6 @@ struct ocrdma_dev {
246 u16 base_eqid; 246 u16 base_eqid;
247 u16 max_eq; 247 u16 max_eq;
248 248
249 union ib_gid *sgid_tbl;
250 /* provided synchronization to sgid table for 249 /* provided synchronization to sgid table for
251 * updating gid entries triggered by notifier. 250 * updating gid entries triggered by notifier.
252 */ 251 */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index b119a3413a15..87aa55df7c82 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -67,8 +67,6 @@ static LIST_HEAD(ocrdma_dev_list);
67static DEFINE_SPINLOCK(ocrdma_devlist_lock); 67static DEFINE_SPINLOCK(ocrdma_devlist_lock);
68static DEFINE_IDR(ocrdma_dev_id); 68static DEFINE_IDR(ocrdma_dev_id);
69 69
70static union ib_gid ocrdma_zero_sgid;
71
72void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) 70void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
73{ 71{
74 u8 mac_addr[6]; 72 u8 mac_addr[6];
@@ -83,135 +81,6 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
83 guid[6] = mac_addr[4]; 81 guid[6] = mac_addr[4];
84 guid[7] = mac_addr[5]; 82 guid[7] = mac_addr[5];
85} 83}
86
87static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
88{
89 int i;
90 unsigned long flags;
91
92 memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
93
94
95 spin_lock_irqsave(&dev->sgid_lock, flags);
96 for (i = 0; i < OCRDMA_MAX_SGID; i++) {
97 if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
98 sizeof(union ib_gid))) {
99 /* found free entry */
100 memcpy(&dev->sgid_tbl[i], new_sgid,
101 sizeof(union ib_gid));
102 spin_unlock_irqrestore(&dev->sgid_lock, flags);
103 return true;
104 } else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
105 sizeof(union ib_gid))) {
106 /* entry already present, no addition is required. */
107 spin_unlock_irqrestore(&dev->sgid_lock, flags);
108 return false;
109 }
110 }
111 spin_unlock_irqrestore(&dev->sgid_lock, flags);
112 return false;
113}
114
115static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
116{
117 int found = false;
118 int i;
119 unsigned long flags;
120
121
122 spin_lock_irqsave(&dev->sgid_lock, flags);
123 /* first is default sgid, which cannot be deleted. */
124 for (i = 1; i < OCRDMA_MAX_SGID; i++) {
125 if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
126 /* found matching entry */
127 memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
128 found = true;
129 break;
130 }
131 }
132 spin_unlock_irqrestore(&dev->sgid_lock, flags);
133 return found;
134}
135
136static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
137 union ib_gid *gid)
138{
139 struct ib_event gid_event;
140 struct ocrdma_dev *dev;
141 bool found = false;
142 bool updated = false;
143 bool is_vlan = false;
144
145 is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
146 if (is_vlan)
147 netdev = rdma_vlan_dev_real_dev(netdev);
148
149 rcu_read_lock();
150 list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
151 if (dev->nic_info.netdev == netdev) {
152 found = true;
153 break;
154 }
155 }
156 rcu_read_unlock();
157
158 if (!found)
159 return NOTIFY_DONE;
160
161 mutex_lock(&dev->dev_lock);
162 switch (event) {
163 case NETDEV_UP:
164 updated = ocrdma_add_sgid(dev, gid);
165 break;
166 case NETDEV_DOWN:
167 updated = ocrdma_del_sgid(dev, gid);
168 break;
169 default:
170 break;
171 }
172 if (updated) {
173 /* GID table updated, notify the consumers about it */
174 gid_event.device = &dev->ibdev;
175 gid_event.element.port_num = 1;
176 gid_event.event = IB_EVENT_GID_CHANGE;
177 ib_dispatch_event(&gid_event);
178 }
179 mutex_unlock(&dev->dev_lock);
180 return NOTIFY_OK;
181}
182
183static int ocrdma_inetaddr_event(struct notifier_block *notifier,
184 unsigned long event, void *ptr)
185{
186 struct in_ifaddr *ifa = ptr;
187 union ib_gid gid;
188 struct net_device *netdev = ifa->ifa_dev->dev;
189
190 ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
191 return ocrdma_addr_event(event, netdev, &gid);
192}
193
194static struct notifier_block ocrdma_inetaddr_notifier = {
195 .notifier_call = ocrdma_inetaddr_event
196};
197
198#if IS_ENABLED(CONFIG_IPV6)
199
200static int ocrdma_inet6addr_event(struct notifier_block *notifier,
201 unsigned long event, void *ptr)
202{
203 struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
204 union ib_gid *gid = (union ib_gid *)&ifa->addr;
205 struct net_device *netdev = ifa->idev->dev;
206 return ocrdma_addr_event(event, netdev, gid);
207}
208
209static struct notifier_block ocrdma_inet6addr_notifier = {
210 .notifier_call = ocrdma_inet6addr_event
211};
212
213#endif /* IPV6 and VLAN */
214
215static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device, 84static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
216 u8 port_num) 85 u8 port_num)
217{ 86{
@@ -280,6 +149,9 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
280 dev->ibdev.query_port = ocrdma_query_port; 149 dev->ibdev.query_port = ocrdma_query_port;
281 dev->ibdev.modify_port = ocrdma_modify_port; 150 dev->ibdev.modify_port = ocrdma_modify_port;
282 dev->ibdev.query_gid = ocrdma_query_gid; 151 dev->ibdev.query_gid = ocrdma_query_gid;
152 dev->ibdev.get_netdev = ocrdma_get_netdev;
153 dev->ibdev.add_gid = ocrdma_add_gid;
154 dev->ibdev.del_gid = ocrdma_del_gid;
283 dev->ibdev.get_link_layer = ocrdma_link_layer; 155 dev->ibdev.get_link_layer = ocrdma_link_layer;
284 dev->ibdev.alloc_pd = ocrdma_alloc_pd; 156 dev->ibdev.alloc_pd = ocrdma_alloc_pd;
285 dev->ibdev.dealloc_pd = ocrdma_dealloc_pd; 157 dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
@@ -309,7 +181,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
309 dev->ibdev.dereg_mr = ocrdma_dereg_mr; 181 dev->ibdev.dereg_mr = ocrdma_dereg_mr;
310 dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; 182 dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
311 183
312 dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
184 dev->ibdev.alloc_mr = ocrdma_alloc_mr;
313 dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list; 185 dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
314 dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list; 186 dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
315 187
@@ -342,12 +214,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
342static int ocrdma_alloc_resources(struct ocrdma_dev *dev) 214static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
343{ 215{
344 mutex_init(&dev->dev_lock); 216 mutex_init(&dev->dev_lock);
345 dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
346 OCRDMA_MAX_SGID, GFP_KERNEL);
347 if (!dev->sgid_tbl)
348 goto alloc_err;
349 spin_lock_init(&dev->sgid_lock);
350
351 dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) * 217 dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
352 OCRDMA_MAX_CQ, GFP_KERNEL); 218 OCRDMA_MAX_CQ, GFP_KERNEL);
353 if (!dev->cq_tbl) 219 if (!dev->cq_tbl)
@@ -379,7 +245,6 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev)
379 kfree(dev->stag_arr); 245 kfree(dev->stag_arr);
380 kfree(dev->qp_tbl); 246 kfree(dev->qp_tbl);
381 kfree(dev->cq_tbl); 247 kfree(dev->cq_tbl);
382 kfree(dev->sgid_tbl);
383} 248}
384 249
385/* OCRDMA sysfs interface */ 250/* OCRDMA sysfs interface */
@@ -425,68 +290,6 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
425 device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); 290 device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
426} 291}
427 292
428static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
429{
430 /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
431 union ib_gid *sgid = &dev->sgid_tbl[0];
432
433 sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
434 ocrdma_get_guid(dev, &sgid->raw[8]);
435}
436
437static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
438 struct net_device *net)
439{
440 struct in_device *in_dev;
441 union ib_gid gid;
442 in_dev = in_dev_get(net);
443 if (in_dev) {
444 for_ifa(in_dev) {
445 ipv6_addr_set_v4mapped(ifa->ifa_address,
446 (struct in6_addr *)&gid);
447 ocrdma_add_sgid(dev, &gid);
448 }
449 endfor_ifa(in_dev);
450 in_dev_put(in_dev);
451 }
452}
453
454static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
455 struct net_device *net)
456{
457#if IS_ENABLED(CONFIG_IPV6)
458 struct inet6_dev *in6_dev;
459 union ib_gid *pgid;
460 struct inet6_ifaddr *ifp;
461 in6_dev = in6_dev_get(net);
462 if (in6_dev) {
463 read_lock_bh(&in6_dev->lock);
464 list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
465 pgid = (union ib_gid *)&ifp->addr;
466 ocrdma_add_sgid(dev, pgid);
467 }
468 read_unlock_bh(&in6_dev->lock);
469 in6_dev_put(in6_dev);
470 }
471#endif
472}
473
474static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
475{
476 struct net_device *net_dev;
477
478 for_each_netdev(&init_net, net_dev) {
479 struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
480 rdma_vlan_dev_real_dev(net_dev) : net_dev;
481
482 if (real_dev == dev->nic_info.netdev) {
483 ocrdma_add_default_sgid(dev);
484 ocrdma_init_ipv4_gids(dev, net_dev);
485 ocrdma_init_ipv6_gids(dev, net_dev);
486 }
487 }
488}
489
490static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) 293static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
491{ 294{
492 int status = 0, i; 295 int status = 0, i;
@@ -515,7 +318,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
515 goto alloc_err; 318 goto alloc_err;
516 319
517 ocrdma_init_service_level(dev); 320 ocrdma_init_service_level(dev);
518 ocrdma_init_gid_table(dev);
519 status = ocrdma_register_device(dev); 321 status = ocrdma_register_device(dev);
520 if (status) 322 if (status)
521 goto alloc_err; 323 goto alloc_err;
@@ -662,34 +464,12 @@ static struct ocrdma_driver ocrdma_drv = {
662 .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, 464 .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION,
663}; 465};
664 466
665static void ocrdma_unregister_inet6addr_notifier(void)
666{
667#if IS_ENABLED(CONFIG_IPV6)
668 unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
669#endif
670}
671
672static void ocrdma_unregister_inetaddr_notifier(void)
673{
674 unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
675}
676
677static int __init ocrdma_init_module(void) 467static int __init ocrdma_init_module(void)
678{ 468{
679 int status; 469 int status;
680 470
681 ocrdma_init_debugfs(); 471 ocrdma_init_debugfs();
682 472
683 status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
684 if (status)
685 return status;
686
687#if IS_ENABLED(CONFIG_IPV6)
688 status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
689 if (status)
690 goto err_notifier6;
691#endif
692
693 status = be_roce_register_driver(&ocrdma_drv); 473 status = be_roce_register_driver(&ocrdma_drv);
694 if (status) 474 if (status)
695 goto err_be_reg; 475 goto err_be_reg;
@@ -697,19 +477,13 @@ static int __init ocrdma_init_module(void)
697 return 0; 477 return 0;
698 478
699err_be_reg: 479err_be_reg:
700#if IS_ENABLED(CONFIG_IPV6)
480
701 ocrdma_unregister_inet6addr_notifier();
702err_notifier6:
703#endif
704 ocrdma_unregister_inetaddr_notifier();
705 return status; 481 return status;
706} 482}
707 483
708static void __exit ocrdma_exit_module(void) 484static void __exit ocrdma_exit_module(void)
709{ 485{
710 be_roce_unregister_driver(&ocrdma_drv); 486 be_roce_unregister_driver(&ocrdma_drv);
711 ocrdma_unregister_inet6addr_notifier();
712 ocrdma_unregister_inetaddr_notifier();
713 ocrdma_rem_debugfs(); 487 ocrdma_rem_debugfs();
714 idr_destroy(&ocrdma_dev_id); 488 idr_destroy(&ocrdma_dev_id);
715} 489}
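With this change ocrdma stops maintaining its own sgid table and inet/inet6 notifiers; the RoCE GID table handling added to the core caching code (see the merge summary) now drives the table through driver callbacks. A skeleton of the callbacks a RoCE driver registers, assuming the ib_device fields shown in the hunk; the drv_* names are illustrative:

#include <rdma/ib_verbs.h>

/* Called by the core GID cache when entry 'index' of the port's table
 * is populated; a no-op if, as in ocrdma, the HW table is programmed
 * elsewhere. */
static int drv_add_gid(struct ib_device *device, u8 port_num,
		       unsigned int index, const union ib_gid *gid,
		       const struct ib_gid_attr *attr, void **context)
{
	return 0;
}

/* Called when entry 'index' is cleared. */
static int drv_del_gid(struct ib_device *device, u8 port_num,
		       unsigned int index, void **context)
{
	return 0;
}

/* Wired up at registration time, alongside .get_netdev:
 *	dev->ibdev.get_netdev = drv_get_netdev;
 *	dev->ibdev.add_gid    = drv_add_gid;
 *	dev->ibdev.del_gid    = drv_del_gid;
 */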
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 80006b24aa11..6a38268bbe9f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -140,6 +140,8 @@ enum {
140 OCRDMA_DB_RQ_SHIFT = 24 140 OCRDMA_DB_RQ_SHIFT = 24
141}; 141};
142 142
143#define OCRDMA_ROUDP_FLAGS_SHIFT 0x03
144
143#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ 145#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */
144#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ 146#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */
145/* qid #2 msbits at 12-11 */ 147/* qid #2 msbits at 12-11 */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index bc84cd462ecf..1f3affb6a477 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -46,6 +46,7 @@
46#include <rdma/iw_cm.h> 46#include <rdma/iw_cm.h>
47#include <rdma/ib_umem.h> 47#include <rdma/ib_umem.h>
48#include <rdma/ib_addr.h> 48#include <rdma/ib_addr.h>
49#include <rdma/ib_cache.h>
49 50
50#include "ocrdma.h" 51#include "ocrdma.h"
51#include "ocrdma_hw.h" 52#include "ocrdma_hw.h"
@@ -64,6 +65,7 @@ int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
64int ocrdma_query_gid(struct ib_device *ibdev, u8 port, 65int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
65 int index, union ib_gid *sgid) 66 int index, union ib_gid *sgid)
66{ 67{
68 int ret;
67 struct ocrdma_dev *dev; 69 struct ocrdma_dev *dev;
68 70
69 dev = get_ocrdma_dev(ibdev); 71 dev = get_ocrdma_dev(ibdev);
@@ -71,8 +73,28 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
71 if (index >= OCRDMA_MAX_SGID) 73 if (index >= OCRDMA_MAX_SGID)
72 return -EINVAL; 74 return -EINVAL;
73 75
74 memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
76 ret = ib_get_cached_gid(ibdev, port, index, sgid);
77 if (ret == -EAGAIN) {
78 memcpy(sgid, &zgid, sizeof(*sgid));
79 return 0;
80 }
81
82 return ret;
83}
84
85int ocrdma_add_gid(struct ib_device *device,
86 u8 port_num,
87 unsigned int index,
88 const union ib_gid *gid,
89 const struct ib_gid_attr *attr,
90 void **context) {
91 return 0;
92}
75 93
94int ocrdma_del_gid(struct ib_device *device,
95 u8 port_num,
96 unsigned int index,
97 void **context) {
76 return 0; 98 return 0;
77} 99}
78 100
@@ -125,6 +147,24 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
125 return 0; 147 return 0;
126} 148}
127 149
150struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
151{
152 struct ocrdma_dev *dev;
153 struct net_device *ndev = NULL;
154
155 rcu_read_lock();
156
157 dev = get_ocrdma_dev(ibdev);
158 if (dev)
159 ndev = dev->nic_info.netdev;
160 if (ndev)
161 dev_hold(ndev);
162
163 rcu_read_unlock();
164
165 return ndev;
166}
167
128static inline void get_link_speed_and_width(struct ocrdma_dev *dev, 168static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
129 u8 *ib_speed, u8 *ib_width) 169 u8 *ib_speed, u8 *ib_width)
130{ 170{
@@ -194,7 +234,8 @@ int ocrdma_query_port(struct ib_device *ibdev,
194 props->port_cap_flags = 234 props->port_cap_flags =
195 IB_PORT_CM_SUP | 235 IB_PORT_CM_SUP |
196 IB_PORT_REINIT_SUP | 236 IB_PORT_REINIT_SUP |
197 IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
237 IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP |
238 IB_PORT_IP_BASED_GIDS;
198 props->gid_tbl_len = OCRDMA_MAX_SGID; 239 props->gid_tbl_len = OCRDMA_MAX_SGID;
199 props->pkey_tbl_len = 1; 240 props->pkey_tbl_len = 1;
200 props->bad_pkey_cntr = 0; 241 props->bad_pkey_cntr = 0;
@@ -2998,21 +3039,26 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
2998 return 0; 3039 return 0;
2999} 3040}
3000 3041
3001struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
3042struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd,
3043 enum ib_mr_type mr_type,
3044 u32 max_num_sg)
3002{ 3045{
3003 int status; 3046 int status;
3004 struct ocrdma_mr *mr; 3047 struct ocrdma_mr *mr;
3005 struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); 3048 struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
3006 struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); 3049 struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
3007 3050
3008 if (max_page_list_len > dev->attr.max_pages_per_frmr)
3051 if (mr_type != IB_MR_TYPE_MEM_REG)
3052 return ERR_PTR(-EINVAL);
3053
3054 if (max_num_sg > dev->attr.max_pages_per_frmr)
3009 return ERR_PTR(-EINVAL); 3055 return ERR_PTR(-EINVAL);
3010 3056
3011 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 3057 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
3012 if (!mr) 3058 if (!mr)
3013 return ERR_PTR(-ENOMEM); 3059 return ERR_PTR(-ENOMEM);
3014 3060
3015 status = ocrdma_get_pbl_info(dev, mr, max_page_list_len);
3061 status = ocrdma_get_pbl_info(dev, mr, max_num_sg);
3016 if (status) 3062 if (status)
3017 goto pbl_err; 3063 goto pbl_err;
3018 mr->hwmr.fr_mr = 1; 3064 mr->hwmr.fr_mr = 1;
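query_gid is now answered from the core GID cache: an -EAGAIN from ib_get_cached_gid() means the entry is not yet populated and is reported back as the all-zero GID (zgid), exactly as the hunk does. A condensed sketch, assuming the four-argument ib_get_cached_gid() used in this series:

#include <linux/string.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

static int drv_query_gid(struct ib_device *ibdev, u8 port, int index,
			 union ib_gid *sgid)
{
	int ret = ib_get_cached_gid(ibdev, port, index, sgid);

	if (ret == -EAGAIN) {			/* empty table entry */
		memcpy(sgid, &zgid, sizeof(*sgid));
		return 0;
	}
	return ret;
}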
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index eaccb2d3cb9f..308c16857a5d 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -63,6 +63,17 @@ ocrdma_query_protocol(struct ib_device *device, u8 port_num);
63void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); 63void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
64int ocrdma_query_gid(struct ib_device *, u8 port, 64int ocrdma_query_gid(struct ib_device *, u8 port,
65 int index, union ib_gid *gid); 65 int index, union ib_gid *gid);
66struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
67int ocrdma_add_gid(struct ib_device *device,
68 u8 port_num,
69 unsigned int index,
70 const union ib_gid *gid,
71 const struct ib_gid_attr *attr,
72 void **context);
73int ocrdma_del_gid(struct ib_device *device,
74 u8 port_num,
75 unsigned int index,
76 void **context);
66int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); 77int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
67 78
68struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, 79struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
@@ -111,7 +122,9 @@ struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
111 int num_phys_buf, int acc, u64 *iova_start); 122 int num_phys_buf, int acc, u64 *iova_start);
112struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, 123struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
113 u64 virt, int acc, struct ib_udata *); 124 u64 virt, int acc, struct ib_udata *);
114struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
125struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
126 enum ib_mr_type mr_type,
127 u32 max_num_sg);
115struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device 128struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
116 *ibdev, 129 *ibdev,
117 int page_list_len); 130 int page_list_len);
diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c
index ad843c786e72..5afaa218508d 100644
--- a/drivers/infiniband/hw/qib/qib_keys.c
+++ b/drivers/infiniband/hw/qib/qib_keys.c
@@ -86,6 +86,10 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region)
86 * unrestricted LKEY. 86 * unrestricted LKEY.
87 */ 87 */
88 rkt->gen++; 88 rkt->gen++;
89 /*
90 * bits are capped in qib_verbs.c to insure enough bits
91 * for generation number
92 */
89 mr->lkey = (r << (32 - ib_qib_lkey_table_size)) | 93 mr->lkey = (r << (32 - ib_qib_lkey_table_size)) |
90 ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen) 94 ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen)
91 << 8); 95 << 8);
diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
index 941d4d50d8e7..57e99dc0d80c 100644
--- a/drivers/infiniband/hw/qib/qib_mad.h
+++ b/drivers/infiniband/hw/qib/qib_mad.h
@@ -36,148 +36,17 @@
36 36
37#include <rdma/ib_pma.h> 37#include <rdma/ib_pma.h>
38 38
39#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
40#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
41#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
42#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
39#define IB_SMP_UNSUP_VERSION \
40cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_VERSION)
43 41
44struct ib_node_info {
45 u8 base_version;
42#define IB_SMP_UNSUP_METHOD \
43cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD)
46 u8 class_version;
47 u8 node_type;
48 u8 num_ports;
49 __be64 sys_guid;
50 __be64 node_guid;
51 __be64 port_guid;
52 __be16 partition_cap;
53 __be16 device_id;
54 __be32 revision;
55 u8 local_port_num;
56 u8 vendor_id[3];
57} __packed;
58
59struct ib_mad_notice_attr {
60 u8 generic_type;
61 u8 prod_type_msb;
62 __be16 prod_type_lsb;
63 __be16 trap_num;
64 __be16 issuer_lid;
65 __be16 toggle_count;
66
67 union {
68 struct {
69 u8 details[54];
70 } raw_data;
71
72 struct {
73 __be16 reserved;
74 __be16 lid; /* where violation happened */
75 u8 port_num; /* where violation happened */
76 } __packed ntc_129_131;
77
78 struct {
79 __be16 reserved;
80 __be16 lid; /* LID where change occurred */
81 u8 reserved2;
82 u8 local_changes; /* low bit - local changes */
83 __be32 new_cap_mask; /* new capability mask */
84 u8 reserved3;
85 u8 change_flags; /* low 3 bits only */
86 } __packed ntc_144;
87
88 struct {
89 __be16 reserved;
90 __be16 lid; /* lid where sys guid changed */
91 __be16 reserved2;
92 __be64 new_sys_guid;
93 } __packed ntc_145;
94
95 struct {
96 __be16 reserved;
97 __be16 lid;
98 __be16 dr_slid;
99 u8 method;
100 u8 reserved2;
101 __be16 attr_id;
102 __be32 attr_mod;
103 __be64 mkey;
104 u8 reserved3;
105 u8 dr_trunc_hop;
106 u8 dr_rtn_path[30];
107 } __packed ntc_256;
108
109 struct {
110 __be16 reserved;
111 __be16 lid1;
112 __be16 lid2;
113 __be32 key;
114 __be32 sl_qp1; /* SL: high 4 bits */
115 __be32 qp2; /* high 8 bits reserved */
116 union ib_gid gid1;
117 union ib_gid gid2;
118 } __packed ntc_257_258;
119
120 } details;
121};
122
123/*
124 * Generic trap/notice types
125 */
126#define IB_NOTICE_TYPE_FATAL 0x80
127#define IB_NOTICE_TYPE_URGENT 0x81
128#define IB_NOTICE_TYPE_SECURITY 0x82
129#define IB_NOTICE_TYPE_SM 0x83
130#define IB_NOTICE_TYPE_INFO 0x84
131 44
132/*
133 * Generic trap/notice producers
45#define IB_SMP_UNSUP_METH_ATTR \
46cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB)
134 */
135#define IB_NOTICE_PROD_CA cpu_to_be16(1)
136#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2)
137#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3)
138#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4)
139 47
140/*
141 * Generic trap/notice numbers
48#define IB_SMP_INVALID_FIELD \
49cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE)
142 */
143#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129)
144#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130)
145#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131)
146#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144)
147#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145)
148#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256)
149#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257)
150#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258)
151
152/*
153 * Repress trap/notice flags
154 */
155#define IB_NOTICE_REPRESS_LLI_THRESH (1 << 0)
156#define IB_NOTICE_REPRESS_EBO_THRESH (1 << 1)
157#define IB_NOTICE_REPRESS_FLOW_UPDATE (1 << 2)
158#define IB_NOTICE_REPRESS_CAP_MASK_CHG (1 << 3)
159#define IB_NOTICE_REPRESS_SYS_GUID_CHG (1 << 4)
160#define IB_NOTICE_REPRESS_BAD_MKEY (1 << 5)
161#define IB_NOTICE_REPRESS_BAD_PKEY (1 << 6)
162#define IB_NOTICE_REPRESS_BAD_QKEY (1 << 7)
163
164/*
165 * Generic trap/notice other local changes flags (trap 144).
166 */
167#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
168#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
169#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01
170
171/*
172 * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
173 */
174#define IB_NOTICE_TRAP_DR_NOTICE 0x80
175#define IB_NOTICE_TRAP_DR_TRUNC 0x40
176
177struct ib_vl_weight_elem {
178 u8 vl; /* Only low 4 bits, upper 4 bits reserved */
179 u8 weight;
180};
181 50
182#define IB_VLARB_LOWPRI_0_31 1 51#define IB_VLARB_LOWPRI_0_31 1
183#define IB_VLARB_LOWPRI_32_63 2 52#define IB_VLARB_LOWPRI_32_63 2
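The replaced literals map onto the generic MAD status codes now taken from ib_mad.h; as implied by the removed defines above, the correspondence is:

	IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)  ->  IB_MGMT_MAD_STATUS_BAD_VERSION
	IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)  ->  IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD
	IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)  ->  IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB
	IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)  ->  IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE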
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
index c4473db46699..19220dcb9a3b 100644
--- a/drivers/infiniband/hw/qib/qib_mr.c
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -327,11 +327,16 @@ out:
327 * 327 *
328 * Return the memory region on success, otherwise return an errno. 328 * Return the memory region on success, otherwise return an errno.
329 */ 329 */
330struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
330struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
331 enum ib_mr_type mr_type,
332 u32 max_num_sg)
331{ 333{
332 struct qib_mr *mr; 334 struct qib_mr *mr;
333 335
334 mr = alloc_mr(max_page_list_len, pd);
336 if (mr_type != IB_MR_TYPE_MEM_REG)
337 return ERR_PTR(-EINVAL);
338
339 mr = alloc_mr(max_num_sg, pd);
335 if (IS_ERR(mr)) 340 if (IS_ERR(mr))
336 return (struct ib_mr *)mr; 341 return (struct ib_mr *)mr;
337 342
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index f42bd0f47577..22e356ca8058 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <rdma/ib_smi.h>
35 36
36#include "qib.h" 37#include "qib.h"
37#include "qib_mad.h" 38#include "qib_mad.h"
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index a05d1a372208..3dcc4985b60f 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -40,6 +40,7 @@
40#include <linux/rculist.h> 40#include <linux/rculist.h>
41#include <linux/mm.h> 41#include <linux/mm.h>
42#include <linux/random.h> 42#include <linux/random.h>
43#include <linux/vmalloc.h>
43 44
44#include "qib.h" 45#include "qib.h"
45#include "qib_common.h" 46#include "qib_common.h"
@@ -1574,6 +1575,7 @@ static int qib_query_device(struct ib_device *ibdev, struct ib_device_attr *prop
1574 props->max_qp = ib_qib_max_qps; 1575 props->max_qp = ib_qib_max_qps;
1575 props->max_qp_wr = ib_qib_max_qp_wrs; 1576 props->max_qp_wr = ib_qib_max_qp_wrs;
1576 props->max_sge = ib_qib_max_sges; 1577 props->max_sge = ib_qib_max_sges;
1578 props->max_sge_rd = ib_qib_max_sges;
1577 props->max_cq = ib_qib_max_cqs; 1579 props->max_cq = ib_qib_max_cqs;
1578 props->max_ah = ib_qib_max_ahs; 1580 props->max_ah = ib_qib_max_ahs;
1579 props->max_cqe = ib_qib_max_cqes; 1581 props->max_cqe = ib_qib_max_cqes;
@@ -2109,10 +2111,16 @@ int qib_register_ib_device(struct qib_devdata *dd)
2109 * the LKEY). The remaining bits act as a generation number or tag. 2111 * the LKEY). The remaining bits act as a generation number or tag.
2110 */ 2112 */
2111 spin_lock_init(&dev->lk_table.lock); 2113 spin_lock_init(&dev->lk_table.lock);
2114 /* insure generation is at least 4 bits see keys.c */
2115 if (ib_qib_lkey_table_size > MAX_LKEY_TABLE_BITS) {
2116 qib_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
2117 ib_qib_lkey_table_size, MAX_LKEY_TABLE_BITS);
2118 ib_qib_lkey_table_size = MAX_LKEY_TABLE_BITS;
2119 }
2112 dev->lk_table.max = 1 << ib_qib_lkey_table_size; 2120 dev->lk_table.max = 1 << ib_qib_lkey_table_size;
2113 lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); 2121 lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
2114 dev->lk_table.table = (struct qib_mregion __rcu **) 2122 dev->lk_table.table = (struct qib_mregion __rcu **)
2115 __get_free_pages(GFP_KERNEL, get_order(lk_tab_size));
2123 vmalloc(lk_tab_size);
2116 if (dev->lk_table.table == NULL) { 2124 if (dev->lk_table.table == NULL) {
2117 ret = -ENOMEM; 2125 ret = -ENOMEM;
2118 goto err_lk; 2126 goto err_lk;
@@ -2235,7 +2243,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
2235 ibdev->reg_phys_mr = qib_reg_phys_mr; 2243 ibdev->reg_phys_mr = qib_reg_phys_mr;
2236 ibdev->reg_user_mr = qib_reg_user_mr; 2244 ibdev->reg_user_mr = qib_reg_user_mr;
2237 ibdev->dereg_mr = qib_dereg_mr; 2245 ibdev->dereg_mr = qib_dereg_mr;
2238 ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr;
2246 ibdev->alloc_mr = qib_alloc_mr;
2239 ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list; 2247 ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list;
2240 ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list; 2248 ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list;
2241 ibdev->alloc_fmr = qib_alloc_fmr; 2249 ibdev->alloc_fmr = qib_alloc_fmr;
@@ -2286,7 +2294,7 @@ err_tx:
2286 sizeof(struct qib_pio_header), 2294 sizeof(struct qib_pio_header),
2287 dev->pio_hdrs, dev->pio_hdrs_phys); 2295 dev->pio_hdrs, dev->pio_hdrs_phys);
2288err_hdrs: 2296err_hdrs:
2289 free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size));
2297 vfree(dev->lk_table.table);
2290err_lk: 2298err_lk:
2291 kfree(dev->qp_table); 2299 kfree(dev->qp_table);
2292err_qpt: 2300err_qpt:
@@ -2340,8 +2348,7 @@ void qib_unregister_ib_device(struct qib_devdata *dd)
2340 sizeof(struct qib_pio_header), 2348 sizeof(struct qib_pio_header),
2341 dev->pio_hdrs, dev->pio_hdrs_phys); 2349 dev->pio_hdrs, dev->pio_hdrs_phys);
2342 lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); 2350 lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
2343 free_pages((unsigned long) dev->lk_table.table,
2344 get_order(lk_tab_size));
2351 vfree(dev->lk_table.table);
2345 kfree(dev->qp_table); 2352 kfree(dev->qp_table);
2346} 2353}
2347 2354
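The qib change caps the lkey table size and moves its allocation to vmalloc(): at the new MAX_LKEY_TABLE_BITS of 23 the table holds 2^23 pointer-sized entries (64 MB with 8-byte pointers), larger than the biggest contiguous block __get_free_pages() will typically provide. A sketch of the sizing logic, mirroring the constants added in the hunk; the helper name and entry_size parameter are illustrative:

#include <linux/vmalloc.h>

#define MAX_LKEY_TABLE_BITS 23	/* as added to qib_verbs.h above */

/* Cap the requested table size so enough lkey bits remain for the
 * generation tag, then allocate with vmalloc() (freed with vfree()). */
static void *alloc_lkey_table(unsigned int bits, size_t entry_size)
{
	if (bits > MAX_LKEY_TABLE_BITS)
		bits = MAX_LKEY_TABLE_BITS;

	return vmalloc((1UL << bits) * entry_size);
}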
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 1635572752ce..a08df70e8503 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -647,6 +647,8 @@ struct qib_qpn_table {
647 struct qpn_map map[QPNMAP_ENTRIES]; 647 struct qpn_map map[QPNMAP_ENTRIES];
648}; 648};
649 649
650#define MAX_LKEY_TABLE_BITS 23
651
650struct qib_lkey_table { 652struct qib_lkey_table {
651 spinlock_t lock; /* protect changes in this struct */ 653 spinlock_t lock; /* protect changes in this struct */
652 u32 next; /* next unused index (speeds search) */ 654 u32 next; /* next unused index (speeds search) */
@@ -1032,7 +1034,9 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1032 1034
1033int qib_dereg_mr(struct ib_mr *ibmr); 1035int qib_dereg_mr(struct ib_mr *ibmr);
1034 1036
1035struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
1037struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
1038 enum ib_mr_type mr_type,
1039 u32 max_entries);
1036 1040
1037struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list( 1041struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list(
1038 struct ib_device *ibdev, int page_list_len); 1042 struct ib_device *ibdev, int page_list_len);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 79859c4d43c9..ca2873698d75 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -342,7 +342,6 @@ struct ipoib_dev_priv {
342 u16 pkey; 342 u16 pkey;
343 u16 pkey_index; 343 u16 pkey_index;
344 struct ib_pd *pd; 344 struct ib_pd *pd;
345 struct ib_mr *mr;
346 struct ib_cq *recv_cq; 345 struct ib_cq *recv_cq;
347 struct ib_cq *send_cq; 346 struct ib_cq *send_cq;
348 struct ib_qp *qp; 347 struct ib_qp *qp;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index ee39be6ccfb0..c78dc1638030 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -332,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev,
332 int i; 332 int i;
333 333
334 for (i = 0; i < priv->cm.num_frags; ++i) 334 for (i = 0; i < priv->cm.num_frags; ++i)
335 sge[i].lkey = priv->mr->lkey;
335 sge[i].lkey = priv->pd->local_dma_lkey;
336 336
337 sge[0].length = IPOIB_CM_HEAD_SIZE; 337 sge[0].length = IPOIB_CM_HEAD_SIZE;
338 for (i = 1; i < priv->cm.num_frags; ++i) 338 for (i = 1; i < priv->cm.num_frags; ++i)
@@ -848,7 +848,7 @@ int ipoib_cm_dev_open(struct net_device *dev)
848 } 848 }
849 849
850 ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 850 ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
851 0, NULL);
851 0);
852 if (ret) { 852 if (ret) {
853 printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, 853 printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
854 IPOIB_CM_IETF_ID | priv->qp->qp_num); 854 IPOIB_CM_IETF_ID | priv->qp->qp_num);
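ipoib no longer keeps a private all-physical MR (priv->mr was removed from ipoib.h above); receive SGEs take their lkey from the PD, whose local_dma_lkey the core now guarantees to be usable (see the "insecure all physical rkey only if needed" and ib_dealloc_pd items in the merge summary). A sketch of the ULP-side pattern, with illustrative parameter names:

#include <rdma/ib_verbs.h>

/* Build a local SGE for DMA-mapped memory without registering a
 * per-device global MR first. */
static void init_local_sge(struct ib_pd *pd, struct ib_sge *sge,
			   u64 dma_addr, u32 len)
{
	sge->addr   = dma_addr;
	sge->length = len;
	sge->lkey   = pd->local_dma_lkey;	/* filled in by ib_alloc_pd() in this series */
}

The same file also adapts to ib_cm_listen() losing its private-data comparison argument, which is independent of the MR change.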
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index b2943c84a5dd..36536ce5a3e2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,9 @@
48 48
49#include <linux/jhash.h> 49#include <linux/jhash.h>
50#include <net/arp.h> 50#include <net/arp.h>
51#include <net/addrconf.h>
52#include <linux/inetdevice.h>
53#include <rdma/ib_cache.h>
51 54
52#define DRV_VERSION "1.0.0" 55#define DRV_VERSION "1.0.0"
53 56
@@ -89,13 +92,18 @@ struct workqueue_struct *ipoib_workqueue;
89struct ib_sa_client ipoib_sa_client; 92struct ib_sa_client ipoib_sa_client;
90 93
91static void ipoib_add_one(struct ib_device *device); 94static void ipoib_add_one(struct ib_device *device);
92static void ipoib_remove_one(struct ib_device *device);
95static void ipoib_remove_one(struct ib_device *device, void *client_data);
93static void ipoib_neigh_reclaim(struct rcu_head *rp); 96static void ipoib_neigh_reclaim(struct rcu_head *rp);
97static struct net_device *ipoib_get_net_dev_by_params(
98 struct ib_device *dev, u8 port, u16 pkey,
99 const union ib_gid *gid, const struct sockaddr *addr,
100 void *client_data);
94 101
95static struct ib_client ipoib_client = { 102static struct ib_client ipoib_client = {
96 .name = "ipoib", 103 .name = "ipoib",
97 .add = ipoib_add_one, 104 .add = ipoib_add_one,
98 .remove = ipoib_remove_one
105 .remove = ipoib_remove_one,
106 .get_net_dev_by_params = ipoib_get_net_dev_by_params,
99}; 107};
100 108
101int ipoib_open(struct net_device *dev) 109int ipoib_open(struct net_device *dev)
@@ -222,6 +230,225 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
222 return 0; 230 return 0;
223} 231}
224 232
233/* Called with an RCU read lock taken */
234static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
235 struct net_device *dev)
236{
237 struct net *net = dev_net(dev);
238 struct in_device *in_dev;
239 struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
240 struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
241 __be32 ret_addr;
242
243 switch (addr->sa_family) {
244 case AF_INET:
245 in_dev = in_dev_get(dev);
246 if (!in_dev)
247 return false;
248
249 ret_addr = inet_confirm_addr(net, in_dev, 0,
250 addr_in->sin_addr.s_addr,
251 RT_SCOPE_HOST);
252 in_dev_put(in_dev);
253 if (ret_addr)
254 return true;
255
256 break;
257 case AF_INET6:
258 if (IS_ENABLED(CONFIG_IPV6) &&
259 ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
260 return true;
261
262 break;
263 }
264 return false;
265}
266
267/**
268 * Find the master net_device on top of the given net_device.
269 * @dev: base IPoIB net_device
270 *
271 * Returns the master net_device with a reference held, or the same net_device
272 * if no master exists.
273 */
274static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
275{
276 struct net_device *master;
277
278 rcu_read_lock();
279 master = netdev_master_upper_dev_get_rcu(dev);
280 if (master)
281 dev_hold(master);
282 rcu_read_unlock();
283
284 if (master)
285 return master;
286
287 dev_hold(dev);
288 return dev;
289}
290
291/**
292 * Find a net_device matching the given address, which is an upper device of
293 * the given net_device.
294 * @addr: IP address to look for.
295 * @dev: base IPoIB net_device
296 *
297 * If found, returns the net_device with a reference held. Otherwise return
298 * NULL.
299 */
300static struct net_device *ipoib_get_net_dev_match_addr(
301 const struct sockaddr *addr, struct net_device *dev)
302{
303 struct net_device *upper,
304 *result = NULL;
305 struct list_head *iter;
306
307 rcu_read_lock();
308 if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
309 dev_hold(dev);
310 result = dev;
311 goto out;
312 }
313
314 netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
315 if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
316 dev_hold(upper);
317 result = upper;
318 break;
319 }
320 }
321out:
322 rcu_read_unlock();
323 return result;
324}
325
326/* returns the number of IPoIB netdevs on top a given ipoib device matching a
327 * pkey_index and address, if one exists.
328 *
329 * @found_net_dev: contains a matching net_device if the return value >= 1,
330 * with a reference held. */
331static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
332 const union ib_gid *gid,
333 u16 pkey_index,
334 const struct sockaddr *addr,
335 int nesting,
336 struct net_device **found_net_dev)
337{
338 struct ipoib_dev_priv *child_priv;
339 struct net_device *net_dev = NULL;
340 int matches = 0;
341
342 if (priv->pkey_index == pkey_index &&
343 (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
344 if (!addr) {
345 net_dev = ipoib_get_master_net_dev(priv->dev);
346 } else {
347 /* Verify the net_device matches the IP address, as
348 * IPoIB child devices currently share a GID. */
349 net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
350 }
351 if (net_dev) {
352 if (!*found_net_dev)
353 *found_net_dev = net_dev;
354 else
355 dev_put(net_dev);
356 ++matches;
357 }
358 }
359
360 /* Check child interfaces */
361 down_read_nested(&priv->vlan_rwsem, nesting);
362 list_for_each_entry(child_priv, &priv->child_intfs, list) {
363 matches += ipoib_match_gid_pkey_addr(child_priv, gid,
364 pkey_index, addr,
365 nesting + 1,
366 found_net_dev);
367 if (matches > 1)
368 break;
369 }
370 up_read(&priv->vlan_rwsem);
371
372 return matches;
373}
374
375/* Returns the number of matching net_devs found (between 0 and 2). Also
 376 * returns the matching net_device in the @net_dev parameter, holding a
377 * reference to the net_device, if the number of matches >= 1 */
378static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
379 u16 pkey_index,
380 const union ib_gid *gid,
381 const struct sockaddr *addr,
382 struct net_device **net_dev)
383{
384 struct ipoib_dev_priv *priv;
385 int matches = 0;
386
387 *net_dev = NULL;
388
389 list_for_each_entry(priv, dev_list, list) {
390 if (priv->port != port)
391 continue;
392
393 matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
394 addr, 0, net_dev);
395 if (matches > 1)
396 break;
397 }
398
399 return matches;
400}
401
402static struct net_device *ipoib_get_net_dev_by_params(
403 struct ib_device *dev, u8 port, u16 pkey,
404 const union ib_gid *gid, const struct sockaddr *addr,
405 void *client_data)
406{
407 struct net_device *net_dev;
408 struct list_head *dev_list = client_data;
409 u16 pkey_index;
410 int matches;
411 int ret;
412
413 if (!rdma_protocol_ib(dev, port))
414 return NULL;
415
416 ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
417 if (ret)
418 return NULL;
419
420 if (!dev_list)
421 return NULL;
422
423 /* See if we can find a unique device matching the L2 parameters */
424 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
425 gid, NULL, &net_dev);
426
427 switch (matches) {
428 case 0:
429 return NULL;
430 case 1:
431 return net_dev;
432 }
433
434 dev_put(net_dev);
435
436 /* Couldn't find a unique device with L2 parameters only. Use L3
437 * address to uniquely match the net device */
438 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
439 gid, addr, &net_dev);
440 switch (matches) {
441 case 0:
442 return NULL;
443 default:
444 dev_warn_ratelimited(&dev->dev,
445 "duplicate IP address detected\n");
446 /* Fall through */
447 case 1:
448 return net_dev;
449 }
450}
451
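The lookup above is two-pass: ipoib_get_net_dev_by_params() first tries to find a unique interface by L2 parameters alone (port, pkey_index, GID) and only falls back to the L3 address when several child interfaces share the same GID. Whatever is returned carries a reference taken in ipoib_get_master_net_dev()/ipoib_get_net_dev_match_addr(), so the caller must drop it. A minimal caller sketch; the helper name resolve_ipoib_netdev and its use are illustrative, not part of this patch:

    /* Hypothetical in-file helper: resolve a net_device and release it again. */
    static bool resolve_ipoib_netdev(struct ib_device *ibdev, u8 port,
                                     u16 pkey, const union ib_gid *gid,
                                     const struct sockaddr *addr,
                                     void *client_data)
    {
            struct net_device *ndev;

            ndev = ipoib_get_net_dev_by_params(ibdev, port, pkey, gid,
                                               addr, client_data);
            if (!ndev)
                    return false;   /* no device for this port/pkey/gid (or address) */

            /* ... use ndev while holding the reference ... */
            dev_put(ndev);          /* drop the reference taken for us */
            return true;
    }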
225int ipoib_set_mode(struct net_device *dev, const char *buf) 452int ipoib_set_mode(struct net_device *dev, const char *buf)
226{ 453{
227 struct ipoib_dev_priv *priv = netdev_priv(dev); 454 struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1715,12 +1942,11 @@ static void ipoib_add_one(struct ib_device *device)
1715 ib_set_client_data(device, &ipoib_client, dev_list); 1942 ib_set_client_data(device, &ipoib_client, dev_list);
1716} 1943}
1717 1944
1718static void ipoib_remove_one(struct ib_device *device) 1945static void ipoib_remove_one(struct ib_device *device, void *client_data)
1719{ 1946{
1720 struct ipoib_dev_priv *priv, *tmp; 1947 struct ipoib_dev_priv *priv, *tmp;
1721 struct list_head *dev_list; 1948 struct list_head *dev_list = client_data;
1722 1949
1723 dev_list = ib_get_client_data(device, &ipoib_client);
1724 if (!dev_list) 1950 if (!dev_list)
1725 return; 1951 return;
1726 1952
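With the remove() prototype change above, the core hands the per-device client data straight to the callback, so clients no longer need an ib_get_client_data() lookup during teardown. A minimal sketch of a client written against the new prototype; the names my_client/my_add/my_remove and the kmalloc'd payload are illustrative only (assumes <rdma/ib_verbs.h> and <linux/slab.h>):

    static struct ib_client my_client;

    static void my_add(struct ib_device *device)
    {
            struct list_head *dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);

            if (!dev_list)
                    return;
            INIT_LIST_HEAD(dev_list);
            ib_set_client_data(device, &my_client, dev_list);
    }

    static void my_remove(struct ib_device *device, void *client_data)
    {
            struct list_head *dev_list = client_data;   /* passed in directly */

            if (!dev_list)
                    return;
            kfree(dev_list);
    }

    static struct ib_client my_client = {
            .name   = "my-client",
            .add    = my_add,
            .remove = my_remove,
    };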
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 0d23e0568deb..09a1748f9d13 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -393,8 +393,13 @@ static int ipoib_mcast_join_complete(int status,
393 goto out_locked; 393 goto out_locked;
394 } 394 }
395 } else { 395 } else {
396 if (mcast->logcount++ < 20) { 396 bool silent_fail =
397 if (status == -ETIMEDOUT || status == -EAGAIN) { 397 test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
398 status == -EINVAL;
399
400 if (mcast->logcount < 20) {
401 if (status == -ETIMEDOUT || status == -EAGAIN ||
402 silent_fail) {
398 ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", 403 ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
399 test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", 404 test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
400 mcast->mcmember.mgid.raw, status); 405 mcast->mcmember.mgid.raw, status);
@@ -403,6 +408,9 @@ static int ipoib_mcast_join_complete(int status,
403 test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", 408 test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
404 mcast->mcmember.mgid.raw, status); 409 mcast->mcmember.mgid.raw, status);
405 } 410 }
411
412 if (!silent_fail)
413 mcast->logcount++;
406 } 414 }
407 415
408 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && 416 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
@@ -448,8 +456,7 @@ out_locked:
448 return status; 456 return status;
449} 457}
450 458
451static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, 459static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
452 int create)
453{ 460{
454 struct ipoib_dev_priv *priv = netdev_priv(dev); 461 struct ipoib_dev_priv *priv = netdev_priv(dev);
455 struct ib_sa_multicast *multicast; 462 struct ib_sa_multicast *multicast;
@@ -471,7 +478,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
471 IB_SA_MCMEMBER_REC_PKEY | 478 IB_SA_MCMEMBER_REC_PKEY |
472 IB_SA_MCMEMBER_REC_JOIN_STATE; 479 IB_SA_MCMEMBER_REC_JOIN_STATE;
473 480
474 if (create) { 481 if (mcast != priv->broadcast) {
482 /*
483 * RFC 4391:
484 * The MGID MUST use the same P_Key, Q_Key, SL, MTU,
485 * and HopLimit as those used in the broadcast-GID. The rest
486 * of attributes SHOULD follow the values used in the
487 * broadcast-GID as well.
488 */
475 comp_mask |= 489 comp_mask |=
476 IB_SA_MCMEMBER_REC_QKEY | 490 IB_SA_MCMEMBER_REC_QKEY |
477 IB_SA_MCMEMBER_REC_MTU_SELECTOR | 491 IB_SA_MCMEMBER_REC_MTU_SELECTOR |
@@ -492,6 +506,22 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
492 rec.sl = priv->broadcast->mcmember.sl; 506 rec.sl = priv->broadcast->mcmember.sl;
493 rec.flow_label = priv->broadcast->mcmember.flow_label; 507 rec.flow_label = priv->broadcast->mcmember.flow_label;
494 rec.hop_limit = priv->broadcast->mcmember.hop_limit; 508 rec.hop_limit = priv->broadcast->mcmember.hop_limit;
509
510 /*
511 * Historically Linux IPoIB has never properly supported SEND
512 * ONLY join. It emulated it by not providing all the required
513 * attributes, which is enough to prevent group creation and
514 * detect if there are full members or not. A major problem
515 * with supporting SEND ONLY is detecting when the group is
 516 * auto-destroyed as IPoIB will cache the MLID.
517 */
518#if 1
519 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
520 comp_mask &= ~IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
521#else
522 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
523 rec.join_state = 4;
524#endif
495 } 525 }
496 526
497 multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, 527 multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
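The #if block above keeps the emulation: clearing IB_SA_MCMEMBER_REC_TRAFFIC_CLASS from comp_mask leaves the record incomplete, which is enough to keep the SA from creating the group on a send-only join. The disabled branch shows what a real send-only join would request, where rec.join_state = 4 selects the SendOnlyNonMember bit of the MCMemberRecord JoinState field. For reference, the JoinState bits (values per the InfiniBand spec; the enum itself is illustrative and not part of this patch):

    enum {
            MCMEMBER_JOINSTATE_FULL_MEMBER     = 1 << 0,
            MCMEMBER_JOINSTATE_NON_MEMBER      = 1 << 1,
            MCMEMBER_JOINSTATE_SENDONLY_MEMBER = 1 << 2,    /* == 4, as used above */
    };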
@@ -517,7 +547,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
517 struct ib_port_attr port_attr; 547 struct ib_port_attr port_attr;
518 unsigned long delay_until = 0; 548 unsigned long delay_until = 0;
519 struct ipoib_mcast *mcast = NULL; 549 struct ipoib_mcast *mcast = NULL;
520 int create = 1;
521 550
522 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) 551 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
523 return; 552 return;
@@ -566,7 +595,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
566 if (IS_ERR_OR_NULL(priv->broadcast->mc) && 595 if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
567 !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { 596 !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
568 mcast = priv->broadcast; 597 mcast = priv->broadcast;
569 create = 0;
570 if (mcast->backoff > 1 && 598 if (mcast->backoff > 1 &&
571 time_before(jiffies, mcast->delay_until)) { 599 time_before(jiffies, mcast->delay_until)) {
572 delay_until = mcast->delay_until; 600 delay_until = mcast->delay_until;
@@ -590,12 +618,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
590 /* Found the next unjoined group */ 618 /* Found the next unjoined group */
591 init_completion(&mcast->done); 619 init_completion(&mcast->done);
592 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 620 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
593 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
594 create = 0;
595 else
596 create = 1;
597 spin_unlock_irq(&priv->lock); 621 spin_unlock_irq(&priv->lock);
598 ipoib_mcast_join(dev, mcast, create); 622 ipoib_mcast_join(dev, mcast);
599 spin_lock_irq(&priv->lock); 623 spin_lock_irq(&priv->lock);
600 } else if (!delay_until || 624 } else if (!delay_until ||
601 time_before(mcast->delay_until, delay_until)) 625 time_before(mcast->delay_until, delay_until))
@@ -618,7 +642,7 @@ out:
618 } 642 }
619 spin_unlock_irq(&priv->lock); 643 spin_unlock_irq(&priv->lock);
620 if (mcast) 644 if (mcast)
621 ipoib_mcast_join(dev, mcast, create); 645 ipoib_mcast_join(dev, mcast);
622} 646}
623 647
624int ipoib_mcast_start_thread(struct net_device *dev) 648int ipoib_mcast_start_thread(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 851c8219d501..78845b6e8b81 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -152,12 +152,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
152 return -ENODEV; 152 return -ENODEV;
153 } 153 }
154 154
155 priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
156 if (IS_ERR(priv->mr)) {
157 printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
158 goto out_free_pd;
159 }
160
161 /* 155 /*
162 * the various IPoIB tasks assume they will never race against 156 * the various IPoIB tasks assume they will never race against
163 * themselves, so always use a single thread workqueue 157 * themselves, so always use a single thread workqueue
@@ -165,7 +159,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
165 priv->wq = create_singlethread_workqueue("ipoib_wq"); 159 priv->wq = create_singlethread_workqueue("ipoib_wq");
166 if (!priv->wq) { 160 if (!priv->wq) {
167 printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); 161 printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
168 goto out_free_mr; 162 goto out_free_pd;
169 } 163 }
170 164
171 size = ipoib_recvq_size + 1; 165 size = ipoib_recvq_size + 1;
@@ -225,13 +219,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
225 priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; 219 priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff;
226 220
227 for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) 221 for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
228 priv->tx_sge[i].lkey = priv->mr->lkey; 222 priv->tx_sge[i].lkey = priv->pd->local_dma_lkey;
229 223
230 priv->tx_wr.opcode = IB_WR_SEND; 224 priv->tx_wr.opcode = IB_WR_SEND;
231 priv->tx_wr.sg_list = priv->tx_sge; 225 priv->tx_wr.sg_list = priv->tx_sge;
232 priv->tx_wr.send_flags = IB_SEND_SIGNALED; 226 priv->tx_wr.send_flags = IB_SEND_SIGNALED;
233 227
234 priv->rx_sge[0].lkey = priv->mr->lkey; 228 priv->rx_sge[0].lkey = priv->pd->local_dma_lkey;
235 229
236 priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); 230 priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
237 priv->rx_wr.num_sge = 1; 231 priv->rx_wr.num_sge = 1;
@@ -254,9 +248,6 @@ out_free_wq:
254 destroy_workqueue(priv->wq); 248 destroy_workqueue(priv->wq);
255 priv->wq = NULL; 249 priv->wq = NULL;
256 250
257out_free_mr:
258 ib_dereg_mr(priv->mr);
259
260out_free_pd: 251out_free_pd:
261 ib_dealloc_pd(priv->pd); 252 ib_dealloc_pd(priv->pd);
262 253
@@ -289,12 +280,7 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
289 priv->wq = NULL; 280 priv->wq = NULL;
290 } 281 }
291 282
292 if (ib_dereg_mr(priv->mr)) 283 ib_dealloc_pd(priv->pd);
293 ipoib_warn(priv, "ib_dereg_mr failed\n");
294
295 if (ib_dealloc_pd(priv->pd))
296 ipoib_warn(priv, "ib_dealloc_pd failed\n");
297
298} 284}
299 285
300void ipoib_event(struct ib_event_handler *handler, 286void ipoib_event(struct ib_event_handler *handler,
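This hunk is one instance of the series-wide removal of ib_get_dma_mr() for plain local access: every PD now carries local_dma_lkey, so a ULP can build SGEs for DMA-mapped buffers without allocating (and later deregistering) a catch-all MR. A minimal sketch of the resulting pattern; the helper name and buffer parameters are placeholders:

    /* Fill an SGE for an already DMA-mapped buffer using the PD-wide lkey. */
    static void fill_local_sge(struct ib_pd *pd, struct ib_sge *sge,
                               u64 dma_addr, u32 len)
    {
            sge->addr   = dma_addr;
            sge->length = len;
            sge->lkey   = pd->local_dma_lkey;   /* no per-device DMA MR needed */
    }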
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 6a594aac2290..1ace5d83a4d7 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -74,34 +74,37 @@
74 74
75#include "iscsi_iser.h" 75#include "iscsi_iser.h"
76 76
77MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
78MODULE_LICENSE("Dual BSD/GPL");
79MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
80MODULE_VERSION(DRV_VER);
81
77static struct scsi_host_template iscsi_iser_sht; 82static struct scsi_host_template iscsi_iser_sht;
78static struct iscsi_transport iscsi_iser_transport; 83static struct iscsi_transport iscsi_iser_transport;
79static struct scsi_transport_template *iscsi_iser_scsi_transport; 84static struct scsi_transport_template *iscsi_iser_scsi_transport;
80 85static struct workqueue_struct *release_wq;
81static unsigned int iscsi_max_lun = 512; 86struct iser_global ig;
82module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
83 87
84int iser_debug_level = 0; 88int iser_debug_level = 0;
85bool iser_pi_enable = false; 89module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR);
86int iser_pi_guard = 1; 90MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
87 91
88MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); 92static unsigned int iscsi_max_lun = 512;
89MODULE_LICENSE("Dual BSD/GPL"); 93module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
 90MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); 94MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512)");
91MODULE_VERSION(DRV_VER);
92 95
93module_param_named(debug_level, iser_debug_level, int, 0644); 96unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS;
94MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); 97module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR);
 98MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024)");
95 99
96module_param_named(pi_enable, iser_pi_enable, bool, 0644); 100bool iser_pi_enable = false;
101module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO);
97MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); 102MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
98 103
99module_param_named(pi_guard, iser_pi_guard, int, 0644); 104int iser_pi_guard;
105module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO);
100MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]"); 106MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
101 107
102static struct workqueue_struct *release_wq;
103struct iser_global ig;
104
105/* 108/*
106 * iscsi_iser_recv() - Process a successfull recv completion 109 * iscsi_iser_recv() - Process a successfull recv completion
107 * @conn: iscsi connection 110 * @conn: iscsi connection
@@ -201,10 +204,12 @@ iser_initialize_task_headers(struct iscsi_task *task,
201 goto out; 204 goto out;
202 } 205 }
203 206
207 tx_desc->wr_idx = 0;
208 tx_desc->mapped = true;
204 tx_desc->dma_addr = dma_addr; 209 tx_desc->dma_addr = dma_addr;
205 tx_desc->tx_sg[0].addr = tx_desc->dma_addr; 210 tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
206 tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; 211 tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
207 tx_desc->tx_sg[0].lkey = device->mr->lkey; 212 tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
208 213
209 iser_task->iser_conn = iser_conn; 214 iser_task->iser_conn = iser_conn;
210out: 215out:
@@ -360,16 +365,19 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
360static void iscsi_iser_cleanup_task(struct iscsi_task *task) 365static void iscsi_iser_cleanup_task(struct iscsi_task *task)
361{ 366{
362 struct iscsi_iser_task *iser_task = task->dd_data; 367 struct iscsi_iser_task *iser_task = task->dd_data;
363 struct iser_tx_desc *tx_desc = &iser_task->desc; 368 struct iser_tx_desc *tx_desc = &iser_task->desc;
364 struct iser_conn *iser_conn = task->conn->dd_data; 369 struct iser_conn *iser_conn = task->conn->dd_data;
365 struct iser_device *device = iser_conn->ib_conn.device; 370 struct iser_device *device = iser_conn->ib_conn.device;
366 371
367 /* DEVICE_REMOVAL event might have already released the device */ 372 /* DEVICE_REMOVAL event might have already released the device */
368 if (!device) 373 if (!device)
369 return; 374 return;
370 375
371 ib_dma_unmap_single(device->ib_device, 376 if (likely(tx_desc->mapped)) {
372 tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); 377 ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
378 ISER_HEADERS_LEN, DMA_TO_DEVICE);
379 tx_desc->mapped = false;
380 }
373 381
374 /* mgmt tasks do not need special cleanup */ 382 /* mgmt tasks do not need special cleanup */
375 if (!task->sc) 383 if (!task->sc)
@@ -622,6 +630,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
622 if (ep) { 630 if (ep) {
623 iser_conn = ep->dd_data; 631 iser_conn = ep->dd_data;
624 max_cmds = iser_conn->max_cmds; 632 max_cmds = iser_conn->max_cmds;
633 shost->sg_tablesize = iser_conn->scsi_sg_tablesize;
634 shost->max_sectors = iser_conn->scsi_max_sectors;
625 635
626 mutex_lock(&iser_conn->state_mutex); 636 mutex_lock(&iser_conn->state_mutex);
627 if (iser_conn->state != ISER_CONN_UP) { 637 if (iser_conn->state != ISER_CONN_UP) {
@@ -640,6 +650,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
640 SHOST_DIX_GUARD_CRC); 650 SHOST_DIX_GUARD_CRC);
641 } 651 }
642 652
653 /*
654 * Limit the sg_tablesize and max_sectors based on the device
655 * max fastreg page list length.
656 */
657 shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
658 ib_conn->device->dev_attr.max_fast_reg_page_list_len);
659 shost->max_sectors = min_t(unsigned int,
660 1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);
661
643 if (iscsi_host_add(shost, 662 if (iscsi_host_add(shost,
644 ib_conn->device->ib_device->dma_device)) { 663 ib_conn->device->ib_device->dma_device)) {
645 mutex_unlock(&iser_conn->state_mutex); 664 mutex_unlock(&iser_conn->state_mutex);
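The clamp above keeps the two SCSI limits consistent: max_sectors is derived from sg_tablesize so the largest command still fits in a single fast registration. A hedged worked example, assuming 4 KB pages: with sg_tablesize capped at 128, (128 * 4096) >> 9 = 1024 sectors, i.e. the 512 KB default I/O size; a device reporting a smaller max_fast_reg_page_list_len shrinks both values accordingly.

    /* Worked example (assumes PAGE_SIZE == 4096; numbers are illustrative): */
    unsigned short sg_tablesize = 128;                          /* after the min_t() clamp */
    unsigned int   max_sectors  = (sg_tablesize * 4096) >> 9;   /* 1024 sectors = 512 KB   */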
@@ -742,15 +761,9 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s
742 stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ 761 stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
743 stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; 762 stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
744 stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; 763 stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
745 stats->custom_length = 4; 764 stats->custom_length = 1;
746 strcpy(stats->custom[0].desc, "qp_tx_queue_full"); 765 strcpy(stats->custom[0].desc, "fmr_unalign_cnt");
747 stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */ 766 stats->custom[0].value = conn->fmr_unalign_cnt;
748 strcpy(stats->custom[1].desc, "fmr_map_not_avail");
749 stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */;
750 strcpy(stats->custom[2].desc, "eh_abort_cnt");
751 stats->custom[2].value = conn->eh_abort_cnt;
752 strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
753 stats->custom[3].value = conn->fmr_unalign_cnt;
754} 767}
755 768
756static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, 769static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
@@ -839,10 +852,9 @@ failure:
839static int 852static int
840iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) 853iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
841{ 854{
842 struct iser_conn *iser_conn; 855 struct iser_conn *iser_conn = ep->dd_data;
843 int rc; 856 int rc;
844 857
845 iser_conn = ep->dd_data;
846 rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, 858 rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion,
847 msecs_to_jiffies(timeout_ms)); 859 msecs_to_jiffies(timeout_ms));
848 /* if conn establishment failed, return error code to iscsi */ 860 /* if conn establishment failed, return error code to iscsi */
@@ -854,7 +866,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
854 mutex_unlock(&iser_conn->state_mutex); 866 mutex_unlock(&iser_conn->state_mutex);
855 } 867 }
856 868
857 iser_info("ib conn %p rc = %d\n", iser_conn, rc); 869 iser_info("iser conn %p rc = %d\n", iser_conn, rc);
858 870
859 if (rc > 0) 871 if (rc > 0)
860 return 1; /* success, this is the equivalent of POLLOUT */ 872 return 1; /* success, this is the equivalent of POLLOUT */
@@ -876,11 +888,9 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
876static void 888static void
877iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) 889iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
878{ 890{
879 struct iser_conn *iser_conn; 891 struct iser_conn *iser_conn = ep->dd_data;
880 892
881 iser_conn = ep->dd_data; 893 iser_info("ep %p iser conn %p\n", ep, iser_conn);
882 iser_info("ep %p iser conn %p state %d\n",
883 ep, iser_conn, iser_conn->state);
884 894
885 mutex_lock(&iser_conn->state_mutex); 895 mutex_lock(&iser_conn->state_mutex);
886 iser_conn_terminate(iser_conn); 896 iser_conn_terminate(iser_conn);
@@ -900,6 +910,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
900 mutex_unlock(&iser_conn->state_mutex); 910 mutex_unlock(&iser_conn->state_mutex);
901 iser_conn_release(iser_conn); 911 iser_conn_release(iser_conn);
902 } 912 }
913
903 iscsi_destroy_endpoint(ep); 914 iscsi_destroy_endpoint(ep);
904} 915}
905 916
@@ -962,8 +973,8 @@ static struct scsi_host_template iscsi_iser_sht = {
962 .name = "iSCSI Initiator over iSER", 973 .name = "iSCSI Initiator over iSER",
963 .queuecommand = iscsi_queuecommand, 974 .queuecommand = iscsi_queuecommand,
964 .change_queue_depth = scsi_change_queue_depth, 975 .change_queue_depth = scsi_change_queue_depth,
965 .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, 976 .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE,
966 .max_sectors = 1024, 977 .max_sectors = ISER_DEF_MAX_SECTORS,
967 .cmd_per_lun = ISER_DEF_CMD_PER_LUN, 978 .cmd_per_lun = ISER_DEF_CMD_PER_LUN,
968 .eh_abort_handler = iscsi_eh_abort, 979 .eh_abort_handler = iscsi_eh_abort,
969 .eh_device_reset_handler= iscsi_eh_device_reset, 980 .eh_device_reset_handler= iscsi_eh_device_reset,
@@ -1074,7 +1085,7 @@ static void __exit iser_exit(void)
1074 1085
1075 if (!connlist_empty) { 1086 if (!connlist_empty) {
1076 iser_err("Error cleanup stage completed but we still have iser " 1087 iser_err("Error cleanup stage completed but we still have iser "
1077 "connections, destroying them anyway.\n"); 1088 "connections, destroying them anyway\n");
1078 list_for_each_entry_safe(iser_conn, n, &ig.connlist, 1089 list_for_each_entry_safe(iser_conn, n, &ig.connlist,
1079 conn_list) { 1090 conn_list) {
1080 iser_conn_release(iser_conn); 1091 iser_conn_release(iser_conn);
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 262ba1f8ee50..86f6583485ef 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -98,8 +98,13 @@
98#define SHIFT_4K 12 98#define SHIFT_4K 12
99#define SIZE_4K (1ULL << SHIFT_4K) 99#define SIZE_4K (1ULL << SHIFT_4K)
100#define MASK_4K (~(SIZE_4K-1)) 100#define MASK_4K (~(SIZE_4K-1))
101 /* support up to 512KB in one RDMA */ 101
102#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) 102/* Default support is 512KB I/O size */
103#define ISER_DEF_MAX_SECTORS 1024
104#define ISCSI_ISER_DEF_SG_TABLESIZE ((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K)
105/* Maximum support is 8MB I/O size */
106#define ISCSI_ISER_MAX_SG_TABLESIZE ((16384 * 512) >> SHIFT_4K)
107
103#define ISER_DEF_XMIT_CMDS_DEFAULT 512 108#define ISER_DEF_XMIT_CMDS_DEFAULT 512
104#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT 109#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
105 #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX 110 #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX
@@ -239,6 +244,7 @@ struct iser_data_buf {
239struct iser_device; 244struct iser_device;
240struct iscsi_iser_task; 245struct iscsi_iser_task;
241struct iscsi_endpoint; 246struct iscsi_endpoint;
247struct iser_reg_resources;
242 248
243/** 249/**
244 * struct iser_mem_reg - iSER memory registration info 250 * struct iser_mem_reg - iSER memory registration info
@@ -259,6 +265,14 @@ enum iser_desc_type {
259 ISCSI_TX_DATAOUT 265 ISCSI_TX_DATAOUT
260}; 266};
261 267
268/* Maximum number of work requests per task:
269 * Data memory region local invalidate + fast registration
270 * Protection memory region local invalidate + fast registration
271 * Signature memory region local invalidate + fast registration
272 * PDU send
273 */
274#define ISER_MAX_WRS 7
275
262/** 276/**
263 * struct iser_tx_desc - iSER TX descriptor (for send wr_id) 277 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
264 * 278 *
@@ -270,6 +284,12 @@ enum iser_desc_type {
270 * sg[1] optionally points to either of immediate data 284 * sg[1] optionally points to either of immediate data
271 * unsolicited data-out or control 285 * unsolicited data-out or control
272 * @num_sge: number sges used on this TX task 286 * @num_sge: number sges used on this TX task
287 * @mapped: Is the task header mapped
288 * @wr_idx: Current WR index
289 * @wrs: Array of WRs per task
290 * @data_reg: Data buffer registration details
291 * @prot_reg: Protection buffer registration details
292 * @sig_attrs: Signature attributes
273 */ 293 */
274struct iser_tx_desc { 294struct iser_tx_desc {
275 struct iser_hdr iser_header; 295 struct iser_hdr iser_header;
@@ -278,6 +298,12 @@ struct iser_tx_desc {
278 u64 dma_addr; 298 u64 dma_addr;
279 struct ib_sge tx_sg[2]; 299 struct ib_sge tx_sg[2];
280 int num_sge; 300 int num_sge;
301 bool mapped;
302 u8 wr_idx;
303 struct ib_send_wr wrs[ISER_MAX_WRS];
304 struct iser_mem_reg data_reg;
305 struct iser_mem_reg prot_reg;
306 struct ib_sig_attrs sig_attrs;
281}; 307};
282 308
283#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ 309#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
@@ -324,6 +350,33 @@ struct iser_comp {
324}; 350};
325 351
326/** 352/**
 353 * struct iser_reg_ops - Memory registration operations
354 * per-device registration schemes
355 *
356 * @alloc_reg_res: Allocate registration resources
357 * @free_reg_res: Free registration resources
 358 * @reg_mem: Register memory buffers
359 * @unreg_mem: Un-register memory buffers
 360 * @reg_desc_get: Get a registration descriptor from the pool
 361 * @reg_desc_put: Return a registration descriptor to the pool
362 */
363struct iser_reg_ops {
364 int (*alloc_reg_res)(struct ib_conn *ib_conn,
365 unsigned cmds_max,
366 unsigned int size);
367 void (*free_reg_res)(struct ib_conn *ib_conn);
368 int (*reg_mem)(struct iscsi_iser_task *iser_task,
369 struct iser_data_buf *mem,
370 struct iser_reg_resources *rsc,
371 struct iser_mem_reg *reg);
372 void (*unreg_mem)(struct iscsi_iser_task *iser_task,
373 enum iser_data_dir cmd_dir);
374 struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn);
375 void (*reg_desc_put)(struct ib_conn *ib_conn,
376 struct iser_fr_desc *desc);
377};
378
379/**
327 * struct iser_device - iSER device handle 380 * struct iser_device - iSER device handle
328 * 381 *
329 * @ib_device: RDMA device 382 * @ib_device: RDMA device
@@ -336,11 +389,7 @@ struct iser_comp {
336 * @comps_used: Number of completion contexts used, Min between online 389 * @comps_used: Number of completion contexts used, Min between online
337 * cpus and device max completion vectors 390 * cpus and device max completion vectors
 338 * @comps: Dynamically allocated array of completion handlers 391
339 * Memory registration pool Function pointers (FMR or Fastreg): 392 * @reg_ops: Registration ops
340 * @iser_alloc_rdma_reg_res: Allocation of memory regions pool
341 * @iser_free_rdma_reg_res: Free of memory regions pool
342 * @iser_reg_rdma_mem: Memory registration routine
343 * @iser_unreg_rdma_mem: Memory deregistration routine
344 */ 393 */
345struct iser_device { 394struct iser_device {
346 struct ib_device *ib_device; 395 struct ib_device *ib_device;
@@ -352,54 +401,73 @@ struct iser_device {
352 int refcount; 401 int refcount;
353 int comps_used; 402 int comps_used;
354 struct iser_comp *comps; 403 struct iser_comp *comps;
355 int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, 404 struct iser_reg_ops *reg_ops;
356 unsigned cmds_max);
357 void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
358 int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
359 enum iser_data_dir cmd_dir);
360 void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
361 enum iser_data_dir cmd_dir);
362}; 405};
363 406
364#define ISER_CHECK_GUARD 0xc0 407#define ISER_CHECK_GUARD 0xc0
365#define ISER_CHECK_REFTAG 0x0f 408#define ISER_CHECK_REFTAG 0x0f
366#define ISER_CHECK_APPTAG 0x30 409#define ISER_CHECK_APPTAG 0x30
367 410
368enum iser_reg_indicator { 411/**
369 ISER_DATA_KEY_VALID = 1 << 0, 412 * struct iser_reg_resources - Fast registration recources
370 ISER_PROT_KEY_VALID = 1 << 1, 413 *
371 ISER_SIG_KEY_VALID = 1 << 2, 414 * @mr: memory region
372 ISER_FASTREG_PROTECTED = 1 << 3, 415 * @fmr_pool: pool of fmrs
416 * @frpl: fast reg page list used by frwrs
417 * @page_vec: fast reg page list used by fmr pool
418 * @mr_valid: is mr valid indicator
419 */
420struct iser_reg_resources {
421 union {
422 struct ib_mr *mr;
423 struct ib_fmr_pool *fmr_pool;
424 };
425 union {
426 struct ib_fast_reg_page_list *frpl;
427 struct iser_page_vec *page_vec;
428 };
429 u8 mr_valid:1;
373}; 430};
374 431
375/** 432/**
376 * struct iser_pi_context - Protection information context 433 * struct iser_pi_context - Protection information context
377 * 434 *
378 * @prot_mr: protection memory region 435 * @rsc: protection buffer registration resources
379 * @prot_frpl: protection fastreg page list 436 * @sig_mr: signature enable memory region
380 * @sig_mr: signature feature enabled memory region 437 * @sig_mr_valid: is sig_mr valid indicator
438 * @sig_protected: is region protected indicator
381 */ 439 */
382struct iser_pi_context { 440struct iser_pi_context {
383 struct ib_mr *prot_mr; 441 struct iser_reg_resources rsc;
384 struct ib_fast_reg_page_list *prot_frpl;
385 struct ib_mr *sig_mr; 442 struct ib_mr *sig_mr;
443 u8 sig_mr_valid:1;
444 u8 sig_protected:1;
386}; 445};
387 446
388/** 447/**
389 * struct fast_reg_descriptor - Fast registration descriptor 448 * struct iser_fr_desc - Fast registration descriptor
390 * 449 *
391 * @list: entry in connection fastreg pool 450 * @list: entry in connection fastreg pool
392 * @data_mr: data memory region 451 * @rsc: data buffer registration resources
393 * @data_frpl: data fastreg page list
394 * @pi_ctx: protection information context 452 * @pi_ctx: protection information context
395 * @reg_indicators: fast registration indicators
396 */ 453 */
397struct fast_reg_descriptor { 454struct iser_fr_desc {
398 struct list_head list; 455 struct list_head list;
399 struct ib_mr *data_mr; 456 struct iser_reg_resources rsc;
400 struct ib_fast_reg_page_list *data_frpl;
401 struct iser_pi_context *pi_ctx; 457 struct iser_pi_context *pi_ctx;
402 u8 reg_indicators; 458};
459
460/**
461 * struct iser_fr_pool: connection fast registration pool
462 *
463 * @list: list of fastreg descriptors
464 * @lock: protects fmr/fastreg pool
465 * @size: size of the pool
466 */
467struct iser_fr_pool {
468 struct list_head list;
469 spinlock_t lock;
470 int size;
403}; 471};
404 472
405/** 473/**
@@ -415,15 +483,7 @@ struct fast_reg_descriptor {
415 * @pi_support: Indicate device T10-PI support 483 * @pi_support: Indicate device T10-PI support
416 * @beacon: beacon send wr to signal all flush errors were drained 484 * @beacon: beacon send wr to signal all flush errors were drained
417 * @flush_comp: completes when all connection completions consumed 485 * @flush_comp: completes when all connection completions consumed
 418 * @lock: protects fmr/fastreg pool 486 * @fr_pool: connection fast registration pool
419 * @union.fmr:
420 * @pool: FMR pool for fast registrations
421 * @page_vec: page vector to hold mapped commands pages
422 * used for registration
423 * @union.fastreg:
424 * @pool: Fast registration descriptors pool for fast
425 * registrations
426 * @pool_size: Size of pool
427 */ 487 */
428struct ib_conn { 488struct ib_conn {
429 struct rdma_cm_id *cma_id; 489 struct rdma_cm_id *cma_id;
@@ -436,17 +496,7 @@ struct ib_conn {
436 bool pi_support; 496 bool pi_support;
437 struct ib_send_wr beacon; 497 struct ib_send_wr beacon;
438 struct completion flush_comp; 498 struct completion flush_comp;
439 spinlock_t lock; 499 struct iser_fr_pool fr_pool;
440 union {
441 struct {
442 struct ib_fmr_pool *pool;
443 struct iser_page_vec *page_vec;
444 } fmr;
445 struct {
446 struct list_head pool;
447 int pool_size;
448 } fastreg;
449 };
450}; 500};
451 501
452/** 502/**
@@ -477,6 +527,8 @@ struct ib_conn {
477 * @rx_desc_head: head of rx_descs cyclic buffer 527 * @rx_desc_head: head of rx_descs cyclic buffer
478 * @rx_descs: rx buffers array (cyclic buffer) 528 * @rx_descs: rx buffers array (cyclic buffer)
479 * @num_rx_descs: number of rx descriptors 529 * @num_rx_descs: number of rx descriptors
530 * @scsi_sg_tablesize: scsi host sg_tablesize
531 * @scsi_max_sectors: scsi host max sectors
480 */ 532 */
481struct iser_conn { 533struct iser_conn {
482 struct ib_conn ib_conn; 534 struct ib_conn ib_conn;
@@ -501,6 +553,8 @@ struct iser_conn {
501 unsigned int rx_desc_head; 553 unsigned int rx_desc_head;
502 struct iser_rx_desc *rx_descs; 554 struct iser_rx_desc *rx_descs;
503 u32 num_rx_descs; 555 u32 num_rx_descs;
556 unsigned short scsi_sg_tablesize;
557 unsigned int scsi_max_sectors;
504}; 558};
505 559
506/** 560/**
@@ -556,6 +610,9 @@ extern struct iser_global ig;
556extern int iser_debug_level; 610extern int iser_debug_level;
557extern bool iser_pi_enable; 611extern bool iser_pi_enable;
558extern int iser_pi_guard; 612extern int iser_pi_guard;
613extern unsigned int iser_max_sectors;
614
615int iser_assign_reg_ops(struct iser_device *device);
559 616
560int iser_send_control(struct iscsi_conn *conn, 617int iser_send_control(struct iscsi_conn *conn,
561 struct iscsi_task *task); 618 struct iscsi_task *task);
@@ -597,10 +654,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
597 struct iser_data_buf *mem, 654 struct iser_data_buf *mem,
598 enum iser_data_dir cmd_dir); 655 enum iser_data_dir cmd_dir);
599 656
600int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, 657int iser_reg_rdma_mem(struct iscsi_iser_task *task,
601 enum iser_data_dir cmd_dir); 658 enum iser_data_dir dir);
602int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, 659void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
603 enum iser_data_dir cmd_dir); 660 enum iser_data_dir dir);
604 661
605int iser_connect(struct iser_conn *iser_conn, 662int iser_connect(struct iser_conn *iser_conn,
606 struct sockaddr *src_addr, 663 struct sockaddr *src_addr,
@@ -630,15 +687,40 @@ int iser_initialize_task_headers(struct iscsi_task *task,
630 struct iser_tx_desc *tx_desc); 687 struct iser_tx_desc *tx_desc);
631int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, 688int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
632 struct iscsi_session *session); 689 struct iscsi_session *session);
633int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max); 690int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
691 unsigned cmds_max,
692 unsigned int size);
634void iser_free_fmr_pool(struct ib_conn *ib_conn); 693void iser_free_fmr_pool(struct ib_conn *ib_conn);
635int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); 694int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
695 unsigned cmds_max,
696 unsigned int size);
636void iser_free_fastreg_pool(struct ib_conn *ib_conn); 697void iser_free_fastreg_pool(struct ib_conn *ib_conn);
637u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, 698u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
638 enum iser_data_dir cmd_dir, sector_t *sector); 699 enum iser_data_dir cmd_dir, sector_t *sector);
639struct fast_reg_descriptor * 700struct iser_fr_desc *
640iser_reg_desc_get(struct ib_conn *ib_conn); 701iser_reg_desc_get_fr(struct ib_conn *ib_conn);
641void 702void
642iser_reg_desc_put(struct ib_conn *ib_conn, 703iser_reg_desc_put_fr(struct ib_conn *ib_conn,
643 struct fast_reg_descriptor *desc); 704 struct iser_fr_desc *desc);
705struct iser_fr_desc *
706iser_reg_desc_get_fmr(struct ib_conn *ib_conn);
707void
708iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
709 struct iser_fr_desc *desc);
710
711static inline struct ib_send_wr *
712iser_tx_next_wr(struct iser_tx_desc *tx_desc)
713{
714 struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx];
715 struct ib_send_wr *last_wr;
716
717 if (tx_desc->wr_idx) {
718 last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1];
719 last_wr->next = cur_wr;
720 }
721 tx_desc->wr_idx++;
722
723 return cur_wr;
724}
725
644#endif 726#endif
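The new iser_tx_next_wr() helper hands out the next slot in the per-task wrs[] array and links it to the previous one, so a task's local-invalidate, fast-registration and send WRs can be posted as a single chain (bounded by ISER_MAX_WRS). A hedged usage sketch; WR payload setup and error handling are omitted and the function name is illustrative:

    static int post_task_chain(struct ib_qp *qp, struct iser_tx_desc *tx_desc)
    {
            struct ib_send_wr *inv_wr  = iser_tx_next_wr(tx_desc);  /* head: wrs[0]      */
            struct ib_send_wr *reg_wr  = iser_tx_next_wr(tx_desc);  /* chained after inv */
            struct ib_send_wr *send_wr = iser_tx_next_wr(tx_desc);  /* chained after reg */
            struct ib_send_wr *bad_wr;

            /* Illustrative opcodes only; real setup fills keys, page lists, SGEs, ... */
            inv_wr->opcode  = IB_WR_LOCAL_INV;
            reg_wr->opcode  = IB_WR_FAST_REG_MR;
            send_wr->opcode = IB_WR_SEND;
            send_wr->next   = NULL;                 /* terminate the chain */

            return ib_post_send(qp, &tx_desc->wrs[0], &bad_wr);
    }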
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 3e2118e8ed87..d511879d8cdf 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -49,7 +49,6 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
49 49
50{ 50{
51 struct iscsi_iser_task *iser_task = task->dd_data; 51 struct iscsi_iser_task *iser_task = task->dd_data;
52 struct iser_device *device = iser_task->iser_conn->ib_conn.device;
53 struct iser_mem_reg *mem_reg; 52 struct iser_mem_reg *mem_reg;
54 int err; 53 int err;
55 struct iser_hdr *hdr = &iser_task->desc.iser_header; 54 struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -73,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
73 return err; 72 return err;
74 } 73 }
75 74
76 err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); 75 err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
77 if (err) { 76 if (err) {
78 iser_err("Failed to set up Data-IN RDMA\n"); 77 iser_err("Failed to set up Data-IN RDMA\n");
79 return err; 78 return err;
@@ -103,7 +102,6 @@ iser_prepare_write_cmd(struct iscsi_task *task,
103 unsigned int edtl) 102 unsigned int edtl)
104{ 103{
105 struct iscsi_iser_task *iser_task = task->dd_data; 104 struct iscsi_iser_task *iser_task = task->dd_data;
106 struct iser_device *device = iser_task->iser_conn->ib_conn.device;
107 struct iser_mem_reg *mem_reg; 105 struct iser_mem_reg *mem_reg;
108 int err; 106 int err;
109 struct iser_hdr *hdr = &iser_task->desc.iser_header; 107 struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -128,7 +126,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
128 return err; 126 return err;
129 } 127 }
130 128
131 err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); 129 err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
132 if (err != 0) { 130 if (err != 0) {
133 iser_err("Failed to register write cmd RDMA mem\n"); 131 iser_err("Failed to register write cmd RDMA mem\n");
134 return err; 132 return err;
@@ -170,13 +168,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn,
170 168
171 memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); 169 memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
172 tx_desc->iser_header.flags = ISER_VER; 170 tx_desc->iser_header.flags = ISER_VER;
173
174 tx_desc->num_sge = 1; 171 tx_desc->num_sge = 1;
175
176 if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
177 tx_desc->tx_sg[0].lkey = device->mr->lkey;
178 iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
179 }
180} 172}
181 173
182static void iser_free_login_buf(struct iser_conn *iser_conn) 174static void iser_free_login_buf(struct iser_conn *iser_conn)
@@ -266,7 +258,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
266 iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ 258 iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
267 iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; 259 iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
268 260
269 if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) 261 if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
262 iser_conn->scsi_sg_tablesize))
270 goto create_rdma_reg_res_failed; 263 goto create_rdma_reg_res_failed;
271 264
272 if (iser_alloc_login_buf(iser_conn)) 265 if (iser_alloc_login_buf(iser_conn))
@@ -291,7 +284,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
291 rx_sg = &rx_desc->rx_sg; 284 rx_sg = &rx_desc->rx_sg;
292 rx_sg->addr = rx_desc->dma_addr; 285 rx_sg->addr = rx_desc->dma_addr;
293 rx_sg->length = ISER_RX_PAYLOAD_SIZE; 286 rx_sg->length = ISER_RX_PAYLOAD_SIZE;
294 rx_sg->lkey = device->mr->lkey; 287 rx_sg->lkey = device->pd->local_dma_lkey;
295 } 288 }
296 289
297 iser_conn->rx_desc_head = 0; 290 iser_conn->rx_desc_head = 0;
@@ -307,7 +300,7 @@ rx_desc_dma_map_failed:
307rx_desc_alloc_fail: 300rx_desc_alloc_fail:
308 iser_free_login_buf(iser_conn); 301 iser_free_login_buf(iser_conn);
309alloc_login_buf_fail: 302alloc_login_buf_fail:
310 device->iser_free_rdma_reg_res(ib_conn); 303 device->reg_ops->free_reg_res(ib_conn);
311create_rdma_reg_res_failed: 304create_rdma_reg_res_failed:
312 iser_err("failed allocating rx descriptors / data buffers\n"); 305 iser_err("failed allocating rx descriptors / data buffers\n");
313 return -ENOMEM; 306 return -ENOMEM;
@@ -320,8 +313,8 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
320 struct ib_conn *ib_conn = &iser_conn->ib_conn; 313 struct ib_conn *ib_conn = &iser_conn->ib_conn;
321 struct iser_device *device = ib_conn->device; 314 struct iser_device *device = ib_conn->device;
322 315
323 if (device->iser_free_rdma_reg_res) 316 if (device->reg_ops->free_reg_res)
324 device->iser_free_rdma_reg_res(ib_conn); 317 device->reg_ops->free_reg_res(ib_conn);
325 318
326 rx_desc = iser_conn->rx_descs; 319 rx_desc = iser_conn->rx_descs;
327 for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) 320 for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
@@ -454,7 +447,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
454 unsigned long buf_offset; 447 unsigned long buf_offset;
455 unsigned long data_seg_len; 448 unsigned long data_seg_len;
456 uint32_t itt; 449 uint32_t itt;
457 int err = 0; 450 int err;
458 struct ib_sge *tx_dsg; 451 struct ib_sge *tx_dsg;
459 452
460 itt = (__force uint32_t)hdr->itt; 453 itt = (__force uint32_t)hdr->itt;
@@ -475,7 +468,9 @@ int iser_send_data_out(struct iscsi_conn *conn,
475 memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); 468 memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
476 469
477 /* build the tx desc */ 470 /* build the tx desc */
478 iser_initialize_task_headers(task, tx_desc); 471 err = iser_initialize_task_headers(task, tx_desc);
472 if (err)
473 goto send_data_out_error;
479 474
480 mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; 475 mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
481 tx_dsg = &tx_desc->tx_sg[1]; 476 tx_dsg = &tx_desc->tx_sg[1];
@@ -502,7 +497,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
502 497
503send_data_out_error: 498send_data_out_error:
504 kmem_cache_free(ig.desc_cache, tx_desc); 499 kmem_cache_free(ig.desc_cache, tx_desc);
505 iser_err("conn %p failed err %d\n",conn, err); 500 iser_err("conn %p failed err %d\n", conn, err);
506 return err; 501 return err;
507} 502}
508 503
@@ -543,7 +538,7 @@ int iser_send_control(struct iscsi_conn *conn,
543 538
544 tx_dsg->addr = iser_conn->login_req_dma; 539 tx_dsg->addr = iser_conn->login_req_dma;
545 tx_dsg->length = task->data_count; 540 tx_dsg->length = task->data_count;
546 tx_dsg->lkey = device->mr->lkey; 541 tx_dsg->lkey = device->pd->local_dma_lkey;
547 mdesc->num_sge = 2; 542 mdesc->num_sge = 2;
548 } 543 }
549 544
@@ -666,7 +661,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
666 661
667void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) 662void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
668{ 663{
669 struct iser_device *device = iser_task->iser_conn->ib_conn.device;
670 int is_rdma_data_aligned = 1; 664 int is_rdma_data_aligned = 1;
671 int is_rdma_prot_aligned = 1; 665 int is_rdma_prot_aligned = 1;
672 int prot_count = scsi_prot_sg_count(iser_task->sc); 666 int prot_count = scsi_prot_sg_count(iser_task->sc);
@@ -703,7 +697,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
703 } 697 }
704 698
705 if (iser_task->dir[ISER_DIR_IN]) { 699 if (iser_task->dir[ISER_DIR_IN]) {
706 device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); 700 iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
707 if (is_rdma_data_aligned) 701 if (is_rdma_data_aligned)
708 iser_dma_unmap_task_data(iser_task, 702 iser_dma_unmap_task_data(iser_task,
709 &iser_task->data[ISER_DIR_IN], 703 &iser_task->data[ISER_DIR_IN],
@@ -715,7 +709,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
715 } 709 }
716 710
717 if (iser_task->dir[ISER_DIR_OUT]) { 711 if (iser_task->dir[ISER_DIR_OUT]) {
718 device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); 712 iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
719 if (is_rdma_data_aligned) 713 if (is_rdma_data_aligned)
720 iser_dma_unmap_task_data(iser_task, 714 iser_dma_unmap_task_data(iser_task,
721 &iser_task->data[ISER_DIR_OUT], 715 &iser_task->data[ISER_DIR_OUT],
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index f0cdc961eb11..2493cc748db8 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -38,6 +38,55 @@
38#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
39 39
40#include "iscsi_iser.h" 40#include "iscsi_iser.h"
41static
42int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
43 struct iser_data_buf *mem,
44 struct iser_reg_resources *rsc,
45 struct iser_mem_reg *mem_reg);
46static
47int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
48 struct iser_data_buf *mem,
49 struct iser_reg_resources *rsc,
50 struct iser_mem_reg *mem_reg);
51
52static struct iser_reg_ops fastreg_ops = {
53 .alloc_reg_res = iser_alloc_fastreg_pool,
54 .free_reg_res = iser_free_fastreg_pool,
55 .reg_mem = iser_fast_reg_mr,
56 .unreg_mem = iser_unreg_mem_fastreg,
57 .reg_desc_get = iser_reg_desc_get_fr,
58 .reg_desc_put = iser_reg_desc_put_fr,
59};
60
61static struct iser_reg_ops fmr_ops = {
62 .alloc_reg_res = iser_alloc_fmr_pool,
63 .free_reg_res = iser_free_fmr_pool,
64 .reg_mem = iser_fast_reg_fmr,
65 .unreg_mem = iser_unreg_mem_fmr,
66 .reg_desc_get = iser_reg_desc_get_fmr,
67 .reg_desc_put = iser_reg_desc_put_fmr,
68};
69
70int iser_assign_reg_ops(struct iser_device *device)
71{
72 struct ib_device_attr *dev_attr = &device->dev_attr;
73
74 /* Assign function handles - based on FMR support */
75 if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
76 device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
77 iser_info("FMR supported, using FMR for registration\n");
78 device->reg_ops = &fmr_ops;
79 } else
80 if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
81 iser_info("FastReg supported, using FastReg for registration\n");
82 device->reg_ops = &fastreg_ops;
83 } else {
84 iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
85 return -1;
86 }
87
88 return 0;
89}
41 90
42static void 91static void
43iser_free_bounce_sg(struct iser_data_buf *data) 92iser_free_bounce_sg(struct iser_data_buf *data)
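With iser_assign_reg_ops() installing either fmr_ops or fastreg_ops at device init, the rest of the driver calls through device->reg_ops instead of branching on the registration mode. A hedged sketch of the resulting call site; the function name and the cmds_max/page-count values are placeholders:

    static int setup_reg_resources(struct iser_device *device,
                                   struct ib_conn *ib_conn)
    {
            int ret;

            ret = iser_assign_reg_ops(device);
            if (ret)
                    return ret;     /* neither FMR nor FastReg is available */

            /* allocate a per-connection registration pool sized for the session */
            return device->reg_ops->alloc_reg_res(ib_conn, 128, 128);
    }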
@@ -146,30 +195,47 @@ iser_copy_to_bounce(struct iser_data_buf *data)
146 iser_copy_bounce(data, true); 195 iser_copy_bounce(data, true);
147} 196}
148 197
149struct fast_reg_descriptor * 198struct iser_fr_desc *
150iser_reg_desc_get(struct ib_conn *ib_conn) 199iser_reg_desc_get_fr(struct ib_conn *ib_conn)
151{ 200{
152 struct fast_reg_descriptor *desc; 201 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
202 struct iser_fr_desc *desc;
153 unsigned long flags; 203 unsigned long flags;
154 204
155 spin_lock_irqsave(&ib_conn->lock, flags); 205 spin_lock_irqsave(&fr_pool->lock, flags);
156 desc = list_first_entry(&ib_conn->fastreg.pool, 206 desc = list_first_entry(&fr_pool->list,
157 struct fast_reg_descriptor, list); 207 struct iser_fr_desc, list);
158 list_del(&desc->list); 208 list_del(&desc->list);
159 spin_unlock_irqrestore(&ib_conn->lock, flags); 209 spin_unlock_irqrestore(&fr_pool->lock, flags);
160 210
161 return desc; 211 return desc;
162} 212}
163 213
164void 214void
165iser_reg_desc_put(struct ib_conn *ib_conn, 215iser_reg_desc_put_fr(struct ib_conn *ib_conn,
166 struct fast_reg_descriptor *desc) 216 struct iser_fr_desc *desc)
167{ 217{
218 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
168 unsigned long flags; 219 unsigned long flags;
169 220
170 spin_lock_irqsave(&ib_conn->lock, flags); 221 spin_lock_irqsave(&fr_pool->lock, flags);
171 list_add(&desc->list, &ib_conn->fastreg.pool); 222 list_add(&desc->list, &fr_pool->list);
172 spin_unlock_irqrestore(&ib_conn->lock, flags); 223 spin_unlock_irqrestore(&fr_pool->lock, flags);
224}
225
226struct iser_fr_desc *
227iser_reg_desc_get_fmr(struct ib_conn *ib_conn)
228{
229 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
230
231 return list_first_entry(&fr_pool->list,
232 struct iser_fr_desc, list);
233}
234
235void
236iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
237 struct iser_fr_desc *desc)
238{
173} 239}
174 240
175/** 241/**
@@ -297,7 +363,8 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,
 297 * consecutive SG elements are actually fragments of the same physical page. 363
298 */ 364 */
299static int iser_data_buf_aligned_len(struct iser_data_buf *data, 365static int iser_data_buf_aligned_len(struct iser_data_buf *data,
300 struct ib_device *ibdev) 366 struct ib_device *ibdev,
367 unsigned sg_tablesize)
301{ 368{
302 struct scatterlist *sg, *sgl, *next_sg = NULL; 369 struct scatterlist *sg, *sgl, *next_sg = NULL;
303 u64 start_addr, end_addr; 370 u64 start_addr, end_addr;
@@ -309,6 +376,14 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
309 sgl = data->sg; 376 sgl = data->sg;
310 start_addr = ib_sg_dma_address(ibdev, sgl); 377 start_addr = ib_sg_dma_address(ibdev, sgl);
311 378
379 if (unlikely(sgl[0].offset &&
380 data->data_len >= sg_tablesize * PAGE_SIZE)) {
381 iser_dbg("can't register length %lx with offset %x "
382 "fall to bounce buffer\n", data->data_len,
383 sgl[0].offset);
384 return 0;
385 }
386
312 for_each_sg(sgl, sg, data->dma_nents, i) { 387 for_each_sg(sgl, sg, data->dma_nents, i) {
313 if (start_check && !IS_4K_ALIGNED(start_addr)) 388 if (start_check && !IS_4K_ALIGNED(start_addr))
314 break; 389 break;
@@ -330,8 +405,11 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
330 break; 405 break;
331 } 406 }
332 ret_len = (next_sg) ? i : i+1; 407 ret_len = (next_sg) ? i : i+1;
333 iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", 408
334 ret_len, data->dma_nents, data); 409 if (unlikely(ret_len != data->dma_nents))
410 iser_warn("rdma alignment violation (%d/%d aligned)\n",
411 ret_len, data->dma_nents);
412
335 return ret_len; 413 return ret_len;
336} 414}
337 415
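A rough worked example of the new offset check, assuming 4 KB pages and sg_tablesize = 128: 128 pages cover 512 KB only when the buffer starts page aligned, so a 512 KB request whose first SG element begins at a non-zero offset would spill into a 129th page; rather than attempt a registration that cannot fit, the request is redirected to the bounce buffer (and accounted in fmr_unalign_cnt).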
@@ -393,7 +471,7 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
393{ 471{
394 struct scatterlist *sg = mem->sg; 472 struct scatterlist *sg = mem->sg;
395 473
396 reg->sge.lkey = device->mr->lkey; 474 reg->sge.lkey = device->pd->local_dma_lkey;
397 reg->rkey = device->mr->rkey; 475 reg->rkey = device->mr->rkey;
398 reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); 476 reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
399 reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); 477 reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
@@ -407,15 +485,12 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
407 485
408static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, 486static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
409 struct iser_data_buf *mem, 487 struct iser_data_buf *mem,
410 enum iser_data_dir cmd_dir, 488 enum iser_data_dir cmd_dir)
411 int aligned_len)
412{ 489{
413 struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; 490 struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
414 struct iser_device *device = iser_task->iser_conn->ib_conn.device; 491 struct iser_device *device = iser_task->iser_conn->ib_conn.device;
415 492
416 iscsi_conn->fmr_unalign_cnt++; 493 iscsi_conn->fmr_unalign_cnt++;
417 iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
418 aligned_len, mem->size);
419 494
420 if (iser_debug_level > 0) 495 if (iser_debug_level > 0)
421 iser_data_buf_dump(mem, device->ib_device); 496 iser_data_buf_dump(mem, device->ib_device);
@@ -439,13 +514,15 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
439 * returns: 0 on success, errno code on failure 514 * returns: 0 on success, errno code on failure
440 */ 515 */
441static 516static
442int iser_reg_page_vec(struct iscsi_iser_task *iser_task, 517int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
443 struct iser_data_buf *mem, 518 struct iser_data_buf *mem,
444 struct iser_page_vec *page_vec, 519 struct iser_reg_resources *rsc,
445 struct iser_mem_reg *mem_reg) 520 struct iser_mem_reg *reg)
446{ 521{
447 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; 522 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
448 struct iser_device *device = ib_conn->device; 523 struct iser_device *device = ib_conn->device;
524 struct iser_page_vec *page_vec = rsc->page_vec;
525 struct ib_fmr_pool *fmr_pool = rsc->fmr_pool;
449 struct ib_pool_fmr *fmr; 526 struct ib_pool_fmr *fmr;
450 int ret, plen; 527 int ret, plen;
451 528
@@ -461,7 +538,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
461 return -EINVAL; 538 return -EINVAL;
462 } 539 }
463 540
464 fmr = ib_fmr_pool_map_phys(ib_conn->fmr.pool, 541 fmr = ib_fmr_pool_map_phys(fmr_pool,
465 page_vec->pages, 542 page_vec->pages,
466 page_vec->length, 543 page_vec->length,
467 page_vec->pages[0]); 544 page_vec->pages[0]);
@@ -471,11 +548,15 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
471 return ret; 548 return ret;
472 } 549 }
473 550
474 mem_reg->sge.lkey = fmr->fmr->lkey; 551 reg->sge.lkey = fmr->fmr->lkey;
475 mem_reg->rkey = fmr->fmr->rkey; 552 reg->rkey = fmr->fmr->rkey;
476 mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset; 553 reg->sge.addr = page_vec->pages[0] + page_vec->offset;
477 mem_reg->sge.length = page_vec->data_size; 554 reg->sge.length = page_vec->data_size;
478 mem_reg->mem_h = fmr; 555 reg->mem_h = fmr;
556
557 iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
558 " length=0x%x\n", reg->sge.lkey, reg->rkey,
559 reg->sge.addr, reg->sge.length);
479 560
480 return 0; 561 return 0;
481} 562}
@@ -505,71 +586,17 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
505void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, 586void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
506 enum iser_data_dir cmd_dir) 587 enum iser_data_dir cmd_dir)
507{ 588{
589 struct iser_device *device = iser_task->iser_conn->ib_conn.device;
508 struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; 590 struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
509 591
510 if (!reg->mem_h) 592 if (!reg->mem_h)
511 return; 593 return;
512 594
513 iser_reg_desc_put(&iser_task->iser_conn->ib_conn, 595 device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn,
514 reg->mem_h); 596 reg->mem_h);
515 reg->mem_h = NULL; 597 reg->mem_h = NULL;
516} 598}
517 599
518/**
519 * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
520 * using FMR (if possible) obtaining rkey and va
521 *
522 * returns 0 on success, errno code on failure
523 */
524int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
525 enum iser_data_dir cmd_dir)
526{
527 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
528 struct iser_device *device = ib_conn->device;
529 struct ib_device *ibdev = device->ib_device;
530 struct iser_data_buf *mem = &iser_task->data[cmd_dir];
531 struct iser_mem_reg *mem_reg;
532 int aligned_len;
533 int err;
534 int i;
535
536 mem_reg = &iser_task->rdma_reg[cmd_dir];
537
538 aligned_len = iser_data_buf_aligned_len(mem, ibdev);
539 if (aligned_len != mem->dma_nents) {
540 err = fall_to_bounce_buf(iser_task, mem,
541 cmd_dir, aligned_len);
542 if (err) {
543 iser_err("failed to allocate bounce buffer\n");
544 return err;
545 }
546 }
547
548 /* if there a single dma entry, FMR is not needed */
549 if (mem->dma_nents == 1) {
550 return iser_reg_dma(device, mem, mem_reg);
551 } else { /* use FMR for multiple dma entries */
552 err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec,
553 mem_reg);
554 if (err && err != -EAGAIN) {
555 iser_data_buf_dump(mem, ibdev);
556 iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
557 mem->dma_nents,
558 ntoh24(iser_task->desc.iscsi_header.dlength));
559 iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
560 ib_conn->fmr.page_vec->data_size,
561 ib_conn->fmr.page_vec->length,
562 ib_conn->fmr.page_vec->offset);
563 for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
564 iser_err("page_vec[%d] = 0x%llx\n", i,
565 (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
566 }
567 if (err)
568 return err;
569 }
570 return 0;
571}
572
573static void 600static void
574iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, 601iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
575 struct ib_sig_domain *domain) 602 struct ib_sig_domain *domain)
@@ -637,10 +664,11 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
637{ 664{
638 u32 rkey; 665 u32 rkey;
639 666
640 memset(inv_wr, 0, sizeof(*inv_wr));
641 inv_wr->opcode = IB_WR_LOCAL_INV; 667 inv_wr->opcode = IB_WR_LOCAL_INV;
642 inv_wr->wr_id = ISER_FASTREG_LI_WRID; 668 inv_wr->wr_id = ISER_FASTREG_LI_WRID;
643 inv_wr->ex.invalidate_rkey = mr->rkey; 669 inv_wr->ex.invalidate_rkey = mr->rkey;
670 inv_wr->send_flags = 0;
671 inv_wr->num_sge = 0;
644 672
645 rkey = ib_inc_rkey(mr->rkey); 673 rkey = ib_inc_rkey(mr->rkey);
646 ib_update_fast_reg_key(mr, rkey); 674 ib_update_fast_reg_key(mr, rkey);
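
The invalidate path above relies on ib_inc_rkey() plus ib_update_fast_reg_key() to retire the old remote key before the MR is reused. The increment only cycles the low 8 "key" bits of the rkey, so a stale key from the previous registration is rejected by the HCA while the MR index in the upper bits stays put. A minimal standalone model of that helper (mirroring the one-line inline in the verbs header):

    #include <stdint.h>
    #include <stdio.h>

    /* Model of ib_inc_rkey(): only the low byte of the key changes. */
    static uint32_t model_inc_rkey(uint32_t rkey)
    {
            const uint32_t mask = 0x000000ff;

            return ((rkey + 1) & mask) | (rkey & ~mask);
    }

    int main(void)
    {
            uint32_t rkey = 0x12345aff;

            /* prints 0x12345aff -> 0x12345a00: wraps within the low byte only */
            printf("0x%08x -> 0x%08x\n", rkey, model_inc_rkey(rkey));
            return 0;
    }
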
@@ -648,61 +676,51 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
648 676
649static int 677static int
650iser_reg_sig_mr(struct iscsi_iser_task *iser_task, 678iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
651 struct fast_reg_descriptor *desc, 679 struct iser_pi_context *pi_ctx,
652 struct iser_mem_reg *data_reg, 680 struct iser_mem_reg *data_reg,
653 struct iser_mem_reg *prot_reg, 681 struct iser_mem_reg *prot_reg,
654 struct iser_mem_reg *sig_reg) 682 struct iser_mem_reg *sig_reg)
655{ 683{
656 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; 684 struct iser_tx_desc *tx_desc = &iser_task->desc;
657 struct iser_pi_context *pi_ctx = desc->pi_ctx; 685 struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
658 struct ib_send_wr sig_wr, inv_wr; 686 struct ib_send_wr *wr;
659 struct ib_send_wr *bad_wr, *wr = NULL;
660 struct ib_sig_attrs sig_attrs;
661 int ret; 687 int ret;
662 688
663 memset(&sig_attrs, 0, sizeof(sig_attrs)); 689 memset(sig_attrs, 0, sizeof(*sig_attrs));
664 ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); 690 ret = iser_set_sig_attrs(iser_task->sc, sig_attrs);
665 if (ret) 691 if (ret)
666 goto err; 692 goto err;
667 693
668 iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); 694 iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
669 695
670 if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { 696 if (!pi_ctx->sig_mr_valid) {
671 iser_inv_rkey(&inv_wr, pi_ctx->sig_mr); 697 wr = iser_tx_next_wr(tx_desc);
672 wr = &inv_wr; 698 iser_inv_rkey(wr, pi_ctx->sig_mr);
673 } 699 }
674 700
675 memset(&sig_wr, 0, sizeof(sig_wr)); 701 wr = iser_tx_next_wr(tx_desc);
676 sig_wr.opcode = IB_WR_REG_SIG_MR; 702 wr->opcode = IB_WR_REG_SIG_MR;
677 sig_wr.wr_id = ISER_FASTREG_LI_WRID; 703 wr->wr_id = ISER_FASTREG_LI_WRID;
678 sig_wr.sg_list = &data_reg->sge; 704 wr->sg_list = &data_reg->sge;
679 sig_wr.num_sge = 1; 705 wr->num_sge = 1;
680 sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; 706 wr->send_flags = 0;
681 sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; 707 wr->wr.sig_handover.sig_attrs = sig_attrs;
708 wr->wr.sig_handover.sig_mr = pi_ctx->sig_mr;
682 if (scsi_prot_sg_count(iser_task->sc)) 709 if (scsi_prot_sg_count(iser_task->sc))
683 sig_wr.wr.sig_handover.prot = &prot_reg->sge; 710 wr->wr.sig_handover.prot = &prot_reg->sge;
684 sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
685 IB_ACCESS_REMOTE_READ |
686 IB_ACCESS_REMOTE_WRITE;
687
688 if (!wr)
689 wr = &sig_wr;
690 else 711 else
691 wr->next = &sig_wr; 712 wr->wr.sig_handover.prot = NULL;
692 713 wr->wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
693 ret = ib_post_send(ib_conn->qp, wr, &bad_wr); 714 IB_ACCESS_REMOTE_READ |
694 if (ret) { 715 IB_ACCESS_REMOTE_WRITE;
695 iser_err("reg_sig_mr failed, ret:%d\n", ret); 716 pi_ctx->sig_mr_valid = 0;
696 goto err;
697 }
698 desc->reg_indicators &= ~ISER_SIG_KEY_VALID;
699 717
700 sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; 718 sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
701 sig_reg->rkey = pi_ctx->sig_mr->rkey; 719 sig_reg->rkey = pi_ctx->sig_mr->rkey;
702 sig_reg->sge.addr = 0; 720 sig_reg->sge.addr = 0;
703 sig_reg->sge.length = scsi_transfer_length(iser_task->sc); 721 sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
704 722
705 iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n", 723 iser_dbg("sig reg: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
706 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, 724 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
707 sig_reg->sge.length); 725 sig_reg->sge.length);
708err: 726err:
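
Instead of posting LOCAL_INV and REG_SIG_MR work requests immediately with a private ib_post_send(), the reworked function draws WRs from a small per-task array via iser_tx_next_wr() and leaves posting to iser_post_send(), which later submits the whole chain together with the SEND (see the tx_desc->wrs[0] change further down in iser_verbs.c). iser_tx_next_wr() itself is introduced elsewhere in this series; a plausible sketch of the pattern, with the array size and field names assumed rather than taken from the actual iSER header:

    #include <stddef.h>

    /* Stubbed-down types; only what this sketch needs. */
    struct ib_send_wr {
            struct ib_send_wr *next;
            int opcode;
    };

    #define ISER_MAX_WRS 3   /* assumed: invalidate + registration + send */

    struct iser_tx_desc {
            struct ib_send_wr wrs[ISER_MAX_WRS];
            int wr_idx;
    };

    /* Hand out the next WR slot and link it after the previous one, so a
     * single ib_post_send(qp, &tx_desc->wrs[0], &bad_wr) submits the whole
     * chain in order. */
    static struct ib_send_wr *iser_tx_next_wr(struct iser_tx_desc *tx_desc)
    {
            struct ib_send_wr *cur = &tx_desc->wrs[tx_desc->wr_idx];

            cur->next = NULL;
            if (tx_desc->wr_idx > 0)
                    tx_desc->wrs[tx_desc->wr_idx - 1].next = cur;
            tx_desc->wr_idx++;

            return cur;
    }
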
@@ -711,29 +729,16 @@ err:
711 729
712static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, 730static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
713 struct iser_data_buf *mem, 731 struct iser_data_buf *mem,
714 struct fast_reg_descriptor *desc, 732 struct iser_reg_resources *rsc,
715 enum iser_reg_indicator ind,
716 struct iser_mem_reg *reg) 733 struct iser_mem_reg *reg)
717{ 734{
718 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; 735 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
719 struct iser_device *device = ib_conn->device; 736 struct iser_device *device = ib_conn->device;
720 struct ib_mr *mr; 737 struct ib_mr *mr = rsc->mr;
721 struct ib_fast_reg_page_list *frpl; 738 struct ib_fast_reg_page_list *frpl = rsc->frpl;
722 struct ib_send_wr fastreg_wr, inv_wr; 739 struct iser_tx_desc *tx_desc = &iser_task->desc;
723 struct ib_send_wr *bad_wr, *wr = NULL; 740 struct ib_send_wr *wr;
724 int ret, offset, size, plen; 741 int offset, size, plen;
725
726 /* if there a single dma entry, dma mr suffices */
727 if (mem->dma_nents == 1)
728 return iser_reg_dma(device, mem, reg);
729
730 if (ind == ISER_DATA_KEY_VALID) {
731 mr = desc->data_mr;
732 frpl = desc->data_frpl;
733 } else {
734 mr = desc->pi_ctx->prot_mr;
735 frpl = desc->pi_ctx->prot_frpl;
736 }
737 742
738 plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, 743 plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
739 &offset, &size); 744 &offset, &size);
@@ -742,118 +747,151 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
742 return -EINVAL; 747 return -EINVAL;
743 } 748 }
744 749
745 if (!(desc->reg_indicators & ind)) { 750 if (!rsc->mr_valid) {
746 iser_inv_rkey(&inv_wr, mr); 751 wr = iser_tx_next_wr(tx_desc);
747 wr = &inv_wr; 752 iser_inv_rkey(wr, mr);
748 } 753 }
749 754
750 /* Prepare FASTREG WR */ 755 wr = iser_tx_next_wr(tx_desc);
751 memset(&fastreg_wr, 0, sizeof(fastreg_wr)); 756 wr->opcode = IB_WR_FAST_REG_MR;
752 fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; 757 wr->wr_id = ISER_FASTREG_LI_WRID;
753 fastreg_wr.opcode = IB_WR_FAST_REG_MR; 758 wr->send_flags = 0;
754 fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; 759 wr->wr.fast_reg.iova_start = frpl->page_list[0] + offset;
755 fastreg_wr.wr.fast_reg.page_list = frpl; 760 wr->wr.fast_reg.page_list = frpl;
756 fastreg_wr.wr.fast_reg.page_list_len = plen; 761 wr->wr.fast_reg.page_list_len = plen;
757 fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; 762 wr->wr.fast_reg.page_shift = SHIFT_4K;
758 fastreg_wr.wr.fast_reg.length = size; 763 wr->wr.fast_reg.length = size;
759 fastreg_wr.wr.fast_reg.rkey = mr->rkey; 764 wr->wr.fast_reg.rkey = mr->rkey;
760 fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | 765 wr->wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE |
761 IB_ACCESS_REMOTE_WRITE | 766 IB_ACCESS_REMOTE_WRITE |
762 IB_ACCESS_REMOTE_READ); 767 IB_ACCESS_REMOTE_READ);
763 768 rsc->mr_valid = 0;
764 if (!wr)
765 wr = &fastreg_wr;
766 else
767 wr->next = &fastreg_wr;
768
769 ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
770 if (ret) {
771 iser_err("fast registration failed, ret:%d\n", ret);
772 return ret;
773 }
774 desc->reg_indicators &= ~ind;
775 769
776 reg->sge.lkey = mr->lkey; 770 reg->sge.lkey = mr->lkey;
777 reg->rkey = mr->rkey; 771 reg->rkey = mr->rkey;
778 reg->sge.addr = frpl->page_list[0] + offset; 772 reg->sge.addr = frpl->page_list[0] + offset;
779 reg->sge.length = size; 773 reg->sge.length = size;
780 774
781 return ret; 775 iser_dbg("fast reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
776 " length=0x%x\n", reg->sge.lkey, reg->rkey,
777 reg->sge.addr, reg->sge.length);
778
779 return 0;
782} 780}
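
Both registration paths feed iser_sg_to_page_vec(), which flattens the DMA-mapped scatterlist into a dense array of 4K page addresses plus the byte offset of the first element and the total data size; those three values are exactly what lands in iova_start, page_list and length above. A simplified userspace model of that flattening, using plain addr/len pairs instead of struct scatterlist and assuming the list already passed the iser alignment check (only the first element may start mid-page, only the last may end mid-page):

    #include <stdint.h>
    #include <stddef.h>

    #define SZ_4K   4096ULL
    #define MASK_4K (~(SZ_4K - 1))

    struct seg {                 /* stand-in for one dma-mapped sg element */
            uint64_t addr;
            unsigned int len;
    };

    /* Fill 'pages' with the 4K page addresses covering the segments, report
     * the offset of the first byte within the first page and the total data
     * size.  Returns the number of pages written. */
    static int segs_to_page_vec(const struct seg *segs, int nsegs,
                                uint64_t *pages, unsigned int *offset,
                                unsigned int *size)
    {
            int plen = 0;
            int i;

            *offset = (unsigned int)(segs[0].addr & ~MASK_4K);
            *size = 0;

            for (i = 0; i < nsegs; i++) {
                    uint64_t start = segs[i].addr & MASK_4K;
                    uint64_t end = segs[i].addr + segs[i].len;

                    for (; start < end; start += SZ_4K)
                            pages[plen++] = start;
                    *size += segs[i].len;
            }
            return plen;
    }
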
783 781
784/** 782static int
785 * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, 783iser_handle_unaligned_buf(struct iscsi_iser_task *task,
786 * using Fast Registration WR (if possible) obtaining rkey and va 784 struct iser_data_buf *mem,
787 * 785 enum iser_data_dir dir)
788 * returns 0 on success, errno code on failure
789 */
790int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
791 enum iser_data_dir cmd_dir)
792{ 786{
793 struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; 787 struct iser_conn *iser_conn = task->iser_conn;
794 struct iser_device *device = ib_conn->device; 788 struct iser_device *device = iser_conn->ib_conn.device;
795 struct ib_device *ibdev = device->ib_device;
796 struct iser_data_buf *mem = &iser_task->data[cmd_dir];
797 struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir];
798 struct fast_reg_descriptor *desc = NULL;
799 int err, aligned_len; 789 int err, aligned_len;
800 790
801 aligned_len = iser_data_buf_aligned_len(mem, ibdev); 791 aligned_len = iser_data_buf_aligned_len(mem, device->ib_device,
792 iser_conn->scsi_sg_tablesize);
802 if (aligned_len != mem->dma_nents) { 793 if (aligned_len != mem->dma_nents) {
803 err = fall_to_bounce_buf(iser_task, mem, 794 err = fall_to_bounce_buf(task, mem, dir);
804 cmd_dir, aligned_len); 795 if (err)
805 if (err) {
806 iser_err("failed to allocate bounce buffer\n");
807 return err; 796 return err;
808 }
809 } 797 }
810 798
799 return 0;
800}
801
802static int
803iser_reg_prot_sg(struct iscsi_iser_task *task,
804 struct iser_data_buf *mem,
805 struct iser_fr_desc *desc,
806 struct iser_mem_reg *reg)
807{
808 struct iser_device *device = task->iser_conn->ib_conn.device;
809
810 if (mem->dma_nents == 1)
811 return iser_reg_dma(device, mem, reg);
812
813 return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg);
814}
815
816static int
817iser_reg_data_sg(struct iscsi_iser_task *task,
818 struct iser_data_buf *mem,
819 struct iser_fr_desc *desc,
820 struct iser_mem_reg *reg)
821{
822 struct iser_device *device = task->iser_conn->ib_conn.device;
823
824 if (mem->dma_nents == 1)
825 return iser_reg_dma(device, mem, reg);
826
827 return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg);
828}
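
Both helpers short-circuit single-element scatterlists through iser_reg_dma() rather than consuming a descriptor: one DMA-contiguous buffer can be handed to the HCA directly with a pre-registered DMA key, so no registration work request is spent on it. In rough, hedged sketch form (field names follow iser_mem_reg as used above; where the lkey/rkey actually come from, the device-wide DMA MR here or the PD's local DMA lkey in later code, is not shown in this hunk):

    #include <stdint.h>
    #include <stddef.h>

    struct sge_model     { uint64_t addr; uint32_t length; uint32_t lkey; };
    struct mem_reg_model { struct sge_model sge; uint32_t rkey; void *mem_h; };

    /* Sketch of the dma_nents == 1 shortcut: describe the buffer as-is. */
    static void reg_dma_model(uint64_t dma_addr, uint32_t dma_len,
                              uint32_t lkey, uint32_t rkey,
                              struct mem_reg_model *reg)
    {
            reg->sge.addr = dma_addr;
            reg->sge.length = dma_len;
            reg->sge.lkey = lkey;
            reg->rkey = rkey;
            reg->mem_h = NULL;      /* nothing to return to a pool later */
    }
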
829
830int iser_reg_rdma_mem(struct iscsi_iser_task *task,
831 enum iser_data_dir dir)
832{
833 struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
834 struct iser_device *device = ib_conn->device;
835 struct iser_data_buf *mem = &task->data[dir];
836 struct iser_mem_reg *reg = &task->rdma_reg[dir];
837 struct iser_mem_reg *data_reg;
838 struct iser_fr_desc *desc = NULL;
839 int err;
840
841 err = iser_handle_unaligned_buf(task, mem, dir);
842 if (unlikely(err))
843 return err;
844
811 if (mem->dma_nents != 1 || 845 if (mem->dma_nents != 1 ||
812 scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { 846 scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
813 desc = iser_reg_desc_get(ib_conn); 847 desc = device->reg_ops->reg_desc_get(ib_conn);
814 mem_reg->mem_h = desc; 848 reg->mem_h = desc;
815 } 849 }
816 850
817 err = iser_fast_reg_mr(iser_task, mem, desc, 851 if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL)
818 ISER_DATA_KEY_VALID, mem_reg); 852 data_reg = reg;
819 if (err) 853 else
854 data_reg = &task->desc.data_reg;
855
856 err = iser_reg_data_sg(task, mem, desc, data_reg);
857 if (unlikely(err))
820 goto err_reg; 858 goto err_reg;
821 859
822 if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { 860 if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
823 struct iser_mem_reg prot_reg; 861 struct iser_mem_reg *prot_reg = &task->desc.prot_reg;
824
825 memset(&prot_reg, 0, sizeof(prot_reg));
826 if (scsi_prot_sg_count(iser_task->sc)) {
827 mem = &iser_task->prot[cmd_dir];
828 aligned_len = iser_data_buf_aligned_len(mem, ibdev);
829 if (aligned_len != mem->dma_nents) {
830 err = fall_to_bounce_buf(iser_task, mem,
831 cmd_dir, aligned_len);
832 if (err) {
833 iser_err("failed to allocate bounce buffer\n");
834 return err;
835 }
836 }
837 862
838 err = iser_fast_reg_mr(iser_task, mem, desc, 863 if (scsi_prot_sg_count(task->sc)) {
839 ISER_PROT_KEY_VALID, &prot_reg); 864 mem = &task->prot[dir];
840 if (err) 865 err = iser_handle_unaligned_buf(task, mem, dir);
866 if (unlikely(err))
841 goto err_reg; 867 goto err_reg;
842 }
843 868
844 err = iser_reg_sig_mr(iser_task, desc, mem_reg, 869 err = iser_reg_prot_sg(task, mem, desc, prot_reg);
845 &prot_reg, mem_reg); 870 if (unlikely(err))
846 if (err) { 871 goto err_reg;
847 iser_err("Failed to register signature mr\n");
848 return err;
849 } 872 }
850 desc->reg_indicators |= ISER_FASTREG_PROTECTED; 873
874 err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg,
875 prot_reg, reg);
876 if (unlikely(err))
877 goto err_reg;
878
879 desc->pi_ctx->sig_protected = 1;
851 } 880 }
852 881
853 return 0; 882 return 0;
883
854err_reg: 884err_reg:
855 if (desc) 885 if (desc)
856 iser_reg_desc_put(ib_conn, desc); 886 device->reg_ops->reg_desc_put(ib_conn, desc);
857 887
858 return err; 888 return err;
859} 889}
890
891void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
892 enum iser_data_dir dir)
893{
894 struct iser_device *device = task->iser_conn->ib_conn.device;
895
896 device->reg_ops->unreg_mem(task, dir);
897}
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 5c9f565ea0e8..ae70cc1463ac 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -87,25 +87,9 @@ static int iser_create_device_ib_res(struct iser_device *device)
87 return ret; 87 return ret;
88 } 88 }
89 89
90 /* Assign function handles - based on FMR support */ 90 ret = iser_assign_reg_ops(device);
91 if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && 91 if (ret)
92 device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { 92 return ret;
93 iser_info("FMR supported, using FMR for registration\n");
94 device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
95 device->iser_free_rdma_reg_res = iser_free_fmr_pool;
96 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
97 device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
98 } else
99 if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
100 iser_info("FastReg supported, using FastReg for registration\n");
101 device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
102 device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
103 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
104 device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
105 } else {
106 iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
107 return -1;
108 }
109 93
110 device->comps_used = min_t(int, num_online_cpus(), 94 device->comps_used = min_t(int, num_online_cpus(),
111 device->ib_device->num_comp_vectors); 95 device->ib_device->num_comp_vectors);
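
The open-coded FMR-versus-FastReg assignment of four function pointers is folded into iser_assign_reg_ops(), which points device->reg_ops at one of two per-method ops tables; callers such as iser_reg_rdma_mem() and iser_unreg_rdma_mem() in iser_memory.c then dispatch through reg_ops without knowing which method is active, with FMR still preferred when the device implements the FMR verbs. A reduced sketch of that shape (the member names follow the reg_ops calls visible in this diff, the alloc/free names and table names are assumptions, and the real struct may carry more hooks):

    struct ib_conn;
    struct iscsi_iser_task;
    struct iser_data_buf;
    struct iser_reg_resources;
    struct iser_mem_reg;
    struct iser_fr_desc;
    enum iser_data_dir { ISER_DIR_IN, ISER_DIR_OUT };

    /* Per-registration-method ops table, picked once per device. */
    struct iser_reg_ops {
            int  (*alloc_reg_res)(struct ib_conn *ib_conn, unsigned cmds_max,
                                  unsigned int size);
            void (*free_reg_res)(struct ib_conn *ib_conn);
            int  (*reg_mem)(struct iscsi_iser_task *task,
                            struct iser_data_buf *mem,
                            struct iser_reg_resources *rsc,
                            struct iser_mem_reg *reg);
            void (*unreg_mem)(struct iscsi_iser_task *task,
                              enum iser_data_dir dir);
            struct iser_fr_desc *(*reg_desc_get)(struct ib_conn *ib_conn);
            void (*reg_desc_put)(struct ib_conn *ib_conn,
                                 struct iser_fr_desc *desc);
    };

    /* Assumed table names: one instance filled with the FMR helpers,
     * one with the fastreg helpers; iser_assign_reg_ops() picks one. */
    extern const struct iser_reg_ops fmr_ops;
    extern const struct iser_reg_ops fastreg_ops;
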
@@ -201,7 +185,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
201 185
202 (void)ib_unregister_event_handler(&device->event_handler); 186 (void)ib_unregister_event_handler(&device->event_handler);
203 (void)ib_dereg_mr(device->mr); 187 (void)ib_dereg_mr(device->mr);
204 (void)ib_dealloc_pd(device->pd); 188 ib_dealloc_pd(device->pd);
205 189
206 kfree(device->comps); 190 kfree(device->comps);
207 device->comps = NULL; 191 device->comps = NULL;
@@ -211,28 +195,40 @@ static void iser_free_device_ib_res(struct iser_device *device)
211} 195}
212 196
213/** 197/**
214 * iser_create_fmr_pool - Creates FMR pool and page_vector 198 * iser_alloc_fmr_pool - Creates FMR pool and page_vector
215 * 199 *
216 * returns 0 on success, or errno code on failure 200 * returns 0 on success, or errno code on failure
217 */ 201 */
218int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) 202int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
203 unsigned cmds_max,
204 unsigned int size)
219{ 205{
220 struct iser_device *device = ib_conn->device; 206 struct iser_device *device = ib_conn->device;
207 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
208 struct iser_page_vec *page_vec;
209 struct iser_fr_desc *desc;
210 struct ib_fmr_pool *fmr_pool;
221 struct ib_fmr_pool_param params; 211 struct ib_fmr_pool_param params;
222 int ret = -ENOMEM; 212 int ret;
223 213
224 ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + 214 INIT_LIST_HEAD(&fr_pool->list);
225 (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), 215 spin_lock_init(&fr_pool->lock);
226 GFP_KERNEL); 216
227 if (!ib_conn->fmr.page_vec) 217 desc = kzalloc(sizeof(*desc), GFP_KERNEL);
228 return ret; 218 if (!desc)
219 return -ENOMEM;
229 220
230 ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); 221 page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size),
222 GFP_KERNEL);
223 if (!page_vec) {
224 ret = -ENOMEM;
225 goto err_frpl;
226 }
227
228 page_vec->pages = (u64 *)(page_vec + 1);
231 229
232 params.page_shift = SHIFT_4K; 230 params.page_shift = SHIFT_4K;
233 /* when the first/last SG element are not start/end * 231 params.max_pages_per_fmr = size;
234 * page aligned, the map whould be of N+1 pages */
235 params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
236 /* make the pool size twice the max number of SCSI commands * 232 /* make the pool size twice the max number of SCSI commands *
237 * the ML is expected to queue, watermark for unmap at 50% */ 233 * the ML is expected to queue, watermark for unmap at 50% */
238 params.pool_size = cmds_max * 2; 234 params.pool_size = cmds_max * 2;
@@ -243,23 +239,25 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
243 IB_ACCESS_REMOTE_WRITE | 239 IB_ACCESS_REMOTE_WRITE |
244 IB_ACCESS_REMOTE_READ); 240 IB_ACCESS_REMOTE_READ);
245 241
246 ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params); 242 fmr_pool = ib_create_fmr_pool(device->pd, &params);
247 if (!IS_ERR(ib_conn->fmr.pool)) 243 if (IS_ERR(fmr_pool)) {
248 return 0; 244 ret = PTR_ERR(fmr_pool);
249
250 /* no FMR => no need for page_vec */
251 kfree(ib_conn->fmr.page_vec);
252 ib_conn->fmr.page_vec = NULL;
253
254 ret = PTR_ERR(ib_conn->fmr.pool);
255 ib_conn->fmr.pool = NULL;
256 if (ret != -ENOSYS) {
257 iser_err("FMR allocation failed, err %d\n", ret); 245 iser_err("FMR allocation failed, err %d\n", ret);
258 return ret; 246 goto err_fmr;
259 } else {
260 iser_warn("FMRs are not supported, using unaligned mode\n");
261 return 0;
262 } 247 }
248
249 desc->rsc.page_vec = page_vec;
250 desc->rsc.fmr_pool = fmr_pool;
251 list_add(&desc->list, &fr_pool->list);
252
253 return 0;
254
255err_fmr:
256 kfree(page_vec);
257err_frpl:
258 kfree(desc);
259
260 return ret;
263} 261}
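
iser_alloc_fmr_pool() keeps the page array in the same allocation as its iser_page_vec header: one kmalloc of the header plus size u64 slots, with the pages pointer aimed just past the header, so a single kfree() in the error path or in iser_free_fmr_pool() releases both. The same trailing-buffer idiom in plain C:

    #include <stdint.h>
    #include <stdlib.h>

    struct page_vec_model {
            uint64_t *pages;     /* points just past this header */
            int length;
    };

    static struct page_vec_model *alloc_page_vec(unsigned int size)
    {
            /* one allocation: header followed by 'size' page addresses */
            struct page_vec_model *pv =
                    malloc(sizeof(*pv) + sizeof(uint64_t) * size);

            if (!pv)
                    return NULL;

            pv->pages = (uint64_t *)(pv + 1);
            pv->length = 0;
            return pv;           /* a single free(pv) releases everything */
    }
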
264 262
265/** 263/**
@@ -267,26 +265,68 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
267 */ 265 */
268void iser_free_fmr_pool(struct ib_conn *ib_conn) 266void iser_free_fmr_pool(struct ib_conn *ib_conn)
269{ 267{
268 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
269 struct iser_fr_desc *desc;
270
271 desc = list_first_entry(&fr_pool->list,
272 struct iser_fr_desc, list);
273 list_del(&desc->list);
274
270 iser_info("freeing conn %p fmr pool %p\n", 275 iser_info("freeing conn %p fmr pool %p\n",
271 ib_conn, ib_conn->fmr.pool); 276 ib_conn, desc->rsc.fmr_pool);
277
278 ib_destroy_fmr_pool(desc->rsc.fmr_pool);
279 kfree(desc->rsc.page_vec);
280 kfree(desc);
281}
282
283static int
284iser_alloc_reg_res(struct ib_device *ib_device,
285 struct ib_pd *pd,
286 struct iser_reg_resources *res,
287 unsigned int size)
288{
289 int ret;
290
291 res->frpl = ib_alloc_fast_reg_page_list(ib_device, size);
292 if (IS_ERR(res->frpl)) {
293 ret = PTR_ERR(res->frpl);
294 iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
295 ret);
296 return PTR_ERR(res->frpl);
297 }
298
299 res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size);
300 if (IS_ERR(res->mr)) {
301 ret = PTR_ERR(res->mr);
302 iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
303 goto fast_reg_mr_failure;
304 }
305 res->mr_valid = 1;
272 306
273 if (ib_conn->fmr.pool != NULL) 307 return 0;
274 ib_destroy_fmr_pool(ib_conn->fmr.pool); 308
309fast_reg_mr_failure:
310 ib_free_fast_reg_page_list(res->frpl);
275 311
276 ib_conn->fmr.pool = NULL; 312 return ret;
313}
277 314
278 kfree(ib_conn->fmr.page_vec); 315static void
279 ib_conn->fmr.page_vec = NULL; 316iser_free_reg_res(struct iser_reg_resources *rsc)
317{
318 ib_dereg_mr(rsc->mr);
319 ib_free_fast_reg_page_list(rsc->frpl);
280} 320}
281 321
282static int 322static int
283iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd, 323iser_alloc_pi_ctx(struct ib_device *ib_device,
284 struct fast_reg_descriptor *desc) 324 struct ib_pd *pd,
325 struct iser_fr_desc *desc,
326 unsigned int size)
285{ 327{
286 struct iser_pi_context *pi_ctx = NULL; 328 struct iser_pi_context *pi_ctx = NULL;
287 struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2, 329 int ret;
288 .flags = IB_MR_SIGNATURE_EN};
289 int ret = 0;
290 330
291 desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); 331 desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
292 if (!desc->pi_ctx) 332 if (!desc->pi_ctx)
@@ -294,36 +334,25 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
294 334
295 pi_ctx = desc->pi_ctx; 335 pi_ctx = desc->pi_ctx;
296 336
297 pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, 337 ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size);
298 ISCSI_ISER_SG_TABLESIZE); 338 if (ret) {
299 if (IS_ERR(pi_ctx->prot_frpl)) { 339 iser_err("failed to allocate reg_resources\n");
300 ret = PTR_ERR(pi_ctx->prot_frpl); 340 goto alloc_reg_res_err;
301 goto prot_frpl_failure;
302 }
303
304 pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
305 ISCSI_ISER_SG_TABLESIZE + 1);
306 if (IS_ERR(pi_ctx->prot_mr)) {
307 ret = PTR_ERR(pi_ctx->prot_mr);
308 goto prot_mr_failure;
309 } 341 }
310 desc->reg_indicators |= ISER_PROT_KEY_VALID;
311 342
312 pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); 343 pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
313 if (IS_ERR(pi_ctx->sig_mr)) { 344 if (IS_ERR(pi_ctx->sig_mr)) {
314 ret = PTR_ERR(pi_ctx->sig_mr); 345 ret = PTR_ERR(pi_ctx->sig_mr);
315 goto sig_mr_failure; 346 goto sig_mr_failure;
316 } 347 }
317 desc->reg_indicators |= ISER_SIG_KEY_VALID; 348 pi_ctx->sig_mr_valid = 1;
318 desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; 349 desc->pi_ctx->sig_protected = 0;
319 350
320 return 0; 351 return 0;
321 352
322sig_mr_failure: 353sig_mr_failure:
323 ib_dereg_mr(desc->pi_ctx->prot_mr); 354 iser_free_reg_res(&pi_ctx->rsc);
324prot_mr_failure: 355alloc_reg_res_err:
325 ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
326prot_frpl_failure:
327 kfree(desc->pi_ctx); 356 kfree(desc->pi_ctx);
328 357
329 return ret; 358 return ret;
@@ -332,82 +361,71 @@ prot_frpl_failure:
332static void 361static void
333iser_free_pi_ctx(struct iser_pi_context *pi_ctx) 362iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
334{ 363{
335 ib_free_fast_reg_page_list(pi_ctx->prot_frpl); 364 iser_free_reg_res(&pi_ctx->rsc);
336 ib_dereg_mr(pi_ctx->prot_mr); 365 ib_dereg_mr(pi_ctx->sig_mr);
337 ib_destroy_mr(pi_ctx->sig_mr);
338 kfree(pi_ctx); 366 kfree(pi_ctx);
339} 367}
340 368
341static int 369static struct iser_fr_desc *
342iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, 370iser_create_fastreg_desc(struct ib_device *ib_device,
343 bool pi_enable, struct fast_reg_descriptor *desc) 371 struct ib_pd *pd,
372 bool pi_enable,
373 unsigned int size)
344{ 374{
375 struct iser_fr_desc *desc;
345 int ret; 376 int ret;
346 377
347 desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, 378 desc = kzalloc(sizeof(*desc), GFP_KERNEL);
348 ISCSI_ISER_SG_TABLESIZE + 1); 379 if (!desc)
349 if (IS_ERR(desc->data_frpl)) { 380 return ERR_PTR(-ENOMEM);
350 ret = PTR_ERR(desc->data_frpl);
351 iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
352 ret);
353 return PTR_ERR(desc->data_frpl);
354 }
355 381
356 desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); 382 ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size);
357 if (IS_ERR(desc->data_mr)) { 383 if (ret)
358 ret = PTR_ERR(desc->data_mr); 384 goto reg_res_alloc_failure;
359 iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
360 goto fast_reg_mr_failure;
361 }
362 desc->reg_indicators |= ISER_DATA_KEY_VALID;
363 385
364 if (pi_enable) { 386 if (pi_enable) {
365 ret = iser_alloc_pi_ctx(ib_device, pd, desc); 387 ret = iser_alloc_pi_ctx(ib_device, pd, desc, size);
366 if (ret) 388 if (ret)
367 goto pi_ctx_alloc_failure; 389 goto pi_ctx_alloc_failure;
368 } 390 }
369 391
370 return 0; 392 return desc;
393
371pi_ctx_alloc_failure: 394pi_ctx_alloc_failure:
372 ib_dereg_mr(desc->data_mr); 395 iser_free_reg_res(&desc->rsc);
373fast_reg_mr_failure: 396reg_res_alloc_failure:
374 ib_free_fast_reg_page_list(desc->data_frpl); 397 kfree(desc);
375 398
376 return ret; 399 return ERR_PTR(ret);
377} 400}
378 401
379/** 402/**
380 * iser_create_fastreg_pool - Creates pool of fast_reg descriptors 403 * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors
381 * for fast registration work requests. 404 * for fast registration work requests.
382 * returns 0 on success, or errno code on failure 405 * returns 0 on success, or errno code on failure
383 */ 406 */
384int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) 407int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
408 unsigned cmds_max,
409 unsigned int size)
385{ 410{
386 struct iser_device *device = ib_conn->device; 411 struct iser_device *device = ib_conn->device;
387 struct fast_reg_descriptor *desc; 412 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
413 struct iser_fr_desc *desc;
388 int i, ret; 414 int i, ret;
389 415
390 INIT_LIST_HEAD(&ib_conn->fastreg.pool); 416 INIT_LIST_HEAD(&fr_pool->list);
391 ib_conn->fastreg.pool_size = 0; 417 spin_lock_init(&fr_pool->lock);
418 fr_pool->size = 0;
392 for (i = 0; i < cmds_max; i++) { 419 for (i = 0; i < cmds_max; i++) {
393 desc = kzalloc(sizeof(*desc), GFP_KERNEL); 420 desc = iser_create_fastreg_desc(device->ib_device, device->pd,
394 if (!desc) { 421 ib_conn->pi_support, size);
395 iser_err("Failed to allocate a new fast_reg descriptor\n"); 422 if (IS_ERR(desc)) {
396 ret = -ENOMEM; 423 ret = PTR_ERR(desc);
397 goto err;
398 }
399
400 ret = iser_create_fastreg_desc(device->ib_device, device->pd,
401 ib_conn->pi_support, desc);
402 if (ret) {
403 iser_err("Failed to create fastreg descriptor err=%d\n",
404 ret);
405 kfree(desc);
406 goto err; 424 goto err;
407 } 425 }
408 426
409 list_add_tail(&desc->list, &ib_conn->fastreg.pool); 427 list_add_tail(&desc->list, &fr_pool->list);
410 ib_conn->fastreg.pool_size++; 428 fr_pool->size++;
411 } 429 }
412 430
413 return 0; 431 return 0;
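
Both methods now share the per-connection iser_fr_pool, a plain list protected by a spinlock: the fastreg path stocks it with cmds_max descriptors so reg_desc_get/put become a locked pop and push, while the FMR path parks its single descriptor on the same list and can keep reusing it. A minimal userspace model of the fastreg get/put discipline (a pthread mutex standing in for the spinlock, a singly linked free list for list_head):

    #include <pthread.h>
    #include <stddef.h>

    struct fr_desc_model {
            struct fr_desc_model *next;
            /* registration resources would live here */
    };

    struct fr_pool_model {
            struct fr_desc_model *head;
            pthread_mutex_t lock;
            int size;
    };

    static struct fr_desc_model *reg_desc_get(struct fr_pool_model *pool)
    {
            struct fr_desc_model *desc;

            pthread_mutex_lock(&pool->lock);
            desc = pool->head;                /* pop the first free descriptor */
            if (desc)
                    pool->head = desc->next;
            pthread_mutex_unlock(&pool->lock);
            return desc;
    }

    static void reg_desc_put(struct fr_pool_model *pool,
                             struct fr_desc_model *desc)
    {
            pthread_mutex_lock(&pool->lock);
            desc->next = pool->head;          /* push it back on the free list */
            pool->head = desc;
            pthread_mutex_unlock(&pool->lock);
    }
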
@@ -422,27 +440,27 @@ err:
422 */ 440 */
423void iser_free_fastreg_pool(struct ib_conn *ib_conn) 441void iser_free_fastreg_pool(struct ib_conn *ib_conn)
424{ 442{
425 struct fast_reg_descriptor *desc, *tmp; 443 struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
444 struct iser_fr_desc *desc, *tmp;
426 int i = 0; 445 int i = 0;
427 446
428 if (list_empty(&ib_conn->fastreg.pool)) 447 if (list_empty(&fr_pool->list))
429 return; 448 return;
430 449
431 iser_info("freeing conn %p fr pool\n", ib_conn); 450 iser_info("freeing conn %p fr pool\n", ib_conn);
432 451
433 list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { 452 list_for_each_entry_safe(desc, tmp, &fr_pool->list, list) {
434 list_del(&desc->list); 453 list_del(&desc->list);
435 ib_free_fast_reg_page_list(desc->data_frpl); 454 iser_free_reg_res(&desc->rsc);
436 ib_dereg_mr(desc->data_mr);
437 if (desc->pi_ctx) 455 if (desc->pi_ctx)
438 iser_free_pi_ctx(desc->pi_ctx); 456 iser_free_pi_ctx(desc->pi_ctx);
439 kfree(desc); 457 kfree(desc);
440 ++i; 458 ++i;
441 } 459 }
442 460
443 if (i < ib_conn->fastreg.pool_size) 461 if (i < fr_pool->size)
444 iser_warn("pool still has %d regions registered\n", 462 iser_warn("pool still has %d regions registered\n",
445 ib_conn->fastreg.pool_size - i); 463 fr_pool->size - i);
446} 464}
447 465
448/** 466/**
@@ -738,6 +756,31 @@ static void iser_connect_error(struct rdma_cm_id *cma_id)
738 iser_conn->state = ISER_CONN_TERMINATING; 756 iser_conn->state = ISER_CONN_TERMINATING;
739} 757}
740 758
759static void
760iser_calc_scsi_params(struct iser_conn *iser_conn,
761 unsigned int max_sectors)
762{
763 struct iser_device *device = iser_conn->ib_conn.device;
764 unsigned short sg_tablesize, sup_sg_tablesize;
765
766 sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
767 sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE,
768 device->dev_attr.max_fast_reg_page_list_len);
769
770 if (sg_tablesize > sup_sg_tablesize) {
771 sg_tablesize = sup_sg_tablesize;
772 iser_conn->scsi_max_sectors = sg_tablesize * SIZE_4K / 512;
773 } else {
774 iser_conn->scsi_max_sectors = max_sectors;
775 }
776
777 iser_conn->scsi_sg_tablesize = sg_tablesize;
778
779 iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n",
780 iser_conn, iser_conn->scsi_sg_tablesize,
781 iser_conn->scsi_max_sectors);
782}
783
741/** 784/**
742 * Called with state mutex held 785 * Called with state mutex held
743 **/ 786 **/
@@ -776,6 +819,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
776 } 819 }
777 } 820 }
778 821
822 iser_calc_scsi_params(iser_conn, iser_max_sectors);
823
779 ret = rdma_resolve_route(cma_id, 1000); 824 ret = rdma_resolve_route(cma_id, 1000);
780 if (ret) { 825 if (ret) {
781 iser_err("resolve route failed: %d\n", ret); 826 iser_err("resolve route failed: %d\n", ret);
@@ -938,7 +983,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
938 init_completion(&iser_conn->ib_completion); 983 init_completion(&iser_conn->ib_completion);
939 init_completion(&iser_conn->up_completion); 984 init_completion(&iser_conn->up_completion);
940 INIT_LIST_HEAD(&iser_conn->conn_list); 985 INIT_LIST_HEAD(&iser_conn->conn_list);
941 spin_lock_init(&iser_conn->ib_conn.lock);
942 mutex_init(&iser_conn->state_mutex); 986 mutex_init(&iser_conn->state_mutex);
943} 987}
944 988
@@ -1017,7 +1061,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
1017 1061
1018 sge.addr = iser_conn->login_resp_dma; 1062 sge.addr = iser_conn->login_resp_dma;
1019 sge.length = ISER_RX_LOGIN_SIZE; 1063 sge.length = ISER_RX_LOGIN_SIZE;
1020 sge.lkey = ib_conn->device->mr->lkey; 1064 sge.lkey = ib_conn->device->pd->local_dma_lkey;
1021 1065
1022 rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; 1066 rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
1023 rx_wr.sg_list = &sge; 1067 rx_wr.sg_list = &sge;
@@ -1072,23 +1116,24 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
1072int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, 1116int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
1073 bool signal) 1117 bool signal)
1074{ 1118{
1075 int ib_ret; 1119 struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc);
1076 struct ib_send_wr send_wr, *send_wr_failed; 1120 int ib_ret;
1077 1121
1078 ib_dma_sync_single_for_device(ib_conn->device->ib_device, 1122 ib_dma_sync_single_for_device(ib_conn->device->ib_device,
1079 tx_desc->dma_addr, ISER_HEADERS_LEN, 1123 tx_desc->dma_addr, ISER_HEADERS_LEN,
1080 DMA_TO_DEVICE); 1124 DMA_TO_DEVICE);
1081 1125
1082 send_wr.next = NULL; 1126 wr->next = NULL;
1083 send_wr.wr_id = (uintptr_t)tx_desc; 1127 wr->wr_id = (uintptr_t)tx_desc;
1084 send_wr.sg_list = tx_desc->tx_sg; 1128 wr->sg_list = tx_desc->tx_sg;
1085 send_wr.num_sge = tx_desc->num_sge; 1129 wr->num_sge = tx_desc->num_sge;
1086 send_wr.opcode = IB_WR_SEND; 1130 wr->opcode = IB_WR_SEND;
1087 send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0; 1131 wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
1088 1132
1089 ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); 1133 ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0], &bad_wr);
1090 if (ib_ret) 1134 if (ib_ret)
1091 iser_err("ib_post_send failed, ret:%d\n", ib_ret); 1135 iser_err("ib_post_send failed, ret:%d opcode:%d\n",
1136 ib_ret, bad_wr->opcode);
1092 1137
1093 return ib_ret; 1138 return ib_ret;
1094} 1139}
@@ -1240,13 +1285,13 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
1240 enum iser_data_dir cmd_dir, sector_t *sector) 1285 enum iser_data_dir cmd_dir, sector_t *sector)
1241{ 1286{
1242 struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; 1287 struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
1243 struct fast_reg_descriptor *desc = reg->mem_h; 1288 struct iser_fr_desc *desc = reg->mem_h;
1244 unsigned long sector_size = iser_task->sc->device->sector_size; 1289 unsigned long sector_size = iser_task->sc->device->sector_size;
1245 struct ib_mr_status mr_status; 1290 struct ib_mr_status mr_status;
1246 int ret; 1291 int ret;
1247 1292
1248 if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { 1293 if (desc && desc->pi_ctx->sig_protected) {
1249 desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; 1294 desc->pi_ctx->sig_protected = 0;
1250 ret = ib_check_mr_status(desc->pi_ctx->sig_mr, 1295 ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
1251 IB_MR_CHECK_SIG_STATUS, &mr_status); 1296 IB_MR_CHECK_SIG_STATUS, &mr_status);
1252 if (ret) { 1297 if (ret) {
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index d851e1828d6f..dc439a40da3f 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -235,7 +235,7 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
235 rx_sg = &rx_desc->rx_sg; 235 rx_sg = &rx_desc->rx_sg;
236 rx_sg->addr = rx_desc->dma_addr; 236 rx_sg->addr = rx_desc->dma_addr;
237 rx_sg->length = ISER_RX_PAYLOAD_SIZE; 237 rx_sg->length = ISER_RX_PAYLOAD_SIZE;
238 rx_sg->lkey = device->mr->lkey; 238 rx_sg->lkey = device->pd->local_dma_lkey;
239 } 239 }
240 240
241 isert_conn->rx_desc_head = 0; 241 isert_conn->rx_desc_head = 0;
@@ -385,22 +385,12 @@ isert_create_device_ib_res(struct isert_device *device)
385 goto out_cq; 385 goto out_cq;
386 } 386 }
387 387
388 device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE);
389 if (IS_ERR(device->mr)) {
390 ret = PTR_ERR(device->mr);
391 isert_err("failed to create dma mr, device %p, ret=%d\n",
392 device, ret);
393 goto out_mr;
394 }
395
396 /* Check signature cap */ 388 /* Check signature cap */
397 device->pi_capable = dev_attr->device_cap_flags & 389 device->pi_capable = dev_attr->device_cap_flags &
398 IB_DEVICE_SIGNATURE_HANDOVER ? true : false; 390 IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
399 391
400 return 0; 392 return 0;
401 393
402out_mr:
403 ib_dealloc_pd(device->pd);
404out_cq: 394out_cq:
405 isert_free_comps(device); 395 isert_free_comps(device);
406 return ret; 396 return ret;
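
The isert side follows the same theme as the iser changes: the driver no longer creates its own catch-all DMA MR with ib_get_dma_mr(), and every local SGE now takes pd->local_dma_lkey, the key the core provides for locally DMA-mapped buffers, so there is nothing extra to allocate or tear down per device. Reduced to a sketch (stub types; the only point is which key lands in the SGE):

    #include <stdint.h>

    struct pd_model   { uint32_t local_dma_lkey; };
    struct sge_model  { uint64_t addr; uint32_t length; uint32_t lkey; };

    /* Before: sge->lkey = device->mr->lkey, a device-wide DMA MR the driver
     * had to allocate and free itself.  After: */
    static void fill_local_sge(const struct pd_model *pd, struct sge_model *sge,
                               uint64_t dma_addr, uint32_t len)
    {
            sge->addr = dma_addr;
            sge->length = len;
            sge->lkey = pd->local_dma_lkey;   /* reserved key from the core */
    }
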
@@ -411,7 +401,6 @@ isert_free_device_ib_res(struct isert_device *device)
411{ 401{
412 isert_info("device %p\n", device); 402 isert_info("device %p\n", device);
413 403
414 ib_dereg_mr(device->mr);
415 ib_dealloc_pd(device->pd); 404 ib_dealloc_pd(device->pd);
416 isert_free_comps(device); 405 isert_free_comps(device);
417} 406}
@@ -491,7 +480,7 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
491 if (fr_desc->pi_ctx) { 480 if (fr_desc->pi_ctx) {
492 ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); 481 ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
493 ib_dereg_mr(fr_desc->pi_ctx->prot_mr); 482 ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
494 ib_destroy_mr(fr_desc->pi_ctx->sig_mr); 483 ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
495 kfree(fr_desc->pi_ctx); 484 kfree(fr_desc->pi_ctx);
496 } 485 }
497 kfree(fr_desc); 486 kfree(fr_desc);
@@ -508,7 +497,6 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
508 struct ib_device *device, 497 struct ib_device *device,
509 struct ib_pd *pd) 498 struct ib_pd *pd)
510{ 499{
511 struct ib_mr_init_attr mr_init_attr;
512 struct pi_context *pi_ctx; 500 struct pi_context *pi_ctx;
513 int ret; 501 int ret;
514 502
@@ -527,7 +515,8 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
527 goto err_pi_ctx; 515 goto err_pi_ctx;
528 } 516 }
529 517
530 pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); 518 pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
519 ISCSI_ISER_SG_TABLESIZE);
531 if (IS_ERR(pi_ctx->prot_mr)) { 520 if (IS_ERR(pi_ctx->prot_mr)) {
532 isert_err("Failed to allocate prot frmr err=%ld\n", 521 isert_err("Failed to allocate prot frmr err=%ld\n",
533 PTR_ERR(pi_ctx->prot_mr)); 522 PTR_ERR(pi_ctx->prot_mr));
@@ -536,10 +525,7 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
536 } 525 }
537 desc->ind |= ISERT_PROT_KEY_VALID; 526 desc->ind |= ISERT_PROT_KEY_VALID;
538 527
539 memset(&mr_init_attr, 0, sizeof(mr_init_attr)); 528 pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
540 mr_init_attr.max_reg_descriptors = 2;
541 mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
542 pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
543 if (IS_ERR(pi_ctx->sig_mr)) { 529 if (IS_ERR(pi_ctx->sig_mr)) {
544 isert_err("Failed to allocate signature enabled mr err=%ld\n", 530 isert_err("Failed to allocate signature enabled mr err=%ld\n",
545 PTR_ERR(pi_ctx->sig_mr)); 531 PTR_ERR(pi_ctx->sig_mr));
@@ -577,7 +563,8 @@ isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
577 return PTR_ERR(fr_desc->data_frpl); 563 return PTR_ERR(fr_desc->data_frpl);
578 } 564 }
579 565
580 fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); 566 fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
567 ISCSI_ISER_SG_TABLESIZE);
581 if (IS_ERR(fr_desc->data_mr)) { 568 if (IS_ERR(fr_desc->data_mr)) {
582 isert_err("Failed to allocate data frmr err=%ld\n", 569 isert_err("Failed to allocate data frmr err=%ld\n",
583 PTR_ERR(fr_desc->data_mr)); 570 PTR_ERR(fr_desc->data_mr));
@@ -1092,8 +1079,8 @@ isert_create_send_desc(struct isert_conn *isert_conn,
1092 tx_desc->num_sge = 1; 1079 tx_desc->num_sge = 1;
1093 tx_desc->isert_cmd = isert_cmd; 1080 tx_desc->isert_cmd = isert_cmd;
1094 1081
1095 if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { 1082 if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) {
1096 tx_desc->tx_sg[0].lkey = device->mr->lkey; 1083 tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
1097 isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc); 1084 isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc);
1098 } 1085 }
1099} 1086}
@@ -1116,7 +1103,7 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn,
1116 tx_desc->dma_addr = dma_addr; 1103 tx_desc->dma_addr = dma_addr;
1117 tx_desc->tx_sg[0].addr = tx_desc->dma_addr; 1104 tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
1118 tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; 1105 tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
1119 tx_desc->tx_sg[0].lkey = device->mr->lkey; 1106 tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
1120 1107
1121 isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n", 1108 isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n",
1122 tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length, 1109 tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length,
@@ -1149,7 +1136,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn)
1149 memset(&sge, 0, sizeof(struct ib_sge)); 1136 memset(&sge, 0, sizeof(struct ib_sge));
1150 sge.addr = isert_conn->login_req_dma; 1137 sge.addr = isert_conn->login_req_dma;
1151 sge.length = ISER_RX_LOGIN_SIZE; 1138 sge.length = ISER_RX_LOGIN_SIZE;
1152 sge.lkey = isert_conn->device->mr->lkey; 1139 sge.lkey = isert_conn->device->pd->local_dma_lkey;
1153 1140
1154 isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", 1141 isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
1155 sge.addr, sge.length, sge.lkey); 1142 sge.addr, sge.length, sge.lkey);
@@ -1199,7 +1186,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
1199 1186
1200 tx_dsg->addr = isert_conn->login_rsp_dma; 1187 tx_dsg->addr = isert_conn->login_rsp_dma;
1201 tx_dsg->length = length; 1188 tx_dsg->length = length;
1202 tx_dsg->lkey = isert_conn->device->mr->lkey; 1189 tx_dsg->lkey = isert_conn->device->pd->local_dma_lkey;
1203 tx_desc->num_sge = 2; 1190 tx_desc->num_sge = 2;
1204 } 1191 }
1205 if (!login->login_failed) { 1192 if (!login->login_failed) {
@@ -2216,7 +2203,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
2216 isert_cmd->pdu_buf_len = pdu_len; 2203 isert_cmd->pdu_buf_len = pdu_len;
2217 tx_dsg->addr = isert_cmd->pdu_buf_dma; 2204 tx_dsg->addr = isert_cmd->pdu_buf_dma;
2218 tx_dsg->length = pdu_len; 2205 tx_dsg->length = pdu_len;
2219 tx_dsg->lkey = device->mr->lkey; 2206 tx_dsg->lkey = device->pd->local_dma_lkey;
2220 isert_cmd->tx_desc.num_sge = 2; 2207 isert_cmd->tx_desc.num_sge = 2;
2221 } 2208 }
2222 2209
@@ -2344,7 +2331,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
2344 isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; 2331 isert_cmd->pdu_buf_len = ISCSI_HDR_LEN;
2345 tx_dsg->addr = isert_cmd->pdu_buf_dma; 2332 tx_dsg->addr = isert_cmd->pdu_buf_dma;
2346 tx_dsg->length = ISCSI_HDR_LEN; 2333 tx_dsg->length = ISCSI_HDR_LEN;
2347 tx_dsg->lkey = device->mr->lkey; 2334 tx_dsg->lkey = device->pd->local_dma_lkey;
2348 isert_cmd->tx_desc.num_sge = 2; 2335 isert_cmd->tx_desc.num_sge = 2;
2349 2336
2350 isert_init_send_wr(isert_conn, isert_cmd, send_wr); 2337 isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2385,7 +2372,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
2385 isert_cmd->pdu_buf_len = txt_rsp_len; 2372 isert_cmd->pdu_buf_len = txt_rsp_len;
2386 tx_dsg->addr = isert_cmd->pdu_buf_dma; 2373 tx_dsg->addr = isert_cmd->pdu_buf_dma;
2387 tx_dsg->length = txt_rsp_len; 2374 tx_dsg->length = txt_rsp_len;
2388 tx_dsg->lkey = device->mr->lkey; 2375 tx_dsg->lkey = device->pd->local_dma_lkey;
2389 isert_cmd->tx_desc.num_sge = 2; 2376 isert_cmd->tx_desc.num_sge = 2;
2390 } 2377 }
2391 isert_init_send_wr(isert_conn, isert_cmd, send_wr); 2378 isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2426,7 +2413,7 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
2426 ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; 2413 ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
2427 ib_sge->length = min_t(u32, data_left, 2414 ib_sge->length = min_t(u32, data_left,
2428 ib_sg_dma_len(ib_dev, tmp_sg) - page_off); 2415 ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
2429 ib_sge->lkey = device->mr->lkey; 2416 ib_sge->lkey = device->pd->local_dma_lkey;
2430 2417
2431 isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", 2418 isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n",
2432 ib_sge->addr, ib_sge->length, ib_sge->lkey); 2419 ib_sge->addr, ib_sge->length, ib_sge->lkey);
@@ -2600,7 +2587,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
2600 u32 page_off; 2587 u32 page_off;
2601 2588
2602 if (mem->dma_nents == 1) { 2589 if (mem->dma_nents == 1) {
2603 sge->lkey = device->mr->lkey; 2590 sge->lkey = device->pd->local_dma_lkey;
2604 sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); 2591 sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
2605 sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); 2592 sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
2606 isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", 2593 isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 9ec23a786c02..6a04ba3c0f72 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -209,7 +209,6 @@ struct isert_device {
209 int refcount; 209 int refcount;
210 struct ib_device *ib_device; 210 struct ib_device *ib_device;
211 struct ib_pd *pd; 211 struct ib_pd *pd;
212 struct ib_mr *mr;
213 struct isert_comp *comps; 212 struct isert_comp *comps;
214 int comps_used; 213 int comps_used;
215 struct list_head dev_node; 214 struct list_head dev_node;
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 31a20b462266..b481490ad257 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -55,8 +55,8 @@
55 55
56#define DRV_NAME "ib_srp" 56#define DRV_NAME "ib_srp"
57#define PFX DRV_NAME ": " 57#define PFX DRV_NAME ": "
58#define DRV_VERSION "1.0" 58#define DRV_VERSION "2.0"
59#define DRV_RELDATE "July 1, 2013" 59#define DRV_RELDATE "July 26, 2015"
60 60
61MODULE_AUTHOR("Roland Dreier"); 61MODULE_AUTHOR("Roland Dreier");
62MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); 62MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator");
@@ -68,8 +68,8 @@ static unsigned int srp_sg_tablesize;
68static unsigned int cmd_sg_entries; 68static unsigned int cmd_sg_entries;
69static unsigned int indirect_sg_entries; 69static unsigned int indirect_sg_entries;
70static bool allow_ext_sg; 70static bool allow_ext_sg;
71static bool prefer_fr; 71static bool prefer_fr = true;
72static bool register_always; 72static bool register_always = true;
73static int topspin_workarounds = 1; 73static int topspin_workarounds = 1;
74 74
75module_param(srp_sg_tablesize, uint, 0444); 75module_param(srp_sg_tablesize, uint, 0444);
@@ -131,7 +131,7 @@ MODULE_PARM_DESC(ch_count,
131 "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); 131 "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA.");
132 132
133static void srp_add_one(struct ib_device *device); 133static void srp_add_one(struct ib_device *device);
134static void srp_remove_one(struct ib_device *device); 134static void srp_remove_one(struct ib_device *device, void *client_data);
135static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); 135static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
136static void srp_send_completion(struct ib_cq *cq, void *ch_ptr); 136static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
137static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); 137static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
@@ -378,7 +378,8 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
378 INIT_LIST_HEAD(&pool->free_list); 378 INIT_LIST_HEAD(&pool->free_list);
379 379
380 for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { 380 for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
381 mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); 381 mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
382 max_page_list_len);
382 if (IS_ERR(mr)) { 383 if (IS_ERR(mr)) {
383 ret = PTR_ERR(mr); 384 ret = PTR_ERR(mr);
384 goto destroy_pool; 385 goto destroy_pool;
@@ -545,7 +546,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
545 if (ret) 546 if (ret)
546 goto err_qp; 547 goto err_qp;
547 548
548 if (dev->use_fast_reg && dev->has_fr) { 549 if (dev->use_fast_reg) {
549 fr_pool = srp_alloc_fr_pool(target); 550 fr_pool = srp_alloc_fr_pool(target);
550 if (IS_ERR(fr_pool)) { 551 if (IS_ERR(fr_pool)) {
551 ret = PTR_ERR(fr_pool); 552 ret = PTR_ERR(fr_pool);
@@ -553,10 +554,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
553 "FR pool allocation failed (%d)\n", ret); 554 "FR pool allocation failed (%d)\n", ret);
554 goto err_qp; 555 goto err_qp;
555 } 556 }
556 if (ch->fr_pool) 557 } else if (dev->use_fmr) {
557 srp_destroy_fr_pool(ch->fr_pool);
558 ch->fr_pool = fr_pool;
559 } else if (!dev->use_fast_reg && dev->has_fmr) {
560 fmr_pool = srp_alloc_fmr_pool(target); 558 fmr_pool = srp_alloc_fmr_pool(target);
561 if (IS_ERR(fmr_pool)) { 559 if (IS_ERR(fmr_pool)) {
562 ret = PTR_ERR(fmr_pool); 560 ret = PTR_ERR(fmr_pool);
@@ -564,9 +562,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
564 "FMR pool allocation failed (%d)\n", ret); 562 "FMR pool allocation failed (%d)\n", ret);
565 goto err_qp; 563 goto err_qp;
566 } 564 }
567 if (ch->fmr_pool)
568 ib_destroy_fmr_pool(ch->fmr_pool);
569 ch->fmr_pool = fmr_pool;
570 } 565 }
571 566
572 if (ch->qp) 567 if (ch->qp)
@@ -580,6 +575,16 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
580 ch->recv_cq = recv_cq; 575 ch->recv_cq = recv_cq;
581 ch->send_cq = send_cq; 576 ch->send_cq = send_cq;
582 577
578 if (dev->use_fast_reg) {
579 if (ch->fr_pool)
580 srp_destroy_fr_pool(ch->fr_pool);
581 ch->fr_pool = fr_pool;
582 } else if (dev->use_fmr) {
583 if (ch->fmr_pool)
584 ib_destroy_fmr_pool(ch->fmr_pool);
585 ch->fmr_pool = fmr_pool;
586 }
587
583 kfree(init_attr); 588 kfree(init_attr);
584 return 0; 589 return 0;
585 590
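
srp_create_ch_ib() also reorders pool replacement: the new FR or FMR pool is created first, and only after the new QP and CQs are in place is the old pool destroyed and the channel pointer swapped, so a failure partway through leaves the channel's existing resources intact. The generic shape of that create-then-swap replacement, with throwaway helpers standing in for the real pool constructors:

    #include <stdlib.h>

    struct pool { int dummy; };

    static struct pool *pool_create(void)    { return malloc(sizeof(struct pool)); }
    static void pool_destroy(struct pool *p) { free(p); }

    /* Replace *ch_pool without ever leaving the caller holding a freed
     * pool: on failure the old pool is still installed and valid. */
    static int replace_pool(struct pool **ch_pool)
    {
            struct pool *new_pool = pool_create();

            if (!new_pool)
                    return -1;

            if (*ch_pool)
                    pool_destroy(*ch_pool);
            *ch_pool = new_pool;
            return 0;
    }
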
@@ -622,7 +627,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
622 if (dev->use_fast_reg) { 627 if (dev->use_fast_reg) {
623 if (ch->fr_pool) 628 if (ch->fr_pool)
624 srp_destroy_fr_pool(ch->fr_pool); 629 srp_destroy_fr_pool(ch->fr_pool);
625 } else { 630 } else if (dev->use_fmr) {
626 if (ch->fmr_pool) 631 if (ch->fmr_pool)
627 ib_destroy_fmr_pool(ch->fmr_pool); 632 ib_destroy_fmr_pool(ch->fmr_pool);
628 } 633 }
@@ -1084,7 +1089,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
1084 if (req->nmdesc) 1089 if (req->nmdesc)
1085 srp_fr_pool_put(ch->fr_pool, req->fr_list, 1090 srp_fr_pool_put(ch->fr_pool, req->fr_list,
1086 req->nmdesc); 1091 req->nmdesc);
1087 } else { 1092 } else if (dev->use_fmr) {
1088 struct ib_pool_fmr **pfmr; 1093 struct ib_pool_fmr **pfmr;
1089 1094
1090 for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) 1095 for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
@@ -1259,6 +1264,8 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
1259{ 1264{
1260 struct srp_direct_buf *desc = state->desc; 1265 struct srp_direct_buf *desc = state->desc;
1261 1266
1267 WARN_ON_ONCE(!dma_len);
1268
1262 desc->va = cpu_to_be64(dma_addr); 1269 desc->va = cpu_to_be64(dma_addr);
1263 desc->key = cpu_to_be32(rkey); 1270 desc->key = cpu_to_be32(rkey);
1264 desc->len = cpu_to_be32(dma_len); 1271 desc->len = cpu_to_be32(dma_len);
@@ -1271,18 +1278,24 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
1271static int srp_map_finish_fmr(struct srp_map_state *state, 1278static int srp_map_finish_fmr(struct srp_map_state *state,
1272 struct srp_rdma_ch *ch) 1279 struct srp_rdma_ch *ch)
1273{ 1280{
1281 struct srp_target_port *target = ch->target;
1282 struct srp_device *dev = target->srp_host->srp_dev;
1274 struct ib_pool_fmr *fmr; 1283 struct ib_pool_fmr *fmr;
1275 u64 io_addr = 0; 1284 u64 io_addr = 0;
1276 1285
1286 if (state->fmr.next >= state->fmr.end)
1287 return -ENOMEM;
1288
1277 fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages, 1289 fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
1278 state->npages, io_addr); 1290 state->npages, io_addr);
1279 if (IS_ERR(fmr)) 1291 if (IS_ERR(fmr))
1280 return PTR_ERR(fmr); 1292 return PTR_ERR(fmr);
1281 1293
1282 *state->next_fmr++ = fmr; 1294 *state->fmr.next++ = fmr;
1283 state->nmdesc++; 1295 state->nmdesc++;
1284 1296
1285 srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); 1297 srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask,
1298 state->dma_len, fmr->fmr->rkey);
1286 1299
1287 return 0; 1300 return 0;
1288} 1301}
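
Two things change in srp_map_finish_fmr(): it refuses to overrun the per-request FMR descriptor array (the new fmr.next/fmr.end window), and the descriptor it emits now carries the buffer's real offset within the first registered page, base_dma_addr & ~mr_page_mask, where the old call always passed 0. A quick check of that offset arithmetic, with an example address:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const uint64_t mr_page_mask = ~0xfffULL;   /* 4K pages, example */
            uint64_t base_dma_addr = 0x7f3a12345678ULL;

            /* io offset passed to srp_map_desc(): the start of the data
             * within the first registered page (0x678 here). */
            printf("offset = 0x%llx\n",
                   (unsigned long long)(base_dma_addr & ~mr_page_mask));
            return 0;
    }
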
@@ -1297,6 +1310,9 @@ static int srp_map_finish_fr(struct srp_map_state *state,
1297 struct srp_fr_desc *desc; 1310 struct srp_fr_desc *desc;
1298 u32 rkey; 1311 u32 rkey;
1299 1312
1313 if (state->fr.next >= state->fr.end)
1314 return -ENOMEM;
1315
1300 desc = srp_fr_pool_get(ch->fr_pool); 1316 desc = srp_fr_pool_get(ch->fr_pool);
1301 if (!desc) 1317 if (!desc)
1302 return -ENOMEM; 1318 return -ENOMEM;
@@ -1320,7 +1336,7 @@ static int srp_map_finish_fr(struct srp_map_state *state,
1320 IB_ACCESS_REMOTE_WRITE); 1336 IB_ACCESS_REMOTE_WRITE);
1321 wr.wr.fast_reg.rkey = desc->mr->lkey; 1337 wr.wr.fast_reg.rkey = desc->mr->lkey;
1322 1338
1323 *state->next_fr++ = desc; 1339 *state->fr.next++ = desc;
1324 state->nmdesc++; 1340 state->nmdesc++;
1325 1341
1326 srp_map_desc(state, state->base_dma_addr, state->dma_len, 1342 srp_map_desc(state, state->base_dma_addr, state->dma_len,
@@ -1333,17 +1349,19 @@ static int srp_finish_mapping(struct srp_map_state *state,
1333 struct srp_rdma_ch *ch) 1349 struct srp_rdma_ch *ch)
1334{ 1350{
1335 struct srp_target_port *target = ch->target; 1351 struct srp_target_port *target = ch->target;
1352 struct srp_device *dev = target->srp_host->srp_dev;
1336 int ret = 0; 1353 int ret = 0;
1337 1354
1355 WARN_ON_ONCE(!dev->use_fast_reg && !dev->use_fmr);
1356
1338 if (state->npages == 0) 1357 if (state->npages == 0)
1339 return 0; 1358 return 0;
1340 1359
1341 if (state->npages == 1 && !register_always) 1360 if (state->npages == 1 && target->global_mr)
1342 srp_map_desc(state, state->base_dma_addr, state->dma_len, 1361 srp_map_desc(state, state->base_dma_addr, state->dma_len,
1343 target->rkey); 1362 target->global_mr->rkey);
1344 else 1363 else
1345 ret = target->srp_host->srp_dev->use_fast_reg ? 1364 ret = dev->use_fast_reg ? srp_map_finish_fr(state, ch) :
1346 srp_map_finish_fr(state, ch) :
1347 srp_map_finish_fmr(state, ch); 1365 srp_map_finish_fmr(state, ch);
1348 1366
1349 if (ret == 0) { 1367 if (ret == 0) {
@@ -1354,66 +1372,19 @@ static int srp_finish_mapping(struct srp_map_state *state,
1354 return ret; 1372 return ret;
1355} 1373}
1356 1374
1357static void srp_map_update_start(struct srp_map_state *state,
1358 struct scatterlist *sg, int sg_index,
1359 dma_addr_t dma_addr)
1360{
1361 state->unmapped_sg = sg;
1362 state->unmapped_index = sg_index;
1363 state->unmapped_addr = dma_addr;
1364}
1365
1366static int srp_map_sg_entry(struct srp_map_state *state, 1375static int srp_map_sg_entry(struct srp_map_state *state,
1367 struct srp_rdma_ch *ch, 1376 struct srp_rdma_ch *ch,
1368 struct scatterlist *sg, int sg_index, 1377 struct scatterlist *sg, int sg_index)
1369 bool use_mr)
1370{ 1378{
1371 struct srp_target_port *target = ch->target; 1379 struct srp_target_port *target = ch->target;
1372 struct srp_device *dev = target->srp_host->srp_dev; 1380 struct srp_device *dev = target->srp_host->srp_dev;
1373 struct ib_device *ibdev = dev->dev; 1381 struct ib_device *ibdev = dev->dev;
1374 dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); 1382 dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
1375 unsigned int dma_len = ib_sg_dma_len(ibdev, sg); 1383 unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
1376 unsigned int len; 1384 unsigned int len = 0;
1377 int ret; 1385 int ret;
1378 1386
1379 if (!dma_len) 1387 WARN_ON_ONCE(!dma_len);
1380 return 0;
1381
1382 if (!use_mr) {
1383 /*
1384 * Once we're in direct map mode for a request, we don't
1385 * go back to FMR or FR mode, so no need to update anything
1386 * other than the descriptor.
1387 */
1388 srp_map_desc(state, dma_addr, dma_len, target->rkey);
1389 return 0;
1390 }
1391
1392 /*
1393 * Since not all RDMA HW drivers support non-zero page offsets for
1394 * FMR, if we start at an offset into a page, don't merge into the
1395 * current FMR mapping. Finish it out, and use the kernel's MR for
1396 * this sg entry.
1397 */
1398 if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) ||
1399 dma_len > dev->mr_max_size) {
1400 ret = srp_finish_mapping(state, ch);
1401 if (ret)
1402 return ret;
1403
1404 srp_map_desc(state, dma_addr, dma_len, target->rkey);
1405 srp_map_update_start(state, NULL, 0, 0);
1406 return 0;
1407 }
1408
1409 /*
1410 * If this is the first sg that will be mapped via FMR or via FR, save
1411 * our position. We need to know the first unmapped entry, its index,
1412 * and the first unmapped address within that entry to be able to
1413 * restart mapping after an error.
1414 */
1415 if (!state->unmapped_sg)
1416 srp_map_update_start(state, sg, sg_index, dma_addr);
1417 1388
1418 while (dma_len) { 1389 while (dma_len) {
1419 unsigned offset = dma_addr & ~dev->mr_page_mask; 1390 unsigned offset = dma_addr & ~dev->mr_page_mask;
@@ -1421,8 +1392,6 @@ static int srp_map_sg_entry(struct srp_map_state *state,
1421 ret = srp_finish_mapping(state, ch); 1392 ret = srp_finish_mapping(state, ch);
1422 if (ret) 1393 if (ret)
1423 return ret; 1394 return ret;
1424
1425 srp_map_update_start(state, sg, sg_index, dma_addr);
1426 } 1395 }
1427 1396
1428 len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); 1397 len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
@@ -1441,11 +1410,8 @@ static int srp_map_sg_entry(struct srp_map_state *state,
1441 * boundaries. 1410 * boundaries.
1442 */ 1411 */
1443 ret = 0; 1412 ret = 0;
1444 if (len != dev->mr_page_size) { 1413 if (len != dev->mr_page_size)
1445 ret = srp_finish_mapping(state, ch); 1414 ret = srp_finish_mapping(state, ch);
1446 if (!ret)
1447 srp_map_update_start(state, NULL, 0, 0);
1448 }
1449 return ret; 1415 return ret;
1450} 1416}
1451 1417
@@ -1455,50 +1421,80 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
1455{ 1421{
1456 struct srp_target_port *target = ch->target; 1422 struct srp_target_port *target = ch->target;
1457 struct srp_device *dev = target->srp_host->srp_dev; 1423 struct srp_device *dev = target->srp_host->srp_dev;
1458 struct ib_device *ibdev = dev->dev;
1459 struct scatterlist *sg; 1424 struct scatterlist *sg;
1460 int i; 1425 int i, ret;
1461 bool use_mr;
1462 1426
1463 state->desc = req->indirect_desc; 1427 state->desc = req->indirect_desc;
1464 state->pages = req->map_page; 1428 state->pages = req->map_page;
1465 if (dev->use_fast_reg) { 1429 if (dev->use_fast_reg) {
1466 state->next_fr = req->fr_list; 1430 state->fr.next = req->fr_list;
1467 use_mr = !!ch->fr_pool; 1431 state->fr.end = req->fr_list + target->cmd_sg_cnt;
1468 } else { 1432 } else if (dev->use_fmr) {
1469 state->next_fmr = req->fmr_list; 1433 state->fmr.next = req->fmr_list;
1470 use_mr = !!ch->fmr_pool; 1434 state->fmr.end = req->fmr_list + target->cmd_sg_cnt;
1471 } 1435 }
1472 1436
1473 for_each_sg(scat, sg, count, i) { 1437 if (dev->use_fast_reg || dev->use_fmr) {
1474 if (srp_map_sg_entry(state, ch, sg, i, use_mr)) { 1438 for_each_sg(scat, sg, count, i) {
1475 /* 1439 ret = srp_map_sg_entry(state, ch, sg, i);
1476 * Memory registration failed, so backtrack to the 1440 if (ret)
1477 * first unmapped entry and continue on without using 1441 goto out;
1478 * memory registration. 1442 }
1479 */ 1443 ret = srp_finish_mapping(state, ch);
1480 dma_addr_t dma_addr; 1444 if (ret)
1481 unsigned int dma_len; 1445 goto out;
1482 1446 } else {
1483backtrack: 1447 for_each_sg(scat, sg, count, i) {
1484 sg = state->unmapped_sg; 1448 srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
1485 i = state->unmapped_index; 1449 ib_sg_dma_len(dev->dev, sg),
1486 1450 target->global_mr->rkey);
1487 dma_addr = ib_sg_dma_address(ibdev, sg);
1488 dma_len = ib_sg_dma_len(ibdev, sg);
1489 dma_len -= (state->unmapped_addr - dma_addr);
1490 dma_addr = state->unmapped_addr;
1491 use_mr = false;
1492 srp_map_desc(state, dma_addr, dma_len, target->rkey);
1493 } 1451 }
1494 } 1452 }
1495 1453
1496 if (use_mr && srp_finish_mapping(state, ch))
1497 goto backtrack;
1498
1499 req->nmdesc = state->nmdesc; 1454 req->nmdesc = state->nmdesc;
1455 ret = 0;
1500 1456
1501 return 0; 1457out:
1458 return ret;
1459}
1460
1461/*
1462 * Register the indirect data buffer descriptor with the HCA.
1463 *
1464 * Note: since the indirect data buffer descriptor has been allocated with
1465 * kmalloc() it is guaranteed that this buffer is a physically contiguous
1466 * memory buffer.
1467 */
1468static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
1469 void **next_mr, void **end_mr, u32 idb_len,
1470 __be32 *idb_rkey)
1471{
1472 struct srp_target_port *target = ch->target;
1473 struct srp_device *dev = target->srp_host->srp_dev;
1474 struct srp_map_state state;
1475 struct srp_direct_buf idb_desc;
1476 u64 idb_pages[1];
1477 int ret;
1478
1479 memset(&state, 0, sizeof(state));
1480 memset(&idb_desc, 0, sizeof(idb_desc));
1481 state.gen.next = next_mr;
1482 state.gen.end = end_mr;
1483 state.desc = &idb_desc;
1484 state.pages = idb_pages;
1485 state.pages[0] = (req->indirect_dma_addr &
1486 dev->mr_page_mask);
1487 state.npages = 1;
1488 state.base_dma_addr = req->indirect_dma_addr;
1489 state.dma_len = idb_len;
1490 ret = srp_finish_mapping(&state, ch);
1491 if (ret < 0)
1492 goto out;
1493
1494 *idb_rkey = idb_desc.key;
1495
1496out:
1497 return ret;
1502} 1498}
1503 1499
1504static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, 1500static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
@@ -1507,12 +1503,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
1507 struct srp_target_port *target = ch->target; 1503 struct srp_target_port *target = ch->target;
1508 struct scatterlist *scat; 1504 struct scatterlist *scat;
1509 struct srp_cmd *cmd = req->cmd->buf; 1505 struct srp_cmd *cmd = req->cmd->buf;
1510 int len, nents, count; 1506 int len, nents, count, ret;
1511 struct srp_device *dev; 1507 struct srp_device *dev;
1512 struct ib_device *ibdev; 1508 struct ib_device *ibdev;
1513 struct srp_map_state state; 1509 struct srp_map_state state;
1514 struct srp_indirect_buf *indirect_hdr; 1510 struct srp_indirect_buf *indirect_hdr;
1515 u32 table_len; 1511 u32 idb_len, table_len;
1512 __be32 idb_rkey;
1516 u8 fmt; 1513 u8 fmt;
1517 1514
1518 if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) 1515 if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
@@ -1539,7 +1536,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
1539 fmt = SRP_DATA_DESC_DIRECT; 1536 fmt = SRP_DATA_DESC_DIRECT;
1540 len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); 1537 len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf);
1541 1538
1542 if (count == 1 && !register_always) { 1539 if (count == 1 && target->global_mr) {
1543 /* 1540 /*
1544 * The midlayer only generated a single gather/scatter 1541 * The midlayer only generated a single gather/scatter
1545 * entry, or DMA mapping coalesced everything to a 1542 * entry, or DMA mapping coalesced everything to a
@@ -1549,7 +1546,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
1549 struct srp_direct_buf *buf = (void *) cmd->add_data; 1546 struct srp_direct_buf *buf = (void *) cmd->add_data;
1550 1547
1551 buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); 1548 buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
1552 buf->key = cpu_to_be32(target->rkey); 1549 buf->key = cpu_to_be32(target->global_mr->rkey);
1553 buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); 1550 buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
1554 1551
1555 req->nmdesc = 0; 1552 req->nmdesc = 0;
@@ -1594,6 +1591,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
1594 1591
1595 count = min(state.ndesc, target->cmd_sg_cnt); 1592 count = min(state.ndesc, target->cmd_sg_cnt);
1596 table_len = state.ndesc * sizeof (struct srp_direct_buf); 1593 table_len = state.ndesc * sizeof (struct srp_direct_buf);
1594 idb_len = sizeof(struct srp_indirect_buf) + table_len;
1597 1595
1598 fmt = SRP_DATA_DESC_INDIRECT; 1596 fmt = SRP_DATA_DESC_INDIRECT;
1599 len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); 1597 len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
@@ -1602,8 +1600,18 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
1602 memcpy(indirect_hdr->desc_list, req->indirect_desc, 1600 memcpy(indirect_hdr->desc_list, req->indirect_desc,
1603 count * sizeof (struct srp_direct_buf)); 1601 count * sizeof (struct srp_direct_buf));
1604 1602
1603 if (!target->global_mr) {
1604 ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
1605 idb_len, &idb_rkey);
1606 if (ret < 0)
1607 return ret;
1608 req->nmdesc++;
1609 } else {
1610 idb_rkey = target->global_mr->rkey;
1611 }
1612
1605 indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); 1613 indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
1606 indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); 1614 indirect_hdr->table_desc.key = idb_rkey;
1607 indirect_hdr->table_desc.len = cpu_to_be32(table_len); 1615 indirect_hdr->table_desc.len = cpu_to_be32(table_len);
1608 indirect_hdr->len = cpu_to_be32(state.total_len); 1616 indirect_hdr->len = cpu_to_be32(state.total_len);
1609 1617
@@ -2171,7 +2179,7 @@ static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask)
2171} 2179}
2172 2180
2173static void srp_cm_rep_handler(struct ib_cm_id *cm_id, 2181static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
2174 struct srp_login_rsp *lrsp, 2182 const struct srp_login_rsp *lrsp,
2175 struct srp_rdma_ch *ch) 2183 struct srp_rdma_ch *ch)
2176{ 2184{
2177 struct srp_target_port *target = ch->target; 2185 struct srp_target_port *target = ch->target;
@@ -2757,6 +2765,13 @@ static int srp_sdev_count(struct Scsi_Host *host)
2757 return c; 2765 return c;
2758} 2766}
2759 2767
2768/*
2769 * Return values:
2770 * < 0 upon failure. Caller is responsible for SRP target port cleanup.
2771 * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port
2772 * removal has been scheduled.
2773 * 0 and target->state != SRP_TARGET_REMOVED upon success.
2774 */
2760static int srp_add_target(struct srp_host *host, struct srp_target_port *target) 2775static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
2761{ 2776{
2762 struct srp_rport_identifiers ids; 2777 struct srp_rport_identifiers ids;
@@ -3146,8 +3161,8 @@ static ssize_t srp_create_target(struct device *dev,
3146 target->io_class = SRP_REV16A_IB_IO_CLASS; 3161 target->io_class = SRP_REV16A_IB_IO_CLASS;
3147 target->scsi_host = target_host; 3162 target->scsi_host = target_host;
3148 target->srp_host = host; 3163 target->srp_host = host;
3149 target->lkey = host->srp_dev->mr->lkey; 3164 target->lkey = host->srp_dev->pd->local_dma_lkey;
3150 target->rkey = host->srp_dev->mr->rkey; 3165 target->global_mr = host->srp_dev->global_mr;
3151 target->cmd_sg_cnt = cmd_sg_entries; 3166 target->cmd_sg_cnt = cmd_sg_entries;
3152 target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; 3167 target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries;
3153 target->allow_ext_sg = allow_ext_sg; 3168 target->allow_ext_sg = allow_ext_sg;
@@ -3262,7 +3277,7 @@ static ssize_t srp_create_target(struct device *dev,
3262 srp_free_ch_ib(target, ch); 3277 srp_free_ch_ib(target, ch);
3263 srp_free_req_data(target, ch); 3278 srp_free_req_data(target, ch);
3264 target->ch_count = ch - target->ch; 3279 target->ch_count = ch - target->ch;
3265 break; 3280 goto connected;
3266 } 3281 }
3267 } 3282 }
3268 3283
@@ -3272,6 +3287,7 @@ static ssize_t srp_create_target(struct device *dev,
3272 node_idx++; 3287 node_idx++;
3273 } 3288 }
3274 3289
3290connected:
3275 target->scsi_host->nr_hw_queues = target->ch_count; 3291 target->scsi_host->nr_hw_queues = target->ch_count;
3276 3292
3277 ret = srp_add_target(host, target); 3293 ret = srp_add_target(host, target);
@@ -3294,6 +3310,8 @@ out:
3294 mutex_unlock(&host->add_target_mutex); 3310 mutex_unlock(&host->add_target_mutex);
3295 3311
3296 scsi_host_put(target->scsi_host); 3312 scsi_host_put(target->scsi_host);
3313 if (ret < 0)
3314 scsi_host_put(target->scsi_host);
3297 3315
3298 return ret; 3316 return ret;
3299 3317
@@ -3401,6 +3419,7 @@ static void srp_add_one(struct ib_device *device)
3401 3419
3402 srp_dev->use_fast_reg = (srp_dev->has_fr && 3420 srp_dev->use_fast_reg = (srp_dev->has_fr &&
3403 (!srp_dev->has_fmr || prefer_fr)); 3421 (!srp_dev->has_fmr || prefer_fr));
3422 srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
3404 3423
3405 /* 3424 /*
3406 * Use the smallest page size supported by the HCA, down to a 3425 * Use the smallest page size supported by the HCA, down to a
@@ -3433,12 +3452,16 @@ static void srp_add_one(struct ib_device *device)
3433 if (IS_ERR(srp_dev->pd)) 3452 if (IS_ERR(srp_dev->pd))
3434 goto free_dev; 3453 goto free_dev;
3435 3454
3436 srp_dev->mr = ib_get_dma_mr(srp_dev->pd, 3455 if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
3437 IB_ACCESS_LOCAL_WRITE | 3456 srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
3438 IB_ACCESS_REMOTE_READ | 3457 IB_ACCESS_LOCAL_WRITE |
3439 IB_ACCESS_REMOTE_WRITE); 3458 IB_ACCESS_REMOTE_READ |
3440 if (IS_ERR(srp_dev->mr)) 3459 IB_ACCESS_REMOTE_WRITE);
3441 goto err_pd; 3460 if (IS_ERR(srp_dev->global_mr))
3461 goto err_pd;
3462 } else {
3463 srp_dev->global_mr = NULL;
3464 }
3442 3465
3443 for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { 3466 for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
3444 host = srp_add_port(srp_dev, p); 3467 host = srp_add_port(srp_dev, p);
@@ -3460,13 +3483,13 @@ free_attr:
3460 kfree(dev_attr); 3483 kfree(dev_attr);
3461} 3484}
3462 3485
3463static void srp_remove_one(struct ib_device *device) 3486static void srp_remove_one(struct ib_device *device, void *client_data)
3464{ 3487{
3465 struct srp_device *srp_dev; 3488 struct srp_device *srp_dev;
3466 struct srp_host *host, *tmp_host; 3489 struct srp_host *host, *tmp_host;
3467 struct srp_target_port *target; 3490 struct srp_target_port *target;
3468 3491
3469 srp_dev = ib_get_client_data(device, &srp_client); 3492 srp_dev = client_data;
3470 if (!srp_dev) 3493 if (!srp_dev)
3471 return; 3494 return;
3472 3495
@@ -3495,7 +3518,8 @@ static void srp_remove_one(struct ib_device *device)
3495 kfree(host); 3518 kfree(host);
3496 } 3519 }
3497 3520
3498 ib_dereg_mr(srp_dev->mr); 3521 if (srp_dev->global_mr)
3522 ib_dereg_mr(srp_dev->global_mr);
3499 ib_dealloc_pd(srp_dev->pd); 3523 ib_dealloc_pd(srp_dev->pd);
3500 3524
3501 kfree(srp_dev); 3525 kfree(srp_dev);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 17ee3f80ba55..3608f2e4819c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -95,13 +95,14 @@ struct srp_device {
95 struct list_head dev_list; 95 struct list_head dev_list;
96 struct ib_device *dev; 96 struct ib_device *dev;
97 struct ib_pd *pd; 97 struct ib_pd *pd;
98 struct ib_mr *mr; 98 struct ib_mr *global_mr;
99 u64 mr_page_mask; 99 u64 mr_page_mask;
100 int mr_page_size; 100 int mr_page_size;
101 int mr_max_size; 101 int mr_max_size;
102 int max_pages_per_mr; 102 int max_pages_per_mr;
103 bool has_fmr; 103 bool has_fmr;
104 bool has_fr; 104 bool has_fr;
105 bool use_fmr;
105 bool use_fast_reg; 106 bool use_fast_reg;
106}; 107};
107 108
@@ -182,10 +183,10 @@ struct srp_target_port {
182 spinlock_t lock; 183 spinlock_t lock;
183 184
184 /* read only in the hot path */ 185 /* read only in the hot path */
186 struct ib_mr *global_mr;
185 struct srp_rdma_ch *ch; 187 struct srp_rdma_ch *ch;
186 u32 ch_count; 188 u32 ch_count;
187 u32 lkey; 189 u32 lkey;
188 u32 rkey;
189 enum srp_target_state state; 190 enum srp_target_state state;
190 unsigned int max_iu_len; 191 unsigned int max_iu_len;
191 unsigned int cmd_sg_cnt; 192 unsigned int cmd_sg_cnt;
@@ -276,14 +277,21 @@ struct srp_fr_pool {
276 * @npages: Number of page addresses in the pages[] array. 277 * @npages: Number of page addresses in the pages[] array.
277 * @nmdesc: Number of FMR or FR memory descriptors used for mapping. 278 * @nmdesc: Number of FMR or FR memory descriptors used for mapping.
278 * @ndesc: Number of SRP buffer descriptors that have been filled in. 279 * @ndesc: Number of SRP buffer descriptors that have been filled in.
279 * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR.
280 * @unmapped_index: Index of the first element mapped via FMR or FR.
281 * @unmapped_addr: DMA address of the first element mapped via FMR or FR.
282 */ 280 */
283struct srp_map_state { 281struct srp_map_state {
284 union { 282 union {
285 struct ib_pool_fmr **next_fmr; 283 struct {
286 struct srp_fr_desc **next_fr; 284 struct ib_pool_fmr **next;
285 struct ib_pool_fmr **end;
286 } fmr;
287 struct {
288 struct srp_fr_desc **next;
289 struct srp_fr_desc **end;
290 } fr;
291 struct {
292 void **next;
293 void **end;
294 } gen;
287 }; 295 };
288 struct srp_direct_buf *desc; 296 struct srp_direct_buf *desc;
289 u64 *pages; 297 u64 *pages;
@@ -293,9 +301,6 @@ struct srp_map_state {
293 unsigned int npages; 301 unsigned int npages;
294 unsigned int nmdesc; 302 unsigned int nmdesc;
295 unsigned int ndesc; 303 unsigned int ndesc;
296 struct scatterlist *unmapped_sg;
297 int unmapped_index;
298 dma_addr_t unmapped_addr;
299}; 304};
300 305
301#endif /* IB_SRP_H */ 306#endif /* IB_SRP_H */
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 60ff0a2390e5..f6fe0414139b 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -783,7 +783,7 @@ static int srpt_post_recv(struct srpt_device *sdev,
783 783
784 list.addr = ioctx->ioctx.dma; 784 list.addr = ioctx->ioctx.dma;
785 list.length = srp_max_req_size; 785 list.length = srp_max_req_size;
786 list.lkey = sdev->mr->lkey; 786 list.lkey = sdev->pd->local_dma_lkey;
787 787
788 wr.next = NULL; 788 wr.next = NULL;
789 wr.sg_list = &list; 789 wr.sg_list = &list;
@@ -818,7 +818,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
818 818
819 list.addr = ioctx->ioctx.dma; 819 list.addr = ioctx->ioctx.dma;
820 list.length = len; 820 list.length = len;
821 list.lkey = sdev->mr->lkey; 821 list.lkey = sdev->pd->local_dma_lkey;
822 822
823 wr.next = NULL; 823 wr.next = NULL;
824 wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); 824 wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
@@ -1206,7 +1206,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
1206 1206
1207 while (rsize > 0 && tsize > 0) { 1207 while (rsize > 0 && tsize > 0) {
1208 sge->addr = dma_addr; 1208 sge->addr = dma_addr;
1209 sge->lkey = ch->sport->sdev->mr->lkey; 1209 sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
1210 1210
1211 if (rsize >= dma_len) { 1211 if (rsize >= dma_len) {
1212 sge->length = 1212 sge->length =
@@ -3211,10 +3211,6 @@ static void srpt_add_one(struct ib_device *device)
3211 if (IS_ERR(sdev->pd)) 3211 if (IS_ERR(sdev->pd))
3212 goto free_dev; 3212 goto free_dev;
3213 3213
3214 sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE);
3215 if (IS_ERR(sdev->mr))
3216 goto err_pd;
3217
3218 sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); 3214 sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
3219 3215
3220 srq_attr.event_handler = srpt_srq_event; 3216 srq_attr.event_handler = srpt_srq_event;
@@ -3226,7 +3222,7 @@ static void srpt_add_one(struct ib_device *device)
3226 3222
3227 sdev->srq = ib_create_srq(sdev->pd, &srq_attr); 3223 sdev->srq = ib_create_srq(sdev->pd, &srq_attr);
3228 if (IS_ERR(sdev->srq)) 3224 if (IS_ERR(sdev->srq))
3229 goto err_mr; 3225 goto err_pd;
3230 3226
3231 pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", 3227 pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
3232 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, 3228 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
@@ -3250,7 +3246,7 @@ static void srpt_add_one(struct ib_device *device)
3250 * in the system as service_id; therefore, the target_id will change 3246 * in the system as service_id; therefore, the target_id will change
3251 * if this HCA is gone bad and replaced by different HCA 3247 * if this HCA is gone bad and replaced by different HCA
3252 */ 3248 */
3253 if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL)) 3249 if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0))
3254 goto err_cm; 3250 goto err_cm;
3255 3251
3256 INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, 3252 INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device,
@@ -3311,8 +3307,6 @@ err_cm:
3311 ib_destroy_cm_id(sdev->cm_id); 3307 ib_destroy_cm_id(sdev->cm_id);
3312err_srq: 3308err_srq:
3313 ib_destroy_srq(sdev->srq); 3309 ib_destroy_srq(sdev->srq);
3314err_mr:
3315 ib_dereg_mr(sdev->mr);
3316err_pd: 3310err_pd:
3317 ib_dealloc_pd(sdev->pd); 3311 ib_dealloc_pd(sdev->pd);
3318free_dev: 3312free_dev:
@@ -3326,12 +3320,11 @@ err:
3326/** 3320/**
3327 * srpt_remove_one() - InfiniBand device removal callback function. 3321 * srpt_remove_one() - InfiniBand device removal callback function.
3328 */ 3322 */
3329static void srpt_remove_one(struct ib_device *device) 3323static void srpt_remove_one(struct ib_device *device, void *client_data)
3330{ 3324{
3331 struct srpt_device *sdev; 3325 struct srpt_device *sdev = client_data;
3332 int i; 3326 int i;
3333 3327
3334 sdev = ib_get_client_data(device, &srpt_client);
3335 if (!sdev) { 3328 if (!sdev) {
3336 pr_info("%s(%s): nothing to do.\n", __func__, device->name); 3329 pr_info("%s(%s): nothing to do.\n", __func__, device->name);
3337 return; 3330 return;
@@ -3358,7 +3351,6 @@ static void srpt_remove_one(struct ib_device *device)
3358 srpt_release_sdev(sdev); 3351 srpt_release_sdev(sdev);
3359 3352
3360 ib_destroy_srq(sdev->srq); 3353 ib_destroy_srq(sdev->srq);
3361 ib_dereg_mr(sdev->mr);
3362 ib_dealloc_pd(sdev->pd); 3354 ib_dealloc_pd(sdev->pd);
3363 3355
3364 srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, 3356 srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 21f8df67522a..5faad8acd789 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -393,7 +393,6 @@ struct srpt_port {
393struct srpt_device { 393struct srpt_device {
394 struct ib_device *device; 394 struct ib_device *device;
395 struct ib_pd *pd; 395 struct ib_pd *pd;
396 struct ib_mr *mr;
397 struct ib_srq *srq; 396 struct ib_srq *srq;
398 struct ib_cm_id *cm_id; 397 struct ib_cm_id *cm_id;
399 struct ib_device_attr dev_attr; 398 struct ib_device_attr dev_attr;
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 6dda57e2e724..55e93b6b6d21 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -737,19 +737,6 @@ static int bond_option_mode_set(struct bonding *bond,
737 return 0; 737 return 0;
738} 738}
739 739
740static struct net_device *__bond_option_active_slave_get(struct bonding *bond,
741 struct slave *slave)
742{
743 return bond_uses_primary(bond) && slave ? slave->dev : NULL;
744}
745
746struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
747{
748 struct slave *slave = rcu_dereference(bond->curr_active_slave);
749
750 return __bond_option_active_slave_get(bond, slave);
751}
752
753static int bond_option_active_slave_set(struct bonding *bond, 740static int bond_option_active_slave_set(struct bonding *bond,
754 const struct bond_opt_value *newval) 741 const struct bond_opt_value *newval)
755{ 742{
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
index 913b716ed2e1..a946e4bf71d2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -224,6 +224,26 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
224 kfree(mdev); 224 kfree(mdev);
225} 225}
226 226
227static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
228{
229 int i;
230 struct mlx4_en_dev *mdev = ctx;
231
232 /* Create a netdev for each port */
233 mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
234 mlx4_info(mdev, "Activating port:%d\n", i);
235 if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
236 mdev->pndev[i] = NULL;
237 }
238
239 /* register notifier */
240 mdev->nb.notifier_call = mlx4_en_netdev_event;
241 if (register_netdevice_notifier(&mdev->nb)) {
242 mdev->nb.notifier_call = NULL;
243 mlx4_err(mdev, "Failed to create notifier\n");
244 }
245}
246
227static void *mlx4_en_add(struct mlx4_dev *dev) 247static void *mlx4_en_add(struct mlx4_dev *dev)
228{ 248{
229 struct mlx4_en_dev *mdev; 249 struct mlx4_en_dev *mdev;
@@ -297,21 +317,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
297 mutex_init(&mdev->state_lock); 317 mutex_init(&mdev->state_lock);
298 mdev->device_up = true; 318 mdev->device_up = true;
299 319
300 /* Setup ports */
301
302 /* Create a netdev for each port */
303 mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
304 mlx4_info(mdev, "Activating port:%d\n", i);
305 if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
306 mdev->pndev[i] = NULL;
307 }
308 /* register notifier */
309 mdev->nb.notifier_call = mlx4_en_netdev_event;
310 if (register_netdevice_notifier(&mdev->nb)) {
311 mdev->nb.notifier_call = NULL;
312 mlx4_err(mdev, "Failed to create notifier\n");
313 }
314
315 return mdev; 320 return mdev;
316 321
317err_mr: 322err_mr:
@@ -335,6 +340,7 @@ static struct mlx4_interface mlx4_en_interface = {
335 .event = mlx4_en_event, 340 .event = mlx4_en_event,
336 .get_dev = mlx4_en_get_netdev, 341 .get_dev = mlx4_en_get_netdev,
337 .protocol = MLX4_PROT_ETH, 342 .protocol = MLX4_PROT_ETH,
343 .activate = mlx4_en_activate,
338}; 344};
339 345
340static void mlx4_en_verify_params(void) 346static void mlx4_en_verify_params(void)
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 0d80aed59043..0472941af820 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -63,8 +63,11 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
63 spin_lock_irq(&priv->ctx_lock); 63 spin_lock_irq(&priv->ctx_lock);
64 list_add_tail(&dev_ctx->list, &priv->ctx_list); 64 list_add_tail(&dev_ctx->list, &priv->ctx_list);
65 spin_unlock_irq(&priv->ctx_lock); 65 spin_unlock_irq(&priv->ctx_lock);
66 if (intf->activate)
67 intf->activate(&priv->dev, dev_ctx->context);
66 } else 68 } else
67 kfree(dev_ctx); 69 kfree(dev_ctx);
70
68} 71}
69 72
70static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv) 73static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 9335e5ae18cc..aa0d5ffe92d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -200,3 +200,25 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
200 200
201 return err; 201 return err;
202} 202}
203
204int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey)
205{
206 struct mlx5_cmd_query_special_contexts_mbox_in in;
207 struct mlx5_cmd_query_special_contexts_mbox_out out;
208 int err;
209
210 memset(&in, 0, sizeof(in));
211 memset(&out, 0, sizeof(out));
212 in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
213 err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
214 if (err)
215 return err;
216
217 if (out.hdr.status)
218 err = mlx5_cmd_status_to_err(&out.hdr);
219
220 *rsvd_lkey = be32_to_cpu(out.resd_lkey);
221
222 return err;
223}
224EXPORT_SYMBOL(mlx5_core_query_special_context);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index e29293c0c71e..39d950584c9f 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -72,6 +72,8 @@ source "drivers/staging/nvec/Kconfig"
72 72
73source "drivers/staging/media/Kconfig" 73source "drivers/staging/media/Kconfig"
74 74
75source "drivers/staging/rdma/Kconfig"
76
75source "drivers/staging/android/Kconfig" 77source "drivers/staging/android/Kconfig"
76 78
77source "drivers/staging/board/Kconfig" 79source "drivers/staging/board/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 50824dde2c09..e4f33d91872b 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_FT1000) += ft1000/
29obj-$(CONFIG_SPEAKUP) += speakup/ 29obj-$(CONFIG_SPEAKUP) += speakup/
30obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4) += ste_rmi4/ 30obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4) += ste_rmi4/
31obj-$(CONFIG_MFD_NVEC) += nvec/ 31obj-$(CONFIG_MFD_NVEC) += nvec/
32obj-$(CONFIG_STAGING_RDMA) += rdma/
32obj-$(CONFIG_ANDROID) += android/ 33obj-$(CONFIG_ANDROID) += android/
33obj-$(CONFIG_STAGING_BOARD) += board/ 34obj-$(CONFIG_STAGING_BOARD) += board/
34obj-$(CONFIG_WIMAX_GDM72XX) += gdm72xx/ 35obj-$(CONFIG_WIMAX_GDM72XX) += gdm72xx/
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
new file mode 100644
index 000000000000..cf5fe9bb87a1
--- /dev/null
+++ b/drivers/staging/rdma/Kconfig
@@ -0,0 +1,31 @@
1menuconfig STAGING_RDMA
2 bool "RDMA staging drivers"
3 depends on INFINIBAND
4 depends on PCI || BROKEN
5 depends on HAS_IOMEM
6 depends on NET
7 depends on INET
8 default n
9 ---help---
10 This option allows you to select a number of RDMA drivers that
11 fall into one of two categories: deprecated drivers being held
 12	 here before finally being removed, or new drivers that still need
13 some work before being moved to the normal RDMA driver area.
14
15 If you wish to work on these drivers, to help improve them, or
16 to report problems you have with them, please use the
17 linux-rdma@vger.kernel.org mailing list.
18
19 If in doubt, say N here.
20
21
22# Please keep entries in alphabetic order
23if STAGING_RDMA
24
25source "drivers/staging/rdma/amso1100/Kconfig"
26
27source "drivers/staging/rdma/hfi1/Kconfig"
28
29source "drivers/staging/rdma/ipath/Kconfig"
30
31endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
new file mode 100644
index 000000000000..cbd915ac7f20
--- /dev/null
+++ b/drivers/staging/rdma/Makefile
@@ -0,0 +1,4 @@
1# Entries for RDMA_STAGING tree
2obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/
3obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
4obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild
index 950dfabcd89d..950dfabcd89d 100644
--- a/drivers/infiniband/hw/amso1100/Kbuild
+++ b/drivers/staging/rdma/amso1100/Kbuild
diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig
index e6ce5f209e47..e6ce5f209e47 100644
--- a/drivers/infiniband/hw/amso1100/Kconfig
+++ b/drivers/staging/rdma/amso1100/Kconfig
diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO
new file mode 100644
index 000000000000..18b00a5cb549
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/TODO
@@ -0,0 +1,4 @@
17/2015
2
3The amso1100 driver has been deprecated and moved to drivers/staging.
4It will be removed in the 4.6 merge window.
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c
index 766a71ccefed..766a71ccefed 100644
--- a/drivers/infiniband/hw/amso1100/c2.c
+++ b/drivers/staging/rdma/amso1100/c2.c
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h
index d619d735838b..d619d735838b 100644
--- a/drivers/infiniband/hw/amso1100/c2.h
+++ b/drivers/staging/rdma/amso1100/c2.h
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c
index cedda25232be..cedda25232be 100644
--- a/drivers/infiniband/hw/amso1100/c2_ae.c
+++ b/drivers/staging/rdma/amso1100/c2_ae.c
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h
index 3a065c33b83b..3a065c33b83b 100644
--- a/drivers/infiniband/hw/amso1100/c2_ae.h
+++ b/drivers/staging/rdma/amso1100/c2_ae.h
diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c
index 78d247ec6961..78d247ec6961 100644
--- a/drivers/infiniband/hw/amso1100/c2_alloc.c
+++ b/drivers/staging/rdma/amso1100/c2_alloc.c
diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c
index 23bfa94fbd4e..23bfa94fbd4e 100644
--- a/drivers/infiniband/hw/amso1100/c2_cm.c
+++ b/drivers/staging/rdma/amso1100/c2_cm.c
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
index 1b63185b4ad4..1b63185b4ad4 100644
--- a/drivers/infiniband/hw/amso1100/c2_cq.c
+++ b/drivers/staging/rdma/amso1100/c2_cq.c
diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c
index 3a17d9b36dba..3a17d9b36dba 100644
--- a/drivers/infiniband/hw/amso1100/c2_intr.c
+++ b/drivers/staging/rdma/amso1100/c2_intr.c
diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c
index 119c4f3d9791..119c4f3d9791 100644
--- a/drivers/infiniband/hw/amso1100/c2_mm.c
+++ b/drivers/staging/rdma/amso1100/c2_mm.c
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c
index 0cddc49beae1..0cddc49beae1 100644
--- a/drivers/infiniband/hw/amso1100/c2_mq.c
+++ b/drivers/staging/rdma/amso1100/c2_mq.c
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h
index fc1b9a7cec4b..fc1b9a7cec4b 100644
--- a/drivers/infiniband/hw/amso1100/c2_mq.h
+++ b/drivers/staging/rdma/amso1100/c2_mq.h
diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c
index f3e81dc357bb..f3e81dc357bb 100644
--- a/drivers/infiniband/hw/amso1100/c2_pd.c
+++ b/drivers/staging/rdma/amso1100/c2_pd.c
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
index 25c3f0085563..25c3f0085563 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/staging/rdma/amso1100/c2_provider.c
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h
index bf189987711f..bf189987711f 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.h
+++ b/drivers/staging/rdma/amso1100/c2_provider.h
diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c
index 86708dee58b1..86708dee58b1 100644
--- a/drivers/infiniband/hw/amso1100/c2_qp.c
+++ b/drivers/staging/rdma/amso1100/c2_qp.c
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c
index d2a6d961344b..d2a6d961344b 100644
--- a/drivers/infiniband/hw/amso1100/c2_rnic.c
+++ b/drivers/staging/rdma/amso1100/c2_rnic.c
diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h
index 6ee4aa92d875..6ee4aa92d875 100644
--- a/drivers/infiniband/hw/amso1100/c2_status.h
+++ b/drivers/staging/rdma/amso1100/c2_status.h
diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h
index 7e9e7ad65467..7e9e7ad65467 100644
--- a/drivers/infiniband/hw/amso1100/c2_user.h
+++ b/drivers/staging/rdma/amso1100/c2_user.h
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c
index 2ec716fb2edb..2ec716fb2edb 100644
--- a/drivers/infiniband/hw/amso1100/c2_vq.c
+++ b/drivers/staging/rdma/amso1100/c2_vq.c
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h
index 33805627a607..33805627a607 100644
--- a/drivers/infiniband/hw/amso1100/c2_vq.h
+++ b/drivers/staging/rdma/amso1100/c2_vq.h
diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h
index 8d4b4ca463ca..8d4b4ca463ca 100644
--- a/drivers/infiniband/hw/amso1100/c2_wr.h
+++ b/drivers/staging/rdma/amso1100/c2_wr.h
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
new file mode 100644
index 000000000000..fd25078ee923
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Kconfig
@@ -0,0 +1,37 @@
1config INFINIBAND_HFI1
2 tristate "Intel OPA Gen1 support"
3 depends on X86_64
4 default m
5 ---help---
  6	 This is a low-level driver for the Intel OPA Gen1 adapter.
7config HFI1_DEBUG_SDMA_ORDER
8 bool "HFI1 SDMA Order debug"
9 depends on INFINIBAND_HFI1
10 default n
11 ---help---
 12	 This is a debug flag to test for out-of-order
 13	 SDMA completions for unit testing
14config HFI1_VERBS_31BIT_PSN
15 bool "HFI1 enable 31 bit PSN"
16 depends on INFINIBAND_HFI1
17 default y
18 ---help---
 19	 Setting this enables 31-bit PSNs
 20	 for verbs RC/UC
21config SDMA_VERBOSITY
22 bool "Config SDMA Verbosity"
23 depends on INFINIBAND_HFI1
24 default n
25 ---help---
26 This is a configuration flag to enable verbose
27 SDMA debug
28config PRESCAN_RXQ
29 bool "Enable prescanning of the RX queue for ECNs"
30 depends on INFINIBAND_HFI1
31 default n
32 ---help---
33 This option toggles the prescanning of the receive queue for
34 Explicit Congestion Notifications. If an ECN is detected, it
 35	 is processed as quickly as possible and the ECN is toggled off.
36 After the prescanning step, the receive queue is processed as
37 usual.
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
new file mode 100644
index 000000000000..2e5daa6cdcc2
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Makefile
@@ -0,0 +1,19 @@
1#
2# HFI driver
3#
4#
5#
6# Called from the kernel module build system.
7#
8obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
9
10hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \
11 init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \
12 qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \
13 uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
14hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
15
16CFLAGS_trace.o = -I$(src)
17ifdef MVERSION
18CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
19endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
new file mode 100644
index 000000000000..05de0dad8762
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/TODO
@@ -0,0 +1,6 @@
1July, 2015
2
3- Remove unneeded file entries in sysfs
4- Remove software processing of IB protocol and place in library for use
5 by qib, ipath (if still present), hfi1, and eventually soft-roce
6
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
new file mode 100644
index 000000000000..654eafef1d30
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -0,0 +1,10798 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51/*
52 * This file contains all of the code that is specific to the HFI chip
53 */
54
55#include <linux/pci.h>
56#include <linux/delay.h>
57#include <linux/interrupt.h>
58#include <linux/module.h>
59
60#include "hfi.h"
61#include "trace.h"
62#include "mad.h"
63#include "pio.h"
64#include "sdma.h"
65#include "eprom.h"
66
67#define NUM_IB_PORTS 1
68
69uint kdeth_qp;
70module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
71MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
72
73uint num_vls = HFI1_MAX_VLS_SUPPORTED;
74module_param(num_vls, uint, S_IRUGO);
75MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
76
77/*
78 * Default time to aggregate two 10K packets from the idle state
79 * (timer not running). The timer starts at the end of the first packet,
80 * so only the time for one 10K packet and header plus a bit extra is needed.
81 * 10 * 1024 + 64 header byte = 10304 byte
82 * 10304 byte / 12.5 GB/s = 824.32ns
83 */
84uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
85module_param(rcv_intr_timeout, uint, S_IRUGO);
86MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
87
88uint rcv_intr_count = 16; /* same as qib */
89module_param(rcv_intr_count, uint, S_IRUGO);
90MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
91
92ushort link_crc_mask = SUPPORTED_CRCS;
93module_param(link_crc_mask, ushort, S_IRUGO);
94MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
95
96uint loopback;
 97module_param_named(loopback, loopback, uint, S_IRUGO);
 98MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable)");
 98module_param_named(loopback, loopback, uint, S_IRUGO);
99
100/* Other driver tunables */
101uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
102static ushort crc_14b_sideband = 1;
103static uint use_flr = 1;
104uint quick_linkup; /* skip LNI */
105
106struct flag_table {
107 u64 flag; /* the flag */
108 char *str; /* description string */
109 u16 extra; /* extra information */
110 u16 unused0;
111 u32 unused1;
112};
113
114/* str must be a string constant */
115#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
116#define FLAG_ENTRY0(str, flag) {flag, str, 0}
117
118/* Send Error Consequences */
119#define SEC_WRITE_DROPPED 0x1
120#define SEC_PACKET_DROPPED 0x2
121#define SEC_SC_HALTED 0x4 /* per-context only */
122#define SEC_SPC_FREEZE 0x8 /* per-HFI only */
123
124#define VL15CTXT 1
125#define MIN_KERNEL_KCTXTS 2
126#define NUM_MAP_REGS 32
127
128/* Bit offset into the GUID which carries HFI id information */
129#define GUID_HFI_INDEX_SHIFT 39
130
131/* extract the emulation revision */
132#define emulator_rev(dd) ((dd)->irev >> 8)
133/* parallel and serial emulation versions are 3 and 4 respectively */
134#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
135#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
136
137/* RSM fields */
138
139/* packet type */
140#define IB_PACKET_TYPE 2ull
141#define QW_SHIFT 6ull
142/* QPN[7..1] */
143#define QPN_WIDTH 7ull
144
145/* LRH.BTH: QW 0, OFFSET 48 - for match */
146#define LRH_BTH_QW 0ull
147#define LRH_BTH_BIT_OFFSET 48ull
148#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off))
149#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
150#define LRH_BTH_SELECT
151#define LRH_BTH_MASK 3ull
152#define LRH_BTH_VALUE 2ull
153
154/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
155#define LRH_SC_QW 0ull
156#define LRH_SC_BIT_OFFSET 56ull
157#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off))
158#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
159#define LRH_SC_MASK 128ull
160#define LRH_SC_VALUE 0ull
161
162/* SC[n..0] QW 0, OFFSET 60 - for select */
163#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull))
164
165/* QPN[m+n:1] QW 1, OFFSET 1 */
166#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull))
167
168/* defines to build power on SC2VL table */
169#define SC2VL_VAL( \
170 num, \
171 sc0, sc0val, \
172 sc1, sc1val, \
173 sc2, sc2val, \
174 sc3, sc3val, \
175 sc4, sc4val, \
176 sc5, sc5val, \
177 sc6, sc6val, \
178 sc7, sc7val) \
179( \
180 ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
181 ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
182 ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
183 ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
184 ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
185 ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
186 ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
187 ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \
188)
189
190#define DC_SC_VL_VAL( \
191 range, \
192 e0, e0val, \
193 e1, e1val, \
194 e2, e2val, \
195 e3, e3val, \
196 e4, e4val, \
197 e5, e5val, \
198 e6, e6val, \
199 e7, e7val, \
200 e8, e8val, \
201 e9, e9val, \
202 e10, e10val, \
203 e11, e11val, \
204 e12, e12val, \
205 e13, e13val, \
206 e14, e14val, \
207 e15, e15val) \
208( \
209 ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
210 ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
211 ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
212 ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
213 ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
214 ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
215 ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
216 ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
217 ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
218 ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
219 ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
220 ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
221 ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
222 ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
223 ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
224 ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
225)
226
227/* all CceStatus sub-block freeze bits */
228#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
229 | CCE_STATUS_RXE_FROZE_SMASK \
230 | CCE_STATUS_TXE_FROZE_SMASK \
231 | CCE_STATUS_TXE_PIO_FROZE_SMASK)
232/* all CceStatus sub-block TXE pause bits */
233#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
234 | CCE_STATUS_TXE_PAUSED_SMASK \
235 | CCE_STATUS_SDMA_PAUSED_SMASK)
236/* all CceStatus sub-block RXE pause bits */
237#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
238
239/*
240 * CCE Error flags.
241 */
242static struct flag_table cce_err_status_flags[] = {
243/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
244 CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
245/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
246 CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
247/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
248 CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
249/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
250 CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
251/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
252 CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
253/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
254 CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
255/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
256 CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
257/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
258 CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
259/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
260 CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
261/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
262 CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
 263/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoSdmaHdParityErr",
264 CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
265/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
266 CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
267/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
268 CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
269/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
270 CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
 271/*14*/ FLAG_ENTRY0("PcicRetrySotMemCorErr",
272 CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
273/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
274 CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
 275/*16*/ FLAG_ENTRY0("PcicPostDatQCorErr",
276 CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
 277/*17*/ FLAG_ENTRY0("PcicCplHdQCorErr",
278 CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
279/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
280 CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
281/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
282 CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
283/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
284 CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
285/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
286 CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
287/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
288 CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
289/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
290 CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
291/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
292 CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
293/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
294 CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
295/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
296 CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
297/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
298 CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
299/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
300 CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
301/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
302 CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
303/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
304 CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
305/*31*/ FLAG_ENTRY0("LATriggered",
306 CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
307/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
308 CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
309/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
310 CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
311/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
312 CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
313/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
314 CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
315/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
316 CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
317/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
318 CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
319/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
320 CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
321/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
322 CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
323/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
324 CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
325/*41-63 reserved*/
326};
327
328/*
329 * Misc Error flags
330 */
331#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
332static struct flag_table misc_err_status_flags[] = {
333/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
334/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
335/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
336/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
337/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
338/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
339/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
340/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
341/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
342/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
343/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
344/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
345/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
346};
347
348/*
349 * TXE PIO Error flags and consequences
350 */
351static struct flag_table pio_err_status_flags[] = {
352/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
353 SEC_WRITE_DROPPED,
354 SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
355/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
356 SEC_SPC_FREEZE,
357 SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
358/* 2*/ FLAG_ENTRY("PioCsrParity",
359 SEC_SPC_FREEZE,
360 SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
361/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
362 SEC_SPC_FREEZE,
363 SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
364/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
365 SEC_SPC_FREEZE,
366 SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
367/* 5*/ FLAG_ENTRY("PioPccFifoParity",
368 SEC_SPC_FREEZE,
369 SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
370/* 6*/ FLAG_ENTRY("PioPecFifoParity",
371 SEC_SPC_FREEZE,
372 SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
373/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
374 SEC_SPC_FREEZE,
375 SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
376/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
377 SEC_SPC_FREEZE,
378 SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
379/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
380 SEC_SPC_FREEZE,
381 SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
382/*10*/ FLAG_ENTRY("PioSmPktResetParity",
383 SEC_SPC_FREEZE,
384 SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
385/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
386 SEC_SPC_FREEZE,
387 SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
388/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
389 SEC_SPC_FREEZE,
390 SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
391/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
392 0,
393 SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
394/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
395 0,
396 SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
397/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
398 SEC_SPC_FREEZE,
399 SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
400/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
401 SEC_SPC_FREEZE,
402 SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
403/*17*/ FLAG_ENTRY("PioInitSmIn",
404 0,
405 SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
406/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
407 SEC_SPC_FREEZE,
408 SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
409/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
410 SEC_SPC_FREEZE,
411 SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
412/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
413 0,
414 SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
415/*21*/ FLAG_ENTRY("PioWriteDataParity",
416 SEC_SPC_FREEZE,
417 SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
418/*22*/ FLAG_ENTRY("PioStateMachine",
419 SEC_SPC_FREEZE,
420 SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
421/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
422 SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
423 SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
424/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
425 SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
426 SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
427/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
428 SEC_SPC_FREEZE,
429 SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
430/*26*/ FLAG_ENTRY("PioVlfSopParity",
431 SEC_SPC_FREEZE,
432 SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
433/*27*/ FLAG_ENTRY("PioVlFifoParity",
434 SEC_SPC_FREEZE,
435 SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
436/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
437 SEC_SPC_FREEZE,
438 SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
439/*29*/ FLAG_ENTRY("PioPpmcSopLen",
440 SEC_SPC_FREEZE,
441 SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
442/*30-31 reserved*/
443/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
444 SEC_SPC_FREEZE,
445 SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
446/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
447 SEC_SPC_FREEZE,
448 SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
449/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
450 SEC_SPC_FREEZE,
451 SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
452/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
453 SEC_SPC_FREEZE,
454 SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
455/*36-63 reserved*/
456};
457
458/* TXE PIO errors that cause an SPC freeze */
459#define ALL_PIO_FREEZE_ERR \
460 (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
461 | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
462 | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
463 | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
464 | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
465 | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
466 | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
467 | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
468 | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
469 | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
470 | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
471 | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
472 | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
473 | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
474 | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
475 | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
476 | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
477 | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
478 | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
479 | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
480 | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
481 | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
482 | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
483 | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
484 | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
485 | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
486 | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
487 | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
488 | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
489
490/*
491 * TXE SDMA Error flags
492 */
493static struct flag_table sdma_err_status_flags[] = {
494/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
495 SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
496/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
497 SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
498/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
499 SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
500/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
501 SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
502/*04-63 reserved*/
503};
504
505/* TXE SDMA errors that cause an SPC freeze */
506#define ALL_SDMA_FREEZE_ERR \
507 (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
508 | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
509 | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
510
511/*
512 * TXE Egress Error flags
513 */
514#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
515static struct flag_table egress_err_status_flags[] = {
516/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
517/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
518/* 2 reserved */
519/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
520 SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
521/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
522/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
523/* 6 reserved */
524/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
525 SEES(TX_PIO_LAUNCH_INTF_PARITY)),
526/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
527 SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
528/* 9-10 reserved */
529/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
530 SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
531/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
532/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
533/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
534/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
535/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
536 SEES(TX_SDMA0_DISALLOWED_PACKET)),
537/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
538 SEES(TX_SDMA1_DISALLOWED_PACKET)),
539/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
540 SEES(TX_SDMA2_DISALLOWED_PACKET)),
541/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
542 SEES(TX_SDMA3_DISALLOWED_PACKET)),
543/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
544 SEES(TX_SDMA4_DISALLOWED_PACKET)),
545/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
546 SEES(TX_SDMA5_DISALLOWED_PACKET)),
547/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
548 SEES(TX_SDMA6_DISALLOWED_PACKET)),
549/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
550 SEES(TX_SDMA7_DISALLOWED_PACKET)),
551/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
552 SEES(TX_SDMA8_DISALLOWED_PACKET)),
553/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
554 SEES(TX_SDMA9_DISALLOWED_PACKET)),
555/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
556 SEES(TX_SDMA10_DISALLOWED_PACKET)),
557/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
558 SEES(TX_SDMA11_DISALLOWED_PACKET)),
559/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
560 SEES(TX_SDMA12_DISALLOWED_PACKET)),
561/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
562 SEES(TX_SDMA13_DISALLOWED_PACKET)),
563/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
564 SEES(TX_SDMA14_DISALLOWED_PACKET)),
565/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
566 SEES(TX_SDMA15_DISALLOWED_PACKET)),
567/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
568 SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
569/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
570 SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
571/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
572 SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
573/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
574 SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
575/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
576 SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
577/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
578 SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
579/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
580 SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
581/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
582 SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
583/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
584 SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
585/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
586/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
587/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
588/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
589/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
590/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
591/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
592/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
593/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
594/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
595/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
596/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
597/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
598/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
599/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
600/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
601/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
602/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
603/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
604/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
605/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
606/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
607 SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
608/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
609 SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
610};
611
612/*
613 * TXE Egress Error Info flags
614 */
615#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
616static struct flag_table egress_err_info_flags[] = {
617/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
618/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
619/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
620/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
621/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
622/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
623/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
624/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
625/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
626/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
627/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
628/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
629/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
630/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
631/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
632/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
633/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
634/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
635/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
636/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
637/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
638/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
639};
640
641/* TXE Egress errors that cause an SPC freeze */
642#define ALL_TXE_EGRESS_FREEZE_ERR \
643 (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
644 | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
645 | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
646 | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
647 | SEES(TX_LAUNCH_CSR_PARITY) \
648 | SEES(TX_SBRD_CTL_CSR_PARITY) \
649 | SEES(TX_CONFIG_PARITY) \
650 | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
651 | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
652 | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
653 | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
654 | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
655 | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
656 | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
657 | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
658 | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
659 | SEES(TX_CREDIT_RETURN_PARITY))
660
661/*
662 * TXE Send error flags
663 */
664#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
665static struct flag_table send_err_status_flags[] = {
666/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)),
667/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
668/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
669};
670
671/*
672 * TXE Send Context Error flags and consequences
673 */
674static struct flag_table sc_err_status_flags[] = {
675/* 0*/ FLAG_ENTRY("InconsistentSop",
676 SEC_PACKET_DROPPED | SEC_SC_HALTED,
677 SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
678/* 1*/ FLAG_ENTRY("DisallowedPacket",
679 SEC_PACKET_DROPPED | SEC_SC_HALTED,
680 SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
681/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
682 SEC_WRITE_DROPPED | SEC_SC_HALTED,
683 SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
684/* 3*/ FLAG_ENTRY("WriteOverflow",
685 SEC_WRITE_DROPPED | SEC_SC_HALTED,
686 SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
687/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
688 SEC_WRITE_DROPPED | SEC_SC_HALTED,
689 SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
690/* 5-63 reserved*/
691};
692
693/*
694 * RXE Receive Error flags
695 */
696#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
697static struct flag_table rxe_err_status_flags[] = {
698/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
699/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
700/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
701/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
702/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
703/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
704/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
705/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
706/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
707/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
708/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
709/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
710/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
711/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
712/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
713/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
714/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
715 RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
716/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
717/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
718/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
719 RXES(RBUF_BLOCK_LIST_READ_UNC)),
720/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
721 RXES(RBUF_BLOCK_LIST_READ_COR)),
722/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
723 RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
724/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
725 RXES(RBUF_CSR_QENT_CNT_PARITY)),
726/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
727 RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
728/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
729 RXES(RBUF_CSR_QVLD_BIT_PARITY)),
730/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
731/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
732/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
733 RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
734/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
735/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
736/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
737/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
738/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
739/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
740/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
741/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
742 RXES(RBUF_FL_INITDONE_PARITY)),
743/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
744 RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
745/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
746/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
747/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
748/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
749 RXES(LOOKUP_DES_PART1_UNC_COR)),
750/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
751 RXES(LOOKUP_DES_PART2_PARITY)),
752/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
753/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
754/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
755/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
756/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
757/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
758/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
759/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
760/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
761/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
762/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
763/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
764/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
765/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
766/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
767/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
768/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
769/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
770/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
771/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
772/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
773/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
774};
775
776/* RXE errors that will trigger an SPC freeze */
777#define ALL_RXE_FREEZE_ERR \
778 (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
779 | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
780 | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
781 | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
782 | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
783 | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
784 | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
785 | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
786 | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
787 | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
788 | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
789 | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
790 | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
791 | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
792 | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
793 | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
794 | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
795 | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
796 | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
797 | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
798 | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
799 | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
800 | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
801 | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
802 | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
803 | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
804 | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
805 | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
806 | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
807 | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
808 | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
809 | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
810 | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
811 | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
812 | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
813 | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
814 | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
815 | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
816 | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
817 | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
818 | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
819 | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
820 | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
821 | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
822
823#define RXE_FREEZE_ABORT_MASK \
824 (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
825 RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
826 RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
827
828/*
829 * DCC Error Flags
830 */
831#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
832static struct flag_table dcc_err_flags[] = {
833 FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
834 FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
835 FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
836 FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
837 FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
838 FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
839 FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
840 FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
841 FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
842 FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
843 FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
844 FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
845 FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
846 FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
847 FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
848 FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
849 FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
850 FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
851 FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
852 FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
853 FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
854 FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
855 FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
856 FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
857 FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
858 FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
859 FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
860 FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
861 FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
862 FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
863 FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
864 FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
865 FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
866 FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
867 FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
868 FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
869 FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
870 FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
871 FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
872 FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
873 FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
874 FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
875 FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
876 FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
877 FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
878 FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
879};
880
881/*
882 * LCB error flags
883 */
884#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
885static struct flag_table lcb_err_flags[] = {
886/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
887/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
888/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
889/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
890 LCBE(ALL_LNS_FAILED_REINIT_TEST)),
891/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
892/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
893/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
894/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
895/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
896/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
897/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
898/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
899/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
900/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
901 LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
902/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
903/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
904/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
905/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
906/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
907/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
908 LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
909/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
910/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
911/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
912/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
913/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
914/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
915/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
916 LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
917/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
918/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
919 LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
920/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
921 LCBE(REDUNDANT_FLIT_PARITY_ERR))
922};
923
924/*
925 * DC8051 Error Flags
926 */
927#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
928static struct flag_table dc8051_err_flags[] = {
929 FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
930 FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
931 FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
932 FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
933 FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
934 FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
935 FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
936 FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
937 FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
938 D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
939 FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
940};
941
942/*
943 * DC8051 Information Error flags
944 *
945 * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
946 */
947static struct flag_table dc8051_info_err_flags[] = {
948 FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED),
949 FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME),
950 FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET),
951 FLAG_ENTRY0("Serdes internal loopback failure",
952 FAILED_SERDES_INTERNAL_LOOPBACK),
953 FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT),
954 FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING),
955 FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE),
956 FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM),
957 FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ),
958 FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
959 FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
960 FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT)
961};
962
963/*
964 * DC8051 Information Host Information flags
965 *
966 * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
967 */
968static struct flag_table dc8051_info_host_msg_flags[] = {
969 FLAG_ENTRY0("Host request done", 0x0001),
970 FLAG_ENTRY0("BC SMA message", 0x0002),
971 FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
972 FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
973 FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
974 FLAG_ENTRY0("External device config request", 0x0020),
975 FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
976 FLAG_ENTRY0("LinkUp achieved", 0x0080),
977 FLAG_ENTRY0("Link going down", 0x0100),
978};
979
980
981static u32 encoded_size(u32 size);
982static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
983static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
984static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
985 u8 *continuous);
986static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
987 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
988static void read_vc_remote_link_width(struct hfi1_devdata *dd,
989 u8 *remote_tx_rate, u16 *link_widths);
990static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
991 u8 *flag_bits, u16 *link_widths);
992static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
993 u8 *device_rev);
994static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
995static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
996static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
997 u8 *tx_polarity_inversion,
998 u8 *rx_polarity_inversion, u8 *max_rate);
999static void handle_sdma_eng_err(struct hfi1_devdata *dd,
1000 unsigned int context, u64 err_status);
1001static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
1002static void handle_dcc_err(struct hfi1_devdata *dd,
1003 unsigned int context, u64 err_status);
1004static void handle_lcb_err(struct hfi1_devdata *dd,
1005 unsigned int context, u64 err_status);
1006static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
1007static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1008static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1009static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1010static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1011static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1012static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1013static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
1014static void set_partition_keys(struct hfi1_pportdata *);
1015static const char *link_state_name(u32 state);
1016static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
1017 u32 state);
1018static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
1019 u64 *out_data);
1020static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
1021static int thermal_init(struct hfi1_devdata *dd);
1022
1023static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
1024 int msecs);
1025static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
1026static void handle_temp_err(struct hfi1_devdata *);
1027static void dc_shutdown(struct hfi1_devdata *);
1028static void dc_start(struct hfi1_devdata *);
1029
1030/*
1031 * Error interrupt table entry. This is used as input to the interrupt
1032 * "clear down" routine used for all second tier error interrupt register.
1033 * Second tier interrupt registers have a single bit representing them
1034 * in the top-level CceIntStatus.
1035 */
1036struct err_reg_info {
1037 u32 status; /* status CSR offset */
1038 u32 clear; /* clear CSR offset */
1039 u32 mask; /* mask CSR offset */
1040 void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
1041 const char *desc;
1042};
1043
1044#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
1045#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
1046#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
1047
1048/*
1049 * Helpers for building HFI and DC error interrupt table entries. Different
1050 * helpers are needed because of inconsistent register names.
1051 */
1052#define EE(reg, handler, desc) \
1053 { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
1054 handler, desc }
1055#define DC_EE1(reg, handler, desc) \
1056 { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
1057#define DC_EE2(reg, handler, desc) \
1058 { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
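
/*
 * Illustrative sketch only (not part of the driver): how an
 * err_reg_info entry built with EE()/DC_EE1()/DC_EE2() might be
 * consumed by a generic second-tier "clear down" pass.  The helper
 * name example_clear_down() and the read-then-clear ordering are
 * assumptions for illustration; read_csr()/write_csr() are the CSR
 * accessors defined later in this file.
 */
#if 0
static void example_clear_down(struct hfi1_devdata *dd, u32 source,
			       const struct err_reg_info *eri)
{
	/* read the second-tier status register named by the table entry */
	u64 reg = read_csr(dd, eri->status);

	if (!reg)
		return;			/* nothing asserted */

	/* let the entry's handler decode and report the asserted bits */
	if (eri->handler)
		eri->handler(dd, source, reg);

	/* assumed write-1-to-clear semantics for the clear CSR */
	write_csr(dd, eri->clear, reg);
}
#endif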
1059
1060/*
1061 * Table of the "misc" grouping of error interrupts. Each entry refers to
1062 * another register containing more information.
1063 */
1064static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
1065/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"),
1066/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"),
1067/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"),
1068/* 3*/ { 0, 0, 0, NULL }, /* reserved */
1069/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"),
1070/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"),
1071/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
1072/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr")
1073 /* the rest are reserved */
1074};
1075
1076/*
1077 * Index into the Various section of the interrupt sources
1078 * corresponding to the Critical Temperature interrupt.
1079 */
1080#define TCRIT_INT_SOURCE 4
1081
1082/*
1083 * SDMA error interrupt entry - refers to another register containing more
1084 * information.
1085 */
1086static const struct err_reg_info sdma_eng_err =
1087 EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
1088
1089static const struct err_reg_info various_err[NUM_VARIOUS] = {
1090/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
1091/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
1092/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"),
1093/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"),
1094/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
1095 /* rest are reserved */
1096};
1097
1098/*
1099 * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
1100 * register cannot be derived from the MTU value because 10K is not
1101 * a power of 2. Therefore, we need a constant. Everything else can
1102 * be calculated.
1103 */
1104#define DCC_CFG_PORT_MTU_CAP_10240 7
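
/*
 * Illustrative sketch only (not part of the driver): the constant above
 * is needed because 10240 is not a power of two, so no log2-style
 * derivation can produce its encoding.  The power-of-two branch below
 * is a hypothetical stand-in for "everything else can be calculated",
 * not the actual DCC encoding formula.
 */
#if 0
static u8 example_mtu_cap_encoding(u32 mtu)
{
	if (mtu == 10240)		/* not a power of 2 */
		return DCC_CFG_PORT_MTU_CAP_10240;
	return (u8)ilog2(mtu);		/* assumed derivable case */
}
#endif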
1105
1106/*
1107 * Table of the DC grouping of error interrupts. Each entry refers to
1108 * another register containing more information.
1109 */
1110static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
1111/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"),
1112/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"),
1113/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"),
1114/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
1115 /* the rest are reserved */
1116};
1117
1118struct cntr_entry {
1119 /*
1120 * counter name
1121 */
1122 char *name;
1123
1124 /*
1125	 * CSR to read for this counter (if applicable)
1126 */
1127 u64 csr;
1128
1129 /*
1130 * offset into dd or ppd to store the counter's value
1131 */
1132 int offset;
1133
1134 /*
1135 * flags
1136 */
1137 u8 flags;
1138
1139 /*
1140 * accessor for stat element, context either dd or ppd
1141 */
1142 u64 (*rw_cntr)(const struct cntr_entry *,
1143 void *context,
1144 int vl,
1145 int mode,
1146 u64 data);
1147};
1148
1149#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
1150#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
1151
1152#define CNTR_ELEM(name, csr, offset, flags, accessor) \
1153{ \
1154 name, \
1155 csr, \
1156 offset, \
1157 flags, \
1158 accessor \
1159}
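
/*
 * Illustrative sketch only (not part of the driver): reading a counter
 * through its table entry.  The wrapper name example_read_dev_cntr() is
 * an assumption; the rw_cntr() calling convention, CNTR_MODE_R and
 * CNTR_INVALID_VL follow the struct above and the accessors below.
 */
#if 0
static u64 example_read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
{
	const struct cntr_entry *entry = &dev_cntrs[index];

	/* context is dd for device counters, ppd for port counters */
	return entry->rw_cntr(entry, dd, vl, CNTR_MODE_R, 0);
}
#endif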
1160
1161/* 32bit RXE */
1162#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
1163CNTR_ELEM(#name, \
1164 (counter * 8 + RCV_COUNTER_ARRAY32), \
1165 0, flags | CNTR_32BIT, \
1166 port_access_u32_csr)
1167
1168#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
1169CNTR_ELEM(#name, \
1170 (counter * 8 + RCV_COUNTER_ARRAY32), \
1171 0, flags | CNTR_32BIT, \
1172 dev_access_u32_csr)
1173
1174/* 64bit RXE */
1175#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
1176CNTR_ELEM(#name, \
1177 (counter * 8 + RCV_COUNTER_ARRAY64), \
1178 0, flags, \
1179 port_access_u64_csr)
1180
1181#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
1182CNTR_ELEM(#name, \
1183 (counter * 8 + RCV_COUNTER_ARRAY64), \
1184 0, flags, \
1185 dev_access_u64_csr)
1186
1187#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
1188#define OVR_ELM(ctx) \
1189CNTR_ELEM("RcvHdrOvr" #ctx, \
1190 (RCV_HDR_OVFL_CNT + ctx*0x100), \
1191 0, CNTR_NORMAL, port_access_u64_csr)
1192
1193/* 32bit TXE */
1194#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
1195CNTR_ELEM(#name, \
1196 (counter * 8 + SEND_COUNTER_ARRAY32), \
1197 0, flags | CNTR_32BIT, \
1198 port_access_u32_csr)
1199
1200/* 64bit TXE */
1201#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
1202CNTR_ELEM(#name, \
1203 (counter * 8 + SEND_COUNTER_ARRAY64), \
1204 0, flags, \
1205 port_access_u64_csr)
1206
1207#define TX64_DEV_CNTR_ELEM(name, counter, flags) \
1208CNTR_ELEM(#name, \
1209 counter * 8 + SEND_COUNTER_ARRAY64, \
1210 0, \
1211 flags, \
1212 dev_access_u64_csr)
1213
1214/* CCE */
1215#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
1216CNTR_ELEM(#name, \
1217 (counter * 8 + CCE_COUNTER_ARRAY32), \
1218 0, flags | CNTR_32BIT, \
1219 dev_access_u32_csr)
1220
1221#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
1222CNTR_ELEM(#name, \
1223 (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
1224 0, flags | CNTR_32BIT, \
1225 dev_access_u32_csr)
1226
1227/* DC */
1228#define DC_PERF_CNTR(name, counter, flags) \
1229CNTR_ELEM(#name, \
1230 counter, \
1231 0, \
1232 flags, \
1233 dev_access_u64_csr)
1234
1235#define DC_PERF_CNTR_LCB(name, counter, flags) \
1236CNTR_ELEM(#name, \
1237 counter, \
1238 0, \
1239 flags, \
1240 dc_access_lcb_cntr)
1241
1242/* ibp counters */
1243#define SW_IBP_CNTR(name, cntr) \
1244CNTR_ELEM(#name, \
1245 0, \
1246 0, \
1247 CNTR_SYNTH, \
1248 access_ibp_##cntr)
1249
1250u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
1251{
1252 u64 val;
1253
1254 if (dd->flags & HFI1_PRESENT) {
1255 val = readq((void __iomem *)dd->kregbase + offset);
1256 return val;
1257 }
1258 return -1;
1259}
1260
1261void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
1262{
1263 if (dd->flags & HFI1_PRESENT)
1264 writeq(value, (void __iomem *)dd->kregbase + offset);
1265}
1266
1267void __iomem *get_csr_addr(
1268 struct hfi1_devdata *dd,
1269 u32 offset)
1270{
1271 return (void __iomem *)dd->kregbase + offset;
1272}
1273
1274static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
1275 int mode, u64 value)
1276{
1277 u64 ret;
1278
1279
1280 if (mode == CNTR_MODE_R) {
1281 ret = read_csr(dd, csr);
1282 } else if (mode == CNTR_MODE_W) {
1283 write_csr(dd, csr, value);
1284 ret = value;
1285 } else {
1286 dd_dev_err(dd, "Invalid cntr register access mode");
1287 return 0;
1288 }
1289
1290 hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
1291 return ret;
1292}
1293
1294/* Dev Access */
1295static u64 dev_access_u32_csr(const struct cntr_entry *entry,
1296 void *context, int vl, int mode, u64 data)
1297{
1298 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1299
1300 if (vl != CNTR_INVALID_VL)
1301 return 0;
1302 return read_write_csr(dd, entry->csr, mode, data);
1303}
1304
1305static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
1306 int vl, int mode, u64 data)
1307{
1308 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1309
1310 u64 val = 0;
1311 u64 csr = entry->csr;
1312
1313 if (entry->flags & CNTR_VL) {
1314 if (vl == CNTR_INVALID_VL)
1315 return 0;
1316 csr += 8 * vl;
1317 } else {
1318 if (vl != CNTR_INVALID_VL)
1319 return 0;
1320 }
1321
1322 val = read_write_csr(dd, csr, mode, data);
1323 return val;
1324}
1325
1326static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
1327 int vl, int mode, u64 data)
1328{
1329 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1330 u32 csr = entry->csr;
1331 int ret = 0;
1332
1333 if (vl != CNTR_INVALID_VL)
1334 return 0;
1335 if (mode == CNTR_MODE_R)
1336 ret = read_lcb_csr(dd, csr, &data);
1337 else if (mode == CNTR_MODE_W)
1338 ret = write_lcb_csr(dd, csr, data);
1339
1340 if (ret) {
1341 dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
1342 return 0;
1343 }
1344
1345 hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
1346 return data;
1347}
1348
1349/* Port Access */
1350static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
1351 int vl, int mode, u64 data)
1352{
1353 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1354
1355 if (vl != CNTR_INVALID_VL)
1356 return 0;
1357 return read_write_csr(ppd->dd, entry->csr, mode, data);
1358}
1359
1360static u64 port_access_u64_csr(const struct cntr_entry *entry,
1361 void *context, int vl, int mode, u64 data)
1362{
1363 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1364 u64 val;
1365 u64 csr = entry->csr;
1366
1367 if (entry->flags & CNTR_VL) {
1368 if (vl == CNTR_INVALID_VL)
1369 return 0;
1370 csr += 8 * vl;
1371 } else {
1372 if (vl != CNTR_INVALID_VL)
1373 return 0;
1374 }
1375 val = read_write_csr(ppd->dd, csr, mode, data);
1376 return val;
1377}
1378
1379/* Software defined */
1380static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
1381 u64 data)
1382{
1383 u64 ret;
1384
1385 if (mode == CNTR_MODE_R) {
1386 ret = *cntr;
1387 } else if (mode == CNTR_MODE_W) {
1388 *cntr = data;
1389 ret = data;
1390 } else {
1391 dd_dev_err(dd, "Invalid cntr sw access mode");
1392 return 0;
1393 }
1394
1395 hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
1396
1397 return ret;
1398}
1399
1400static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
1401 int vl, int mode, u64 data)
1402{
1403 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1404
1405 if (vl != CNTR_INVALID_VL)
1406 return 0;
1407 return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
1408}
1409
1410static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
1411 int vl, int mode, u64 data)
1412{
1413 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1414
1415 if (vl != CNTR_INVALID_VL)
1416 return 0;
1417 return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
1418}
1419
1420static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
1421 void *context, int vl, int mode, u64 data)
1422{
1423 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1424
1425 if (vl != CNTR_INVALID_VL)
1426 return 0;
1427
1428 return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
1429}
1430
1431static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
1432 void *context, int vl, int mode, u64 data)
1433{
1434 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1435
1436 if (vl != CNTR_INVALID_VL)
1437 return 0;
1438
1439 return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
1440 mode, data);
1441}
1442
1443static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
1444 void *context, int vl, int mode, u64 data)
1445{
1446 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
1447
1448 if (vl != CNTR_INVALID_VL)
1449 return 0;
1450
1451 return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
1452 mode, data);
1453}
1454
1455u64 get_all_cpu_total(u64 __percpu *cntr)
1456{
1457 int cpu;
1458 u64 counter = 0;
1459
1460 for_each_possible_cpu(cpu)
1461 counter += *per_cpu_ptr(cntr, cpu);
1462 return counter;
1463}
1464
1465static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
1466 u64 __percpu *cntr,
1467 int vl, int mode, u64 data)
1468{
1469
1470 u64 ret = 0;
1471
1472 if (vl != CNTR_INVALID_VL)
1473 return 0;
1474
1475 if (mode == CNTR_MODE_R) {
1476 ret = get_all_cpu_total(cntr) - *z_val;
1477 } else if (mode == CNTR_MODE_W) {
1478 /* A write can only zero the counter */
1479 if (data == 0)
1480 *z_val = get_all_cpu_total(cntr);
1481 else
1482 dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
1483 } else {
1484 dd_dev_err(dd, "Invalid cntr sw cpu access mode");
1485 return 0;
1486 }
1487
1488 return ret;
1489}
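
/*
 * Illustrative sketch only (not part of the driver): the per-CPU
 * counters above are "zeroed" by snapshotting the running total into
 * the z_ baseline, and reads report the delta since that snapshot.
 * example_percpu_counter_usage() is a hypothetical caller; the R/W
 * semantics follow read_write_cpu() above.
 */
#if 0
static void example_percpu_counter_usage(struct hfi1_devdata *dd)
{
	u64 since;

	/* "zero" the interrupt counter: only a write of 0 is accepted */
	read_write_cpu(dd, &dd->z_int_counter, dd->int_counter,
		       CNTR_INVALID_VL, CNTR_MODE_W, 0);

	/* read how many interrupts accumulated since the zeroing above */
	since = read_write_cpu(dd, &dd->z_int_counter, dd->int_counter,
			       CNTR_INVALID_VL, CNTR_MODE_R, 0);
	(void)since;
}
#endif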
1490
1491static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
1492 void *context, int vl, int mode, u64 data)
1493{
1494 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1495
1496 return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
1497 mode, data);
1498}
1499
1500static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
1501 void *context, int vl, int mode, u64 data)
1502{
1503 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1504
1505 return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
1506 mode, data);
1507}
1508
1509static u64 access_sw_pio_wait(const struct cntr_entry *entry,
1510 void *context, int vl, int mode, u64 data)
1511{
1512 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1513
1514 return dd->verbs_dev.n_piowait;
1515}
1516
1517static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
1518 void *context, int vl, int mode, u64 data)
1519{
1520 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1521
1522 return dd->verbs_dev.n_txwait;
1523}
1524
1525static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
1526 void *context, int vl, int mode, u64 data)
1527{
1528 struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
1529
1530 return dd->verbs_dev.n_kmem_wait;
1531}
1532
1533#define def_access_sw_cpu(cntr) \
1534static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \
1535 void *context, int vl, int mode, u64 data) \
1536{ \
1537 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
1538 return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr, \
1539 ppd->ibport_data.cntr, vl, \
1540 mode, data); \
1541}
1542
1543def_access_sw_cpu(rc_acks);
1544def_access_sw_cpu(rc_qacks);
1545def_access_sw_cpu(rc_delayed_comp);
1546
1547#define def_access_ibp_counter(cntr) \
1548static u64 access_ibp_##cntr(const struct cntr_entry *entry, \
1549 void *context, int vl, int mode, u64 data) \
1550{ \
1551 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
1552 \
1553 if (vl != CNTR_INVALID_VL) \
1554 return 0; \
1555 \
1556 return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr, \
1557 mode, data); \
1558}
1559
1560def_access_ibp_counter(loop_pkts);
1561def_access_ibp_counter(rc_resends);
1562def_access_ibp_counter(rnr_naks);
1563def_access_ibp_counter(other_naks);
1564def_access_ibp_counter(rc_timeouts);
1565def_access_ibp_counter(pkt_drops);
1566def_access_ibp_counter(dmawait);
1567def_access_ibp_counter(rc_seqnak);
1568def_access_ibp_counter(rc_dupreq);
1569def_access_ibp_counter(rdma_seq);
1570def_access_ibp_counter(unaligned);
1571def_access_ibp_counter(seq_naks);
1572
1573static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
1574[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
1575[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullErr, RCV_TID_FULL_ERR_CNT,
1576 CNTR_NORMAL),
1577[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
1578 CNTR_NORMAL),
1579[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
1580 RCV_TID_FLOW_GEN_MISMATCH_CNT,
1581 CNTR_NORMAL),
1582[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL,
1583 CNTR_NORMAL),
1584[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
1585 CNTR_NORMAL),
1586[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
1587 RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
1588[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
1589 CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
1590[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
1591 CNTR_NORMAL),
1592[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
1593 CNTR_NORMAL),
1594[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
1595 CNTR_NORMAL),
1596[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
1597 CNTR_NORMAL),
1598[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
1599 CNTR_NORMAL),
1600[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
1601 CNTR_NORMAL),
1602[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
1603 CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
1604[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
1605 CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
1606[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
1607 CNTR_SYNTH),
1608[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
1609[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
1610 CNTR_SYNTH),
1611[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
1612 CNTR_SYNTH),
1613[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
1614 CNTR_SYNTH),
1615[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
1616 DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
1617[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
1618 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
1619 CNTR_SYNTH),
1620[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
1621 DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
1622[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
1623 CNTR_SYNTH),
1624[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
1625 CNTR_SYNTH),
1626[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
1627 CNTR_SYNTH),
1628[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
1629 CNTR_SYNTH),
1630[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
1631 CNTR_SYNTH),
1632[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
1633 CNTR_SYNTH),
1634[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
1635 CNTR_SYNTH),
1636[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
1637 CNTR_SYNTH | CNTR_VL),
1638[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
1639 CNTR_SYNTH | CNTR_VL),
1640[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
1641[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
1642 CNTR_SYNTH | CNTR_VL),
1643[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
1644[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
1645 CNTR_SYNTH | CNTR_VL),
1646[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
1647 CNTR_SYNTH),
1648[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
1649 CNTR_SYNTH | CNTR_VL),
1650[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
1651 CNTR_SYNTH),
1652[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
1653 CNTR_SYNTH | CNTR_VL),
1654[C_DC_TOTAL_CRC] =
1655 DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
1656 CNTR_SYNTH),
1657[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
1658 CNTR_SYNTH),
1659[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
1660 CNTR_SYNTH),
1661[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
1662 CNTR_SYNTH),
1663[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
1664 CNTR_SYNTH),
1665[C_DC_CRC_MULT_LN] =
1666 DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
1667 CNTR_SYNTH),
1668[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
1669 CNTR_SYNTH),
1670[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
1671 CNTR_SYNTH),
1672[C_DC_SEQ_CRC_CNT] =
1673 DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
1674 CNTR_SYNTH),
1675[C_DC_ESC0_ONLY_CNT] =
1676 DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
1677 CNTR_SYNTH),
1678[C_DC_ESC0_PLUS1_CNT] =
1679 DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
1680 CNTR_SYNTH),
1681[C_DC_ESC0_PLUS2_CNT] =
1682 DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
1683 CNTR_SYNTH),
1684[C_DC_REINIT_FROM_PEER_CNT] =
1685 DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
1686 CNTR_SYNTH),
1687[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
1688 CNTR_SYNTH),
1689[C_DC_MISC_FLG_CNT] =
1690 DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
1691 CNTR_SYNTH),
1692[C_DC_PRF_GOOD_LTP_CNT] =
1693 DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
1694[C_DC_PRF_ACCEPTED_LTP_CNT] =
1695 DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
1696 CNTR_SYNTH),
1697[C_DC_PRF_RX_FLIT_CNT] =
1698 DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
1699[C_DC_PRF_TX_FLIT_CNT] =
1700 DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
1701[C_DC_PRF_CLK_CNTR] =
1702 DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
1703[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
1704 DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
1705[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
1706 DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
1707 CNTR_SYNTH),
1708[C_DC_PG_STS_TX_SBE_CNT] =
1709 DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
1710[C_DC_PG_STS_TX_MBE_CNT] =
1711 DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
1712 CNTR_SYNTH),
1713[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
1714 access_sw_cpu_intr),
1715[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
1716 access_sw_cpu_rcv_limit),
1717[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
1718 access_sw_vtx_wait),
1719[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
1720 access_sw_pio_wait),
1721[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
1722 access_sw_kmem_wait),
1723};
1724
1725static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
1726[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
1727 CNTR_NORMAL),
1728[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
1729 CNTR_NORMAL),
1730[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
1731 CNTR_NORMAL),
1732[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
1733 CNTR_NORMAL),
1734[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
1735 CNTR_NORMAL),
1736[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
1737 CNTR_NORMAL),
1738[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
1739 CNTR_NORMAL),
1740[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
1741[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
1742[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
1743[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
1744 CNTR_SYNTH | CNTR_VL),
1745[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
1746 CNTR_SYNTH | CNTR_VL),
1747[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
1748 CNTR_SYNTH | CNTR_VL),
1749[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
1750[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
1751[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
1752 access_sw_link_dn_cnt),
1753[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
1754 access_sw_link_up_cnt),
1755[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
1756 access_sw_xmit_discards),
1757[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
1758 CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
1759 access_sw_xmit_discards),
1760[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
1761 access_xmit_constraint_errs),
1762[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
1763 access_rcv_constraint_errs),
1764[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
1765[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
1766[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
1767[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
1768[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
1769[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
1770[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
1771[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
1772[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupReq, rc_dupreq),
1773[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
1774[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
1775[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
1776[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
1777 access_sw_cpu_rc_acks),
1778[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
1779 access_sw_cpu_rc_qacks),
1780[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
1781 access_sw_cpu_rc_delayed_comp),
1782[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
1783[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
1784[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
1785[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
1786[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
1787[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
1788[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
1789[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
1790[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
1791[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
1792[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
1793[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
1794[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
1795[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
1796[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
1797[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
1798[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
1799[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
1800[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
1801[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
1802[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
1803[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
1804[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
1805[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
1806[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
1807[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
1808[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
1809[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
1810[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
1811[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
1812[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
1813[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
1814[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
1815[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
1816[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
1817[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
1818[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
1819[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
1820[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
1821[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
1822[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
1823[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
1824[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
1825[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
1826[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
1827[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
1828[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
1829[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
1830[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
1831[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
1832[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
1833[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
1834[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
1835[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
1836[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
1837[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
1838[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
1839[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
1840[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
1841[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
1842[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
1843[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
1844[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
1845[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
1846[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
1847[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
1848[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
1849[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
1850[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
1851[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
1852[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
1853[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
1854[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
1855[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
1856[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
1857[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
1858[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
1859[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
1860[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
1861[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
1862};
1863
1864/* ======================================================================== */
1865
1866/* return true if this is chip revision A0 */
1867int is_a0(struct hfi1_devdata *dd)
1868{
1869 return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
1870 & CCE_REVISION_CHIP_REV_MINOR_MASK) == 0;
1871}
1872
1873/* return true if this is chip revision Ax */
1874int is_ax(struct hfi1_devdata *dd)
1875{
1876 u8 chip_rev_minor =
1877 dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
1878 & CCE_REVISION_CHIP_REV_MINOR_MASK;
1879 return (chip_rev_minor & 0xf0) == 0;
1880}
1881
1882/* return true if this is chip revision Bx */
1883int is_bx(struct hfi1_devdata *dd)
1884{
1885 u8 chip_rev_minor =
1886 dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
1887 & CCE_REVISION_CHIP_REV_MINOR_MASK;
1888 return !!(chip_rev_minor & 0x10);
1889}
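
/*
 * Worked example of the decoding above (values are illustrative): a
 * CHIP_REV_MINOR field of 0x00 is A0 (is_a0() and is_ax() both true),
 * 0x01 is a later A-step (only is_ax() true), and anything with bit 4
 * set, e.g. 0x10, is a B-step (is_bx() true).
 */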
1890
1891/*
1892 * Append string s to buffer buf. Arguments curp and len are the current
1893 * position and remaining length, respectively.
1894 *
1895 * return 0 on success, 1 on out of room
1896 */
1897static int append_str(char *buf, char **curp, int *lenp, const char *s)
1898{
1899 char *p = *curp;
1900 int len = *lenp;
1901 int result = 0; /* success */
1902 char c;
1903
1904 /* add a comma, if this is not the first string in the buffer */
1905 if (p != buf) {
1906 if (len == 0) {
1907 result = 1; /* out of room */
1908 goto done;
1909 }
1910 *p++ = ',';
1911 len--;
1912 }
1913
1914 /* copy the string */
1915 while ((c = *s++) != 0) {
1916 if (len == 0) {
1917 result = 1; /* out of room */
1918 goto done;
1919 }
1920 *p++ = c;
1921 len--;
1922 }
1923
1924done:
1925 /* write return values */
1926 *curp = p;
1927 *lenp = len;
1928
1929 return result;
1930}
1931
1932/*
1933 * Using the given flag table, print a comma separated string into
1934 * the buffer. End in '*' if the buffer is too short.
1935 */
1936static char *flag_string(char *buf, int buf_len, u64 flags,
1937 struct flag_table *table, int table_size)
1938{
1939 char extra[32];
1940 char *p = buf;
1941 int len = buf_len;
1942 int no_room = 0;
1943 int i;
1944
1945 /* make sure there are at least 2 bytes so we can form "*" */
1946 if (len < 2)
1947 return "";
1948
1949 len--; /* leave room for a nul */
1950 for (i = 0; i < table_size; i++) {
1951 if (flags & table[i].flag) {
1952 no_room = append_str(buf, &p, &len, table[i].str);
1953 if (no_room)
1954 break;
1955 flags &= ~table[i].flag;
1956 }
1957 }
1958
1959 /* any undocumented bits left? */
1960 if (!no_room && flags) {
1961 snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
1962 no_room = append_str(buf, &p, &len, extra);
1963 }
1964
1965 /* add * if ran out of room */
1966 if (no_room) {
1967 /* may need to back up to add space for a '*' */
1968 if (len == 0)
1969 --p;
1970 *p++ = '*';
1971 }
1972
1973 /* add final nul - space already allocated above */
1974 *p = 0;
1975 return buf;
1976}
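
/*
 * Worked example (hypothetical flag table, for illustration only): with a
 * table of { 0x1 -> "FooErr", 0x2 -> "BarErr" } and flags == 0x7, the
 * matched names are appended first and the leftover, undocumented bit is
 * reported last:
 *
 *	"FooErr,BarErr,bits 0x4"
 *
 * If the buffer cannot hold the full string, it is truncated and ends
 * in '*'.
 */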
1977
1978/* first 8 CCE error interrupt source names */
1979static const char * const cce_misc_names[] = {
1980 "CceErrInt", /* 0 */
1981 "RxeErrInt", /* 1 */
1982 "MiscErrInt", /* 2 */
1983 "Reserved3", /* 3 */
1984 "PioErrInt", /* 4 */
1985 "SDmaErrInt", /* 5 */
1986 "EgressErrInt", /* 6 */
1987 "TxeErrInt" /* 7 */
1988};
1989
1990/*
1991 * Return the miscellaneous error interrupt name.
1992 */
1993static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
1994{
1995 if (source < ARRAY_SIZE(cce_misc_names))
1996 strncpy(buf, cce_misc_names[source], bsize);
1997 else
1998 snprintf(buf,
1999 bsize,
2000 "Reserved%u",
2001 source + IS_GENERAL_ERR_START);
2002
2003 return buf;
2004}
2005
2006/*
2007 * Return the SDMA engine error interrupt name.
2008 */
2009static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
2010{
2011 snprintf(buf, bsize, "SDmaEngErrInt%u", source);
2012 return buf;
2013}
2014
2015/*
2016 * Return the send context error interrupt name.
2017 */
2018static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
2019{
2020 snprintf(buf, bsize, "SendCtxtErrInt%u", source);
2021 return buf;
2022}
2023
2024static const char * const various_names[] = {
2025 "PbcInt",
2026 "GpioAssertInt",
2027 "Qsfp1Int",
2028 "Qsfp2Int",
2029 "TCritInt"
2030};
2031
2032/*
2033 * Return the various interrupt name.
2034 */
2035static char *is_various_name(char *buf, size_t bsize, unsigned int source)
2036{
2037 if (source < ARRAY_SIZE(various_names))
2038 strncpy(buf, various_names[source], bsize);
2039 else
2040 snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
2041 return buf;
2042}
2043
2044/*
2045 * Return the DC interrupt name.
2046 */
2047static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
2048{
2049 static const char * const dc_int_names[] = {
2050 "common",
2051 "lcb",
2052 "8051",
2053 "lbm" /* local block merge */
2054 };
2055
2056 if (source < ARRAY_SIZE(dc_int_names))
2057 snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
2058 else
2059 snprintf(buf, bsize, "DCInt%u", source);
2060 return buf;
2061}
2062
2063static const char * const sdma_int_names[] = {
2064 "SDmaInt",
2065 "SdmaIdleInt",
2066 "SdmaProgressInt",
2067};
2068
2069/*
2070 * Return the SDMA engine interrupt name.
2071 */
2072static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
2073{
2074 /* what interrupt */
2075 unsigned int what = source / TXE_NUM_SDMA_ENGINES;
2076 /* which engine */
2077 unsigned int which = source % TXE_NUM_SDMA_ENGINES;
2078
2079 if (likely(what < 3))
2080 snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
2081 else
2082 snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
2083 return buf;
2084}
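
/*
 * For example (assuming TXE_NUM_SDMA_ENGINES > 2), a source value of
 * TXE_NUM_SDMA_ENGINES + 2 decodes to what = 1 and which = 2, producing
 * the name "SdmaIdleInt2".
 */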
2085
2086/*
2087 * Return the receive available interrupt name.
2088 */
2089static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
2090{
2091 snprintf(buf, bsize, "RcvAvailInt%u", source);
2092 return buf;
2093}
2094
2095/*
2096 * Return the receive urgent interrupt name.
2097 */
2098static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
2099{
2100 snprintf(buf, bsize, "RcvUrgentInt%u", source);
2101 return buf;
2102}
2103
2104/*
2105 * Return the send credit interrupt name.
2106 */
2107static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
2108{
2109 snprintf(buf, bsize, "SendCreditInt%u", source);
2110 return buf;
2111}
2112
2113/*
2114 * Return the reserved interrupt name.
2115 */
2116static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
2117{
2118 snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
2119 return buf;
2120}
2121
2122static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
2123{
2124 return flag_string(buf, buf_len, flags,
2125 cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
2126}
2127
2128static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
2129{
2130 return flag_string(buf, buf_len, flags,
2131 rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
2132}
2133
2134static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
2135{
2136 return flag_string(buf, buf_len, flags, misc_err_status_flags,
2137 ARRAY_SIZE(misc_err_status_flags));
2138}
2139
2140static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
2141{
2142 return flag_string(buf, buf_len, flags,
2143 pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
2144}
2145
2146static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
2147{
2148 return flag_string(buf, buf_len, flags,
2149 sdma_err_status_flags,
2150 ARRAY_SIZE(sdma_err_status_flags));
2151}
2152
2153static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
2154{
2155 return flag_string(buf, buf_len, flags,
2156 egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
2157}
2158
2159static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
2160{
2161 return flag_string(buf, buf_len, flags,
2162 egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
2163}
2164
2165static char *send_err_status_string(char *buf, int buf_len, u64 flags)
2166{
2167 return flag_string(buf, buf_len, flags,
2168 send_err_status_flags,
2169 ARRAY_SIZE(send_err_status_flags));
2170}
2171
2172static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2173{
2174 char buf[96];
2175
2176 /*
2177 * For most of these errors, there is nothing that can be done except
2178 * report or record it.
2179 */
2180 dd_dev_info(dd, "CCE Error: %s\n",
2181 cce_err_status_string(buf, sizeof(buf), reg));
2182
2183 if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK)
2184 && is_a0(dd)
2185 && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
2186 /* this error requires a manual drop into SPC freeze mode */
2187 /* then a fix up */
2188 start_freeze_handling(dd->pport, FREEZE_SELF);
2189 }
2190}
2191
2192/*
2193 * Check counters for receive errors that do not have an interrupt
2194 * associated with them.
2195 */
2196#define RCVERR_CHECK_TIME 10
2197static void update_rcverr_timer(unsigned long opaque)
2198{
2199 struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
2200 struct hfi1_pportdata *ppd = dd->pport;
2201 u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
2202
2203 if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
2204 ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
2205 dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
2206 set_link_down_reason(ppd,
2207 OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
2208 OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
2209 queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
2210 }
2211 dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
2212
2213 mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
2214}
2215
2216static int init_rcverr(struct hfi1_devdata *dd)
2217{
2218 init_timer(&dd->rcverr_timer);
2219 dd->rcverr_timer.function = update_rcverr_timer;
2220 dd->rcverr_timer.data = (unsigned long) dd;
2221 /* Assume the hardware counter has been reset */
2222 dd->rcv_ovfl_cnt = 0;
2223 return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
2224}
2225
2226static void free_rcverr(struct hfi1_devdata *dd)
2227{
2228 if (dd->rcverr_timer.data)
2229 del_timer_sync(&dd->rcverr_timer);
2230 dd->rcverr_timer.data = 0;
2231}
2232
2233static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2234{
2235 char buf[96];
2236
2237 dd_dev_info(dd, "Receive Error: %s\n",
2238 rxe_err_status_string(buf, sizeof(buf), reg));
2239
2240 if (reg & ALL_RXE_FREEZE_ERR) {
2241 int flags = 0;
2242
2243 /*
2244 * Freeze mode recovery is disabled for the errors
2245 * in RXE_FREEZE_ABORT_MASK
2246 */
2247 if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK))
2248 flags = FREEZE_ABORT;
2249
2250 start_freeze_handling(dd->pport, flags);
2251 }
2252}
2253
2254static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2255{
2256 char buf[96];
2257
2258 dd_dev_info(dd, "Misc Error: %s",
2259 misc_err_status_string(buf, sizeof(buf), reg));
2260}
2261
2262static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2263{
2264 char buf[96];
2265
2266 dd_dev_info(dd, "PIO Error: %s\n",
2267 pio_err_status_string(buf, sizeof(buf), reg));
2268
2269 if (reg & ALL_PIO_FREEZE_ERR)
2270 start_freeze_handling(dd->pport, 0);
2271}
2272
2273static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2274{
2275 char buf[96];
2276
2277 dd_dev_info(dd, "SDMA Error: %s\n",
2278 sdma_err_status_string(buf, sizeof(buf), reg));
2279
2280 if (reg & ALL_SDMA_FREEZE_ERR)
2281 start_freeze_handling(dd->pport, 0);
2282}
2283
2284static void count_port_inactive(struct hfi1_devdata *dd)
2285{
2286 struct hfi1_pportdata *ppd = dd->pport;
2287
2288 if (ppd->port_xmit_discards < ~(u64)0)
2289 ppd->port_xmit_discards++;
2290}
2291
2292/*
2293 * We have had a "disallowed packet" error during egress. Determine the
2294 * integrity check which failed, and update the relevant error counter, etc.
2295 *
2296 * Note that the SEND_EGRESS_ERR_INFO register has only a single
2297 * bit of state per integrity check, and so we can miss the reason for an
2298 * egress error if more than one packet fails the same integrity check
2299 * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
2300 */
2301static void handle_send_egress_err_info(struct hfi1_devdata *dd)
2302{
2303 struct hfi1_pportdata *ppd = dd->pport;
2304 u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
2305 u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
2306 char buf[96];
2307
2308 /* clear down all observed info as quickly as possible after read */
2309 write_csr(dd, SEND_EGRESS_ERR_INFO, info);
2310
2311 dd_dev_info(dd,
2312 "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
2313 info, egress_err_info_string(buf, sizeof(buf), info), src);
2314
2315 /* Eventually add other counters for each bit */
2316
2317 if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
2318 if (ppd->port_xmit_discards < ~(u64)0)
2319 ppd->port_xmit_discards++;
2320 }
2321}
2322
2323/*
2324 * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
2325 * register. Does it represent a 'port inactive' error?
2326 */
2327static inline int port_inactive_err(u64 posn)
2328{
2329 return (posn >= SEES(TX_LINKDOWN) &&
2330 posn <= SEES(TX_INCORRECT_LINK_STATE));
2331}
2332
2333/*
2334 * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
2335 * register. Does it represent a 'disallowed packet' error?
2336 */
2337static inline int disallowed_pkt_err(u64 posn)
2338{
2339 return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
2340 posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
2341}
2342
2343static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2344{
2345 u64 reg_copy = reg, handled = 0;
2346 char buf[96];
2347
2348 if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
2349 start_freeze_handling(dd->pport, 0);
2350 if (is_a0(dd) && (reg &
2351 SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
2352 && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
2353 start_freeze_handling(dd->pport, 0);
2354
2355 while (reg_copy) {
2356 int posn = fls64(reg_copy);
2357 /*
2358 * fls64() returns a 1-based offset, but we generally
2359 * want 0-based offsets.
2360 */
2361 int shift = posn - 1;
2362
2363 if (port_inactive_err(shift)) {
2364 count_port_inactive(dd);
2365 handled |= (1ULL << shift);
2366 } else if (disallowed_pkt_err(shift)) {
2367 handle_send_egress_err_info(dd);
2368 handled |= (1ULL << shift);
2369 }
2370 clear_bit(shift, (unsigned long *)&reg_copy);
2371 }
2372
2373 reg &= ~handled;
2374
2375 if (reg)
2376 dd_dev_info(dd, "Egress Error: %s\n",
2377 egress_err_status_string(buf, sizeof(buf), reg));
2378}
2379
2380static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
2381{
2382 char buf[96];
2383
2384 dd_dev_info(dd, "Send Error: %s\n",
2385 send_err_status_string(buf, sizeof(buf), reg));
2386
2387}
2388
2389/*
2390 * The maximum number of times the error clear down will loop before
2391 * blocking a repeating error. This value is arbitrary.
2392 */
2393#define MAX_CLEAR_COUNT 20
2394
2395/*
2396 * Clear and handle an error register. All error interrupts are funneled
2397 * through here to have a central location to correctly handle single-
2398 * or multi-shot errors.
2399 *
2400 * For non per-context registers, call this routine with a context value
2401 * of 0 so the per-context offset is zero.
2402 *
2403 * If the handler loops too many times, assume that something is wrong
2404 * and can't be fixed, so mask the error bits.
2405 */
2406static void interrupt_clear_down(struct hfi1_devdata *dd,
2407 u32 context,
2408 const struct err_reg_info *eri)
2409{
2410 u64 reg;
2411 u32 count;
2412
2413 /* read in a loop until no more errors are seen */
2414 count = 0;
2415 while (1) {
2416 reg = read_kctxt_csr(dd, context, eri->status);
2417 if (reg == 0)
2418 break;
2419 write_kctxt_csr(dd, context, eri->clear, reg);
2420 if (likely(eri->handler))
2421 eri->handler(dd, context, reg);
2422 count++;
2423 if (count > MAX_CLEAR_COUNT) {
2424 u64 mask;
2425
2426 dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
2427 eri->desc, reg);
2428 /*
2429 * Read-modify-write so any other masked bits
2430 * remain masked.
2431 */
2432 mask = read_kctxt_csr(dd, context, eri->mask);
2433 mask &= ~reg;
2434 write_kctxt_csr(dd, context, eri->mask, mask);
2435 break;
2436 }
2437 }
2438}
2439
2440/*
2441 * CCE block "misc" interrupt. Source is < 16.
2442 */
2443static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
2444{
2445 const struct err_reg_info *eri = &misc_errs[source];
2446
2447 if (eri->handler) {
2448 interrupt_clear_down(dd, 0, eri);
2449 } else {
2450 dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
2451 source);
2452 }
2453}
2454
2455static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
2456{
2457 return flag_string(buf, buf_len, flags,
2458 sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
2459}
2460
2461/*
2462 * Send context error interrupt. Source (hw_context) is < 160.
2463 *
2464 * All send context errors cause the send context to halt. The normal
2465 * clear-down mechanism cannot be used because we cannot clear the
2466 * error bits until several other long-running items are done first.
2467 * This is OK because with the context halted, nothing else is going
2468 * to happen on it anyway.
2469 */
2470static void is_sendctxt_err_int(struct hfi1_devdata *dd,
2471 unsigned int hw_context)
2472{
2473 struct send_context_info *sci;
2474 struct send_context *sc;
2475 char flags[96];
2476 u64 status;
2477 u32 sw_index;
2478
2479 sw_index = dd->hw_to_sw[hw_context];
2480 if (sw_index >= dd->num_send_contexts) {
2481 dd_dev_err(dd,
2482 "out of range sw index %u for send context %u\n",
2483 sw_index, hw_context);
2484 return;
2485 }
2486 sci = &dd->send_contexts[sw_index];
2487 sc = sci->sc;
2488 if (!sc) {
2489 dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
2490 sw_index, hw_context);
2491 return;
2492 }
2493
2494 /* tell the software that a halt has begun */
2495 sc_stop(sc, SCF_HALTED);
2496
2497 status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
2498
2499 dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
2500 send_context_err_status_string(flags, sizeof(flags), status));
2501
2502 if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
2503 handle_send_egress_err_info(dd);
2504
2505 /*
2506 * Automatically restart halted kernel contexts out of interrupt
2507 * context. User contexts must ask the driver to restart the context.
2508 */
2509 if (sc->type != SC_USER)
2510 queue_work(dd->pport->hfi1_wq, &sc->halt_work);
2511}
2512
2513static void handle_sdma_eng_err(struct hfi1_devdata *dd,
2514 unsigned int source, u64 status)
2515{
2516 struct sdma_engine *sde;
2517
2518 sde = &dd->per_sdma[source];
2519#ifdef CONFIG_SDMA_VERBOSITY
2520 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
2521 slashstrip(__FILE__), __LINE__, __func__);
2522 dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
2523 sde->this_idx, source, (unsigned long long)status);
2524#endif
2525 sdma_engine_error(sde, status);
2526}
2527
2528/*
2529 * CCE block SDMA error interrupt. Source is < 16.
2530 */
2531static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
2532{
2533#ifdef CONFIG_SDMA_VERBOSITY
2534 struct sdma_engine *sde = &dd->per_sdma[source];
2535
2536 dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
2537 slashstrip(__FILE__), __LINE__, __func__);
2538 dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
2539 source);
2540 sdma_dumpstate(sde);
2541#endif
2542 interrupt_clear_down(dd, source, &sdma_eng_err);
2543}
2544
2545/*
2546 * CCE block "various" interrupt. Source is < 8.
2547 */
2548static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
2549{
2550 const struct err_reg_info *eri = &various_err[source];
2551
2552 /*
2553 * TCritInt cannot go through interrupt_clear_down()
2554 * because it is not a second tier interrupt. The handler
2555 * should be called directly.
2556 */
2557 if (source == TCRIT_INT_SOURCE)
2558 handle_temp_err(dd);
2559 else if (eri->handler)
2560 interrupt_clear_down(dd, 0, eri);
2561 else
2562 dd_dev_info(dd,
2563 "%s: Unimplemented/reserved interrupt %d\n",
2564 __func__, source);
2565}
2566
2567static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
2568{
2569 /* source is always zero */
2570 struct hfi1_pportdata *ppd = dd->pport;
2571 unsigned long flags;
2572 u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
2573
2574 if (reg & QSFP_HFI0_MODPRST_N) {
2575
2576 dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
2577 __func__);
2578
2579 if (!qsfp_mod_present(ppd)) {
2580 ppd->driver_link_ready = 0;
2581 /*
2582 * Cable removed, reset all our information about the
2583 * cache and cable capabilities
2584 */
2585
2586 spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
2587 /*
2588 * We don't set cache_refresh_required here as we expect
2589 * an interrupt when a cable is inserted
2590 */
2591 ppd->qsfp_info.cache_valid = 0;
2592 ppd->qsfp_info.qsfp_interrupt_functional = 0;
2593 spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
2594 flags);
2595 write_csr(dd,
2596 dd->hfi1_id ?
2597 ASIC_QSFP2_INVERT :
2598 ASIC_QSFP1_INVERT,
2599 qsfp_int_mgmt);
2600 if (ppd->host_link_state == HLS_DN_POLL) {
2601 /*
2602 * The link is still in POLL. This means
2603 * that the normal link down processing
2604 * will not happen. We have to do it here
2605 * before turning the DC off.
2606 */
2607 queue_work(ppd->hfi1_wq, &ppd->link_down_work);
2608 }
2609 } else {
2610 spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
2611 ppd->qsfp_info.cache_valid = 0;
2612 ppd->qsfp_info.cache_refresh_required = 1;
2613 spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
2614 flags);
2615
2616 qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
2617 write_csr(dd,
2618 dd->hfi1_id ?
2619 ASIC_QSFP2_INVERT :
2620 ASIC_QSFP1_INVERT,
2621 qsfp_int_mgmt);
2622 }
2623 }
2624
2625 if (reg & QSFP_HFI0_INT_N) {
2626
2627 dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
2628 __func__);
2629 spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
2630 ppd->qsfp_info.check_interrupt_flags = 1;
2631 ppd->qsfp_info.qsfp_interrupt_functional = 1;
2632 spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
2633 }
2634
2635 /* Schedule the QSFP work only if there is a cable attached. */
2636 if (qsfp_mod_present(ppd))
2637 queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
2638}
2639
2640static int request_host_lcb_access(struct hfi1_devdata *dd)
2641{
2642 int ret;
2643
2644 ret = do_8051_command(dd, HCMD_MISC,
2645 (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
2646 NULL);
2647 if (ret != HCMD_SUCCESS) {
2648 dd_dev_err(dd, "%s: command failed with error %d\n",
2649 __func__, ret);
2650 }
2651 return ret == HCMD_SUCCESS ? 0 : -EBUSY;
2652}
2653
2654static int request_8051_lcb_access(struct hfi1_devdata *dd)
2655{
2656 int ret;
2657
2658 ret = do_8051_command(dd, HCMD_MISC,
2659 (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
2660 NULL);
2661 if (ret != HCMD_SUCCESS) {
2662 dd_dev_err(dd, "%s: command failed with error %d\n",
2663 __func__, ret);
2664 }
2665 return ret == HCMD_SUCCESS ? 0 : -EBUSY;
2666}
2667
2668/*
2669 * Set the LCB selector - allow host access. The DCC selector always
2670 * points to the host.
2671 */
2672static inline void set_host_lcb_access(struct hfi1_devdata *dd)
2673{
2674 write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
2675 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
2676 | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
2677}
2678
2679/*
2680 * Clear the LCB selector - allow 8051 access. The DCC selector always
2681 * points to the host.
2682 */
2683static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
2684{
2685 write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
2686 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
2687}
2688
2689/*
2690 * Acquire LCB access from the 8051. If the host already has access,
2691 * just increment a counter. Otherwise, inform the 8051 that the
2692 * host is taking access.
2693 *
2694 * Returns:
2695 * 0 on success
2696 * -EBUSY if the 8051 has control and cannot be disturbed
2697 * -errno if unable to acquire access from the 8051
2698 */
2699int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
2700{
2701 struct hfi1_pportdata *ppd = dd->pport;
2702 int ret = 0;
2703
2704 /*
2705 * Use the host link state lock so the operation of this routine
2706 * { link state check, selector change, count increment } can occur
2707 * as a unit against a link state change. Otherwise there is a
2708 * race between the state change and the count increment.
2709 */
2710 if (sleep_ok) {
2711 mutex_lock(&ppd->hls_lock);
2712 } else {
2713 while (!mutex_trylock(&ppd->hls_lock))
2714 udelay(1);
2715 }
2716
2717 /* this access is valid only when the link is up */
2718 if ((ppd->host_link_state & HLS_UP) == 0) {
2719 dd_dev_info(dd, "%s: link state %s not up\n",
2720 __func__, link_state_name(ppd->host_link_state));
2721 ret = -EBUSY;
2722 goto done;
2723 }
2724
2725 if (dd->lcb_access_count == 0) {
2726 ret = request_host_lcb_access(dd);
2727 if (ret) {
2728 dd_dev_err(dd,
2729 "%s: unable to acquire LCB access, err %d\n",
2730 __func__, ret);
2731 goto done;
2732 }
2733 set_host_lcb_access(dd);
2734 }
2735 dd->lcb_access_count++;
2736done:
2737 mutex_unlock(&ppd->hls_lock);
2738 return ret;
2739}
2740
2741/*
2742 * Release LCB access by decrementing the use count. If the count is moving
2743 * from 1 to 0, inform the 8051 that it has control back.
2744 *
2745 * Returns:
2746 * 0 on success
2747 * -errno if unable to release access to the 8051
2748 */
2749int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
2750{
2751 int ret = 0;
2752
2753 /*
2754 * Use the host link state lock because the acquire needed it.
2755 * Here, we only need to keep { selector change, count decrement }
2756 * as a unit.
2757 */
2758 if (sleep_ok) {
2759 mutex_lock(&dd->pport->hls_lock);
2760 } else {
2761 while (!mutex_trylock(&dd->pport->hls_lock))
2762 udelay(1);
2763 }
2764
2765 if (dd->lcb_access_count == 0) {
2766 dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n",
2767 __func__);
2768 goto done;
2769 }
2770
2771 if (dd->lcb_access_count == 1) {
2772 set_8051_lcb_access(dd);
2773 ret = request_8051_lcb_access(dd);
2774 if (ret) {
2775 dd_dev_err(dd,
2776 "%s: unable to release LCB access, err %d\n",
2777 __func__, ret);
2778 /* restore host access if the grant didn't work */
2779 set_host_lcb_access(dd);
2780 goto done;
2781 }
2782 }
2783 dd->lcb_access_count--;
2784done:
2785 mutex_unlock(&dd->pport->hls_lock);
2786 return ret;
2787}
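
/*
 * Usage sketch (illustrative only; DC_LCB_STS_ROUND_TRIP_LTP_CNT is used
 * purely as an example register): a caller that needs direct host access
 * to an LCB CSR while the link is up brackets the access with the
 * acquire/release pair:
 *
 *	if (acquire_lcb_access(dd, 1) == 0) {
 *		reg = read_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT);
 *		release_lcb_access(dd, 1);
 *	}
 */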
2788
2789/*
2790 * Initialize LCB access variables and state. Called during driver load,
2791 * after most of the initialization is finished.
2792 *
2793 * The DC default is LCB access on for the host. The driver defaults to
2794 * leaving access to the 8051. Assign access now - this constrains the call
2795 * to this routine to be after all LCB set-up is done. In particular, after
2796 * hfi1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
2797 */
2798static void init_lcb_access(struct hfi1_devdata *dd)
2799{
2800 dd->lcb_access_count = 0;
2801}
2802
2803/*
2804 * Write a response back to an 8051 request.
2805 */
2806static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
2807{
2808 write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
2809 DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
2810 | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
2811 | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
2812}
2813
2814/*
2815 * Handle requests from the 8051.
2816 */
2817static void handle_8051_request(struct hfi1_devdata *dd)
2818{
2819 u64 reg;
2820 u16 data;
2821 u8 type;
2822
2823 reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
2824 if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
2825 return; /* no request */
2826
2827 /* zero out COMPLETED so the response is seen */
2828 write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
2829
2830 /* extract request details */
2831 type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
2832 & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
2833 data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
2834 & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
2835
2836 switch (type) {
2837 case HREQ_LOAD_CONFIG:
2838 case HREQ_SAVE_CONFIG:
2839 case HREQ_READ_CONFIG:
2840 case HREQ_SET_TX_EQ_ABS:
2841 case HREQ_SET_TX_EQ_REL:
2842 case HREQ_ENABLE:
2843 dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
2844 type);
2845 hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
2846 break;
2847
2848 case HREQ_CONFIG_DONE:
2849 hreq_response(dd, HREQ_SUCCESS, 0);
2850 break;
2851
2852 case HREQ_INTERFACE_TEST:
2853 hreq_response(dd, HREQ_SUCCESS, data);
2854 break;
2855
2856 default:
2857 dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
2858 hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
2859 break;
2860 }
2861}
2862
2863static void write_global_credit(struct hfi1_devdata *dd,
2864 u8 vau, u16 total, u16 shared)
2865{
2866 write_csr(dd, SEND_CM_GLOBAL_CREDIT,
2867 ((u64)total
2868 << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
2869 | ((u64)shared
2870 << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
2871 | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
2872}
2873
2874/*
2875 * Set up initial VL15 credits of the remote. Assumes the rest of
2876 * the CM credit registers are zero from a previous global or credit reset.
2877 */
2878void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
2879{
2880 /* leave shared count at zero for both global and VL15 */
2881 write_global_credit(dd, vau, vl15buf, 0);
2882
2883 /* We may need some credits for another VL when sending packets
2884 * with the snoop interface. Dividing it down the middle for VL15
2885 * and VL0 should suffice.
2886 */
2887 if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
2888 write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
2889 << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
2890 write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
2891 << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
2892 } else {
2893 write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
2894 << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
2895 }
2896}
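
/*
 * For example (illustrative value): with vl15buf == 0x80, snoop mode gives
 * VL15 and VL0 a dedicated limit of 0x40 credits each; otherwise all 0x80
 * credits are dedicated to VL15.
 */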
2897
2898/*
2899 * Zero all credit details from the previous connection and
2900 * reset the CM manager's internal counters.
2901 */
2902void reset_link_credits(struct hfi1_devdata *dd)
2903{
2904 int i;
2905
2906 /* remove all previous VL credit limits */
2907 for (i = 0; i < TXE_NUM_DATA_VL; i++)
2908 write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
2909 write_csr(dd, SEND_CM_CREDIT_VL15, 0);
2910 write_global_credit(dd, 0, 0, 0);
2911 /* reset the CM block */
2912 pio_send_control(dd, PSC_CM_RESET);
2913}
2914
2915/* convert a vCU to a CU */
2916static u32 vcu_to_cu(u8 vcu)
2917{
2918 return 1 << vcu;
2919}
2920
2921/* convert a CU to a vCU */
2922static u8 cu_to_vcu(u32 cu)
2923{
2924 return ilog2(cu);
2925}
2926
2927/* convert a vAU to an AU */
2928static u32 vau_to_au(u8 vau)
2929{
2930 return 8 * (1 << vau);
2931}
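
/*
 * Worked examples of the encodings above: AU = 8 * 2^vAU bytes, so vAU
 * values 0, 1, 2 and 3 mean 8, 16, 32 and 64 byte allocation units.
 * Likewise CU = 2^vCU, and cu_to_vcu() is the inverse for power-of-two
 * CU values (e.g. cu_to_vcu(4) == 2).
 */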
2932
2933static void set_linkup_defaults(struct hfi1_pportdata *ppd)
2934{
2935 ppd->sm_trap_qp = 0x0;
2936 ppd->sa_qp = 0x1;
2937}
2938
2939/*
2940 * Graceful LCB shutdown. This leaves the LCB FIFOs in reset.
2941 */
2942static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
2943{
2944 u64 reg;
2945
2946 /* clear lcb run: LCB_CFG_RUN.EN = 0 */
2947 write_csr(dd, DC_LCB_CFG_RUN, 0);
2948 /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
2949 write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
2950 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
2951 /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
2952 dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
2953 reg = read_csr(dd, DCC_CFG_RESET);
2954 write_csr(dd, DCC_CFG_RESET,
2955 reg
2956 | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
2957 | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
2958 (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
2959 if (!abort) {
2960 udelay(1); /* must hold for the longer of 16cclks or 20ns */
2961 write_csr(dd, DCC_CFG_RESET, reg);
2962 write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
2963 }
2964}
2965
2966/*
2967 * This routine should be called after the link has been transitioned to
2968 * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
2969 * reset).
2970 *
2971 * The expectation is that the caller of this routine would have taken
2972 * care of properly transitioning the link into the correct state.
2973 */
2974static void dc_shutdown(struct hfi1_devdata *dd)
2975{
2976 unsigned long flags;
2977
2978 spin_lock_irqsave(&dd->dc8051_lock, flags);
2979 if (dd->dc_shutdown) {
2980 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
2981 return;
2982 }
2983 dd->dc_shutdown = 1;
2984 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
2985 /* Shutdown the LCB */
2986 lcb_shutdown(dd, 1);
2987 /* Going to OFFLINE would have caused the 8051 to put the
2988 * SerDes into reset already. Just need to shut down the 8051
2989 * itself. */
2990 write_csr(dd, DC_DC8051_CFG_RST, 0x1);
2991}
2992
2993/* Calling this after the DC has been brought out of reset should not
2994 * do any damage. */
2995static void dc_start(struct hfi1_devdata *dd)
2996{
2997 unsigned long flags;
2998 int ret;
2999
3000 spin_lock_irqsave(&dd->dc8051_lock, flags);
3001 if (!dd->dc_shutdown)
3002 goto done;
3003 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
3004 /* Take the 8051 out of reset */
3005 write_csr(dd, DC_DC8051_CFG_RST, 0ull);
3006 /* Wait until 8051 is ready */
3007 ret = wait_fm_ready(dd, TIMEOUT_8051_START);
3008 if (ret) {
3009 dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
3010 __func__);
3011 }
3012 /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
3013 write_csr(dd, DCC_CFG_RESET, 0x10);
3014 /* lcb_shutdown() with abort=1 does not restore these */
3015 write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
3016 spin_lock_irqsave(&dd->dc8051_lock, flags);
3017 dd->dc_shutdown = 0;
3018done:
3019 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
3020}
3021
3022/*
3023 * These LCB adjustments are for the Aurora SerDes core in the FPGA.
3024 */
3025static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
3026{
3027 u64 rx_radr, tx_radr;
3028 u32 version;
3029
3030 if (dd->icode != ICODE_FPGA_EMULATION)
3031 return;
3032
3033 /*
3034 * These LCB defaults on emulator _s are good, nothing to do here:
3035 * LCB_CFG_TX_FIFOS_RADR
3036 * LCB_CFG_RX_FIFOS_RADR
3037 * LCB_CFG_LN_DCLK
3038 * LCB_CFG_IGNORE_LOST_RCLK
3039 */
3040 if (is_emulator_s(dd))
3041 return;
3042 /* else this is _p */
3043
3044 version = emulator_rev(dd);
3045 if (!is_a0(dd))
3046 version = 0x2d; /* all B0 use 0x2d or higher settings */
3047
3048 if (version <= 0x12) {
3049 /* release 0x12 and below */
3050
3051 /*
3052 * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
3053 * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
3054 * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
3055 */
3056 rx_radr =
3057 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
3058 | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
3059 | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
3060 /*
3061 * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
3062 * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
3063 */
3064 tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
3065 } else if (version <= 0x18) {
3066 /* release 0x13 up to 0x18 */
3067 /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
3068 rx_radr =
3069 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
3070 | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
3071 | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
3072 tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
3073 } else if (version == 0x19) {
3074 /* release 0x19 */
3075 /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
3076 rx_radr =
3077 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
3078 | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
3079 | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
3080 tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
3081 } else if (version == 0x1a) {
3082 /* release 0x1a */
3083 /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
3084 rx_radr =
3085 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
3086 | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
3087 | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
3088 tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
3089 write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
3090 } else {
3091 /* release 0x1b and higher */
3092 /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
3093 rx_radr =
3094 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
3095 | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
3096 | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
3097 tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
3098 }
3099
3100 write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
3101 /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
3102 write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
3103 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
3104 write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
3105}
3106
3107/*
3108 * Handle a SMA idle message
3109 *
3110 * This is a work-queue function outside of the interrupt.
3111 */
3112void handle_sma_message(struct work_struct *work)
3113{
3114 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3115 sma_message_work);
3116 struct hfi1_devdata *dd = ppd->dd;
3117 u64 msg;
3118 int ret;
3119
3120 /* msg is bytes 1-4 of the 40-bit idle message - the command code
3121 * is stripped off */
3122 ret = read_idle_sma(dd, &msg);
3123 if (ret)
3124 return;
3125 dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
3126 /*
3127 * React to the SMA message. Byte[1] (0 for us) is the command.
3128 */
3129 switch (msg & 0xff) {
3130 case SMA_IDLE_ARM:
3131 /*
3132 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
3133 * State Transitions
3134 *
3135 * Only expected in INIT or ARMED, discard otherwise.
3136 */
3137 if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
3138 ppd->neighbor_normal = 1;
3139 break;
3140 case SMA_IDLE_ACTIVE:
3141 /*
3142 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
3143 * State Transitions
3144 *
3145 * Can activate the node. Discard otherwise.
3146 */
3147 if (ppd->host_link_state == HLS_UP_ARMED
3148 && ppd->is_active_optimize_enabled) {
3149 ppd->neighbor_normal = 1;
3150 ret = set_link_state(ppd, HLS_UP_ACTIVE);
3151 if (ret)
3152 dd_dev_err(
3153 dd,
3154 "%s: received Active SMA idle message, couldn't set link to Active\n",
3155 __func__);
3156 }
3157 break;
3158 default:
3159 dd_dev_err(dd,
3160 "%s: received unexpected SMA idle message 0x%llx\n",
3161 __func__, msg);
3162 break;
3163 }
3164}
3165
3166static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
3167{
3168 u64 rcvctrl;
3169 unsigned long flags;
3170
3171 spin_lock_irqsave(&dd->rcvctrl_lock, flags);
3172 rcvctrl = read_csr(dd, RCV_CTRL);
3173 rcvctrl |= add;
3174 rcvctrl &= ~clear;
3175 write_csr(dd, RCV_CTRL, rcvctrl);
3176 spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
3177}
3178
3179static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
3180{
3181 adjust_rcvctrl(dd, add, 0);
3182}
3183
3184static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
3185{
3186 adjust_rcvctrl(dd, 0, clear);
3187}
3188
3189/*
3190 * Called from all interrupt handlers to start handling an SPC freeze.
3191 */
3192void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
3193{
3194 struct hfi1_devdata *dd = ppd->dd;
3195 struct send_context *sc;
3196 int i;
3197
3198 if (flags & FREEZE_SELF)
3199 write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
3200
3201 /* enter frozen mode */
3202 dd->flags |= HFI1_FROZEN;
3203
3204 /* notify all SDMA engines that they are going into a freeze */
3205 sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
3206
3207 /* do halt pre-handling on all enabled send contexts */
3208 for (i = 0; i < dd->num_send_contexts; i++) {
3209 sc = dd->send_contexts[i].sc;
3210 if (sc && (sc->flags & SCF_ENABLED))
3211 sc_stop(sc, SCF_FROZEN | SCF_HALTED);
3212 }
3213
3214 /* Send context are frozen. Notify user space */
3215 hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
3216
3217 if (flags & FREEZE_ABORT) {
3218 dd_dev_err(dd,
3219 "Aborted freeze recovery. Please REBOOT system\n");
3220 return;
3221 }
3222 /* queue non-interrupt handler */
3223 queue_work(ppd->hfi1_wq, &ppd->freeze_work);
3224}
3225
3226/*
3227 * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
3228 * depending on the "freeze" parameter.
3229 *
3230 * No need to return an error if it times out; our only option
3231 * is to proceed anyway.
3232 */
3233static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
3234{
3235 unsigned long timeout;
3236 u64 reg;
3237
3238 timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
3239 while (1) {
3240 reg = read_csr(dd, CCE_STATUS);
3241 if (freeze) {
3242 /* waiting until all indicators are set */
3243 if ((reg & ALL_FROZE) == ALL_FROZE)
3244 return; /* all done */
3245 } else {
3246 /* waiting until all indicators are clear */
3247 if ((reg & ALL_FROZE) == 0)
3248 return; /* all done */
3249 }
3250
3251 if (time_after(jiffies, timeout)) {
3252 dd_dev_err(dd,
3253 "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
3254 freeze ? "" : "un",
3255 reg & ALL_FROZE,
3256 freeze ? ALL_FROZE : 0ull);
3257 return;
3258 }
3259 usleep_range(80, 120);
3260 }
3261}
3262
3263/*
3264 * Do all freeze handling for the RXE block.
3265 */
3266static void rxe_freeze(struct hfi1_devdata *dd)
3267{
3268 int i;
3269
3270 /* disable port */
3271 clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
3272
3273 /* disable all receive contexts */
3274 for (i = 0; i < dd->num_rcv_contexts; i++)
3275 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
3276}
3277
3278/*
3279 * Unfreeze handling for the RXE block - kernel contexts only.
3280 * This will also enable the port. User contexts will do unfreeze
3281 * handling on a per-context basis as they call into the driver.
3282 *
3283 */
3284static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
3285{
3286 int i;
3287
3288 /* enable all kernel contexts */
3289 for (i = 0; i < dd->n_krcv_queues; i++)
3290 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
3291
3292 /* enable port */
3293 add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
3294}
3295
3296/*
3297 * Non-interrupt SPC freeze handling.
3298 *
3299 * This is a work-queue function outside of the triggering interrupt.
3300 */
3301void handle_freeze(struct work_struct *work)
3302{
3303 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3304 freeze_work);
3305 struct hfi1_devdata *dd = ppd->dd;
3306
3307 /* wait for freeze indicators on all affected blocks */
3308 dd_dev_info(dd, "Entering SPC freeze\n");
3309 wait_for_freeze_status(dd, 1);
3310
3311 /* SPC is now frozen */
3312
3313 /* do send PIO freeze steps */
3314 pio_freeze(dd);
3315
3316 /* do send DMA freeze steps */
3317 sdma_freeze(dd);
3318
3319 /* do send egress freeze steps - nothing to do */
3320
3321 /* do receive freeze steps */
3322 rxe_freeze(dd);
3323
3324 /*
3325 * Unfreeze the hardware - clear the freeze, wait for each
3326 * block's frozen bit to clear, then clear the frozen flag.
3327 */
3328 write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
3329 wait_for_freeze_status(dd, 0);
3330
3331 if (is_a0(dd)) {
3332 write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
3333 wait_for_freeze_status(dd, 1);
3334 write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
3335 wait_for_freeze_status(dd, 0);
3336 }
3337
3338 /* do send PIO unfreeze steps for kernel contexts */
3339 pio_kernel_unfreeze(dd);
3340
3341 /* do send DMA unfreeze steps */
3342 sdma_unfreeze(dd);
3343
3344 /* do send egress unfreeze steps - nothing to do */
3345
3346 /* do receive unfreeze steps for kernel contexts */
3347 rxe_kernel_unfreeze(dd);
3348
3349 /*
3350 * The unfreeze procedure touches global device registers when
3351 * it disables and re-enables RXE. Mark the device unfrozen
3352 * after all that is done so other parts of the driver waiting
3353 * for the device to unfreeze don't do things out of order.
3354 *
3355 * The above implies that the meaning of the HFI1_FROZEN flag is
3356 * "Device has gone into freeze mode and freeze mode handling
3357 * is still in progress."
3358 *
3359 * The flag will be removed when freeze mode processing has
3360 * completed.
3361 */
3362 dd->flags &= ~HFI1_FROZEN;
3363 wake_up(&dd->event_queue);
3364
3365 /* no longer frozen */
3366 dd_dev_err(dd, "Exiting SPC freeze\n");
3367}
3368
3369/*
3370 * Handle a link up interrupt from the 8051.
3371 *
3372 * This is a work-queue function outside of the interrupt.
3373 */
3374void handle_link_up(struct work_struct *work)
3375{
3376 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3377 link_up_work);
3378 set_link_state(ppd, HLS_UP_INIT);
3379
3380 /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
3381 read_ltp_rtt(ppd->dd);
3382 /*
3383 * OPA specifies that certain counters are cleared on a transition
3384 * to link up, so do that.
3385 */
3386 clear_linkup_counters(ppd->dd);
3387 /*
3388 * And (re)set link up default values.
3389 */
3390 set_linkup_defaults(ppd);
3391
3392 /* enforce link speed enabled */
3393 if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
3394 /* oops - current speed is not enabled, bounce */
3395 dd_dev_err(ppd->dd,
3396 "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
3397 ppd->link_speed_active, ppd->link_speed_enabled);
3398 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
3399 OPA_LINKDOWN_REASON_SPEED_POLICY);
3400 set_link_state(ppd, HLS_DN_OFFLINE);
3401 start_link(ppd);
3402 }
3403}
3404
3405/* Several pieces of LNI information were cached for SMA in ppd.
3406 * Reset these on link down */
3407static void reset_neighbor_info(struct hfi1_pportdata *ppd)
3408{
3409 ppd->neighbor_guid = 0;
3410 ppd->neighbor_port_number = 0;
3411 ppd->neighbor_type = 0;
3412 ppd->neighbor_fm_security = 0;
3413}
3414
3415/*
3416 * Handle a link down interrupt from the 8051.
3417 *
3418 * This is a work-queue function outside of the interrupt.
3419 */
3420void handle_link_down(struct work_struct *work)
3421{
3422 u8 lcl_reason, neigh_reason = 0;
3423 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3424 link_down_work);
3425
3426 /* go offline first, then deal with reasons */
3427 set_link_state(ppd, HLS_DN_OFFLINE);
3428
3429 lcl_reason = 0;
3430 read_planned_down_reason_code(ppd->dd, &neigh_reason);
3431
3432 /*
3433 * If no reason, assume peer-initiated but missed
3434 * LinkGoingDown idle flits.
3435 */
3436 if (neigh_reason == 0)
3437 lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
3438
3439 set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
3440
3441 reset_neighbor_info(ppd);
3442
3443 /* disable the port */
3444 clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
3445
3446 /* If there is no cable attached, turn the DC off. Otherwise,
3447 * start the link bring-up. */
3448 if (!qsfp_mod_present(ppd))
3449 dc_shutdown(ppd->dd);
3450 else
3451 start_link(ppd);
3452}
3453
3454void handle_link_bounce(struct work_struct *work)
3455{
3456 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3457 link_bounce_work);
3458
3459 /*
3460 * Only do something if the link is currently up.
3461 */
3462 if (ppd->host_link_state & HLS_UP) {
3463 set_link_state(ppd, HLS_DN_OFFLINE);
3464 start_link(ppd);
3465 } else {
3466 dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
3467 __func__, link_state_name(ppd->host_link_state));
3468 }
3469}
3470
3471/*
3472 * Mask conversion: Capability exchange to Port LTP. The capability
3473 * exchange has an implicit 16b CRC that is mandatory.
3474 */
3475static int cap_to_port_ltp(int cap)
3476{
3477 int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
3478
3479 if (cap & CAP_CRC_14B)
3480 port_ltp |= PORT_LTP_CRC_MODE_14;
3481 if (cap & CAP_CRC_48B)
3482 port_ltp |= PORT_LTP_CRC_MODE_48;
3483 if (cap & CAP_CRC_12B_16B_PER_LANE)
3484 port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
3485
3486 return port_ltp;
3487}
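
/*
 * For example (illustrative mask): a capability mask of
 * CAP_CRC_14B | CAP_CRC_48B converts to
 * PORT_LTP_CRC_MODE_16 | PORT_LTP_CRC_MODE_14 | PORT_LTP_CRC_MODE_48,
 * since the mandatory 16b CRC mode is always part of the result.
 */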
3488
3489/*
3490 * Convert an OPA Port LTP mask to capability mask
3491 */
3492int port_ltp_to_cap(int port_ltp)
3493{
3494 int cap_mask = 0;
3495
3496 if (port_ltp & PORT_LTP_CRC_MODE_14)
3497 cap_mask |= CAP_CRC_14B;
3498 if (port_ltp & PORT_LTP_CRC_MODE_48)
3499 cap_mask |= CAP_CRC_48B;
3500 if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
3501 cap_mask |= CAP_CRC_12B_16B_PER_LANE;
3502
3503 return cap_mask;
3504}
3505
3506/*
3507 * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
3508 */
3509static int lcb_to_port_ltp(int lcb_crc)
3510{
3511 int port_ltp = 0;
3512
3513 if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
3514 port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
3515 else if (lcb_crc == LCB_CRC_48B)
3516 port_ltp = PORT_LTP_CRC_MODE_48;
3517 else if (lcb_crc == LCB_CRC_14B)
3518 port_ltp = PORT_LTP_CRC_MODE_14;
3519 else
3520 port_ltp = PORT_LTP_CRC_MODE_16;
3521
3522 return port_ltp;
3523}
3524
3525/*
3526 * Our neighbor has indicated that we are allowed to act as a fabric
3527 * manager, so place the full management partition key in the second
3528 * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
3529 * that we should already have the limited management partition key in
3530 * array element 1, and also that the port is not yet up when
3531 * add_full_mgmt_pkey() is invoked.
3532 */
3533static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
3534{
3535 struct hfi1_devdata *dd = ppd->dd;
3536
3537 /* Sanity check - ppd->pkeys[2] should be 0 */
3538 if (ppd->pkeys[2] != 0)
3539 dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
3540 __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
3541 ppd->pkeys[2] = FULL_MGMT_P_KEY;
3542 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
3543}
3544
3545/*
3546 * Convert the given link width to the OPA link width bitmask.
3547 */
3548static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
3549{
3550 switch (width) {
3551 case 0:
3552 /*
3553 * Simulator and quick linkup do not set the width.
3554 * Just set it to 4x without complaint.
3555 */
3556 if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
3557 return OPA_LINK_WIDTH_4X;
3558 return 0; /* no lanes up */
3559 case 1: return OPA_LINK_WIDTH_1X;
3560 case 2: return OPA_LINK_WIDTH_2X;
3561 case 3: return OPA_LINK_WIDTH_3X;
3562 default:
3563 dd_dev_info(dd, "%s: invalid width %d, using 4\n",
3564 __func__, width);
3565 /* fall through */
3566 case 4: return OPA_LINK_WIDTH_4X;
3567 }
3568}
3569
3570/*
3571 * Do a population count on the bottom nibble.
3572 */
3573static const u8 bit_counts[16] = {
3574 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
3575};
3576static inline u8 nibble_to_count(u8 nibble)
3577{
3578 return bit_counts[nibble & 0xf];
3579}
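
/*
 * For example, an enable-lane nibble of 0xb (binary 1011) has three bits
 * set, so nibble_to_count(0xb) reports 3 active lanes.
 */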
3580
3581/*
3582 * Read the active lane information from the 8051 registers and return
3583 * their widths.
3584 *
3585 * Active lane information is found in these 8051 registers:
3586 * enable_lane_tx
3587 * enable_lane_rx
3588 */
3589static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
3590 u16 *rx_width)
3591{
3592 u16 tx, rx;
3593 u8 enable_lane_rx;
3594 u8 enable_lane_tx;
3595 u8 tx_polarity_inversion;
3596 u8 rx_polarity_inversion;
3597 u8 max_rate;
3598
3599 /* read the active lanes */
3600 read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
3601 &rx_polarity_inversion, &max_rate);
3602 read_local_lni(dd, &enable_lane_rx);
3603
3604 /* convert to counts */
3605 tx = nibble_to_count(enable_lane_tx);
3606 rx = nibble_to_count(enable_lane_rx);
3607
3608 /*
3609 * Set link_speed_active here, overriding what was set in
3610 * handle_verify_cap(). The ASIC 8051 firmware does not correctly
3611 * set the max_rate field in handle_verify_cap until v0.19.
3612 */
3613 if ((dd->icode == ICODE_RTL_SILICON)
3614 && (dd->dc8051_ver < dc8051_ver(0, 19))) {
3615 /* max_rate: 0 = 12.5G, 1 = 25G */
3616 switch (max_rate) {
3617 case 0:
3618 dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
3619 break;
3620 default:
3621 dd_dev_err(dd,
3622 "%s: unexpected max rate %d, using 25Gb\n",
3623 __func__, (int)max_rate);
3624 /* fall through */
3625 case 1:
3626 dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
3627 break;
3628 }
3629 }
3630
3631 dd_dev_info(dd,
3632 "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
3633 enable_lane_tx, tx, enable_lane_rx, rx);
3634 *tx_width = link_width_to_bits(dd, tx);
3635 *rx_width = link_width_to_bits(dd, rx);
3636}
3637
3638/*
3639 * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
3640 * Valid after the end of VerifyCap and during LinkUp. Does not change
3641 * after link up. I.e. look elsewhere for downgrade information.
3642 *
3643 * Bits are:
3644 * + bits [7:4] contain the number of active transmitters
3645 * + bits [3:0] contain the number of active receivers
3646 * These are numbers 1 through 4 and can be different values if the
3647 * link is asymmetric.
3648 *
3649 * verify_cap_local_fm_link_width[0] retains its original value.
3650 */
3651static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
3652 u16 *rx_width)
3653{
3654 u16 widths, tx, rx;
3655 u8 misc_bits, local_flags;
3656 u16 active_tx, active_rx;
3657
3658 read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
3659 tx = widths >> 12;
3660 rx = (widths >> 8) & 0xf;
3661
3662 *tx_width = link_width_to_bits(dd, tx);
3663 *rx_width = link_width_to_bits(dd, rx);
3664
3665 /* print the active widths */
3666 get_link_widths(dd, &active_tx, &active_rx);
3667}
3668
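/*
 * Example decode of the frame read in get_linkup_widths() (hypothetical
 * value): widths == 0x3400 gives tx = 0x3400 >> 12 = 3 and
 * rx = (0x3400 >> 8) & 0xf = 4, i.e. an asymmetric 3x-transmit,
 * 4x-receive link, which link_width_to_bits() converts to
 * OPA_LINK_WIDTH_3X and OPA_LINK_WIDTH_4X respectively.
 */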
3669/*
3670 * Set ppd->link_width_active and ppd->link_width_downgrade_active using
3671 * hardware information when the link first comes up.
3672 *
3673 * The link width is not available until after VerifyCap.AllFramesReceived
3674 * (the trigger for handle_verify_cap), so this is outside that routine
3675 * and should be called when the 8051 signals linkup.
3676 */
3677void get_linkup_link_widths(struct hfi1_pportdata *ppd)
3678{
3679 u16 tx_width, rx_width;
3680
3681 /* get end-of-LNI link widths */
3682 get_linkup_widths(ppd->dd, &tx_width, &rx_width);
3683
3684 /* use tx_width as the link is supposed to be symmetric on link up */
3685 ppd->link_width_active = tx_width;
3686 /* link width downgrade active (LWD.A) starts out matching LW.A */
3687 ppd->link_width_downgrade_tx_active = ppd->link_width_active;
3688 ppd->link_width_downgrade_rx_active = ppd->link_width_active;
3689 /* per OPA spec, on link up LWD.E resets to LWD.S */
3690 ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
3691 /* cache the active egress rate (units [10^6 bits/sec]) */
3692 ppd->current_egress_rate = active_egress_rate(ppd);
3693}
3694
3695/*
3696 * Handle a verify capabilities interrupt from the 8051.
3697 *
3698 * This is a work-queue function outside of the interrupt.
3699 */
3700void handle_verify_cap(struct work_struct *work)
3701{
3702 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3703 link_vc_work);
3704 struct hfi1_devdata *dd = ppd->dd;
3705 u64 reg;
3706 u8 power_management;
3707 u8 continuous;
3708 u8 vcu;
3709 u8 vau;
3710 u8 z;
3711 u16 vl15buf;
3712 u16 link_widths;
3713 u16 crc_mask;
3714 u16 crc_val;
3715 u16 device_id;
3716 u16 active_tx, active_rx;
3717 u8 partner_supported_crc;
3718 u8 remote_tx_rate;
3719 u8 device_rev;
3720
3721 set_link_state(ppd, HLS_VERIFY_CAP);
3722
3723 lcb_shutdown(dd, 0);
3724 adjust_lcb_for_fpga_serdes(dd);
3725
3726 /*
3727 * These are now valid:
3728 * remote VerifyCap fields in the general LNI config
3729 * CSR DC8051_STS_REMOTE_GUID
3730 * CSR DC8051_STS_REMOTE_NODE_TYPE
3731 * CSR DC8051_STS_REMOTE_FM_SECURITY
3732 * CSR DC8051_STS_REMOTE_PORT_NO
3733 */
3734
3735 read_vc_remote_phy(dd, &power_management, &continuous);
3736 read_vc_remote_fabric(
3737 dd,
3738 &vau,
3739 &z,
3740 &vcu,
3741 &vl15buf,
3742 &partner_supported_crc);
3743 read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
3744 read_remote_device_id(dd, &device_id, &device_rev);
3745 /*
3746 * And the 'MgmtAllowed' information, which is exchanged during
3747 * LNI, is also available at this point.
3748 */
3749 read_mgmt_allowed(dd, &ppd->mgmt_allowed);
3750 /* print the active widths */
3751 get_link_widths(dd, &active_tx, &active_rx);
3752 dd_dev_info(dd,
3753 "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
3754 (int)power_management, (int)continuous);
3755 dd_dev_info(dd,
3756 "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
3757 (int)vau,
3758 (int)z,
3759 (int)vcu,
3760 (int)vl15buf,
3761 (int)partner_supported_crc);
3762 dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
3763 (u32)remote_tx_rate, (u32)link_widths);
3764 dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
3765 (u32)device_id, (u32)device_rev);
3766 /*
3767 * The peer vAU value just read is the peer receiver value. HFI does
3768 * not support a transmit vAU of 0 (AU == 8). We advertised that
3769 * with Z=1 in the fabric capabilities sent to the peer. The peer
3770 * will see our Z=1, and, if it advertised a vAU of 0, will move its
3771 * receive to vAU of 1 (AU == 16). Do the same here. We do not care
3772 * about the peer Z value - our sent vAU is 3 (hardwired) and is not
3773 * subject to the Z value exception.
3774 */
3775 if (vau == 0)
3776 vau = 1;
3777 set_up_vl15(dd, vau, vl15buf);
3778
3779 /* set up the LCB CRC mode */
3780 crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
3781
3782 /* order is important: use the lowest bit in common */
3783 if (crc_mask & CAP_CRC_14B)
3784 crc_val = LCB_CRC_14B;
3785 else if (crc_mask & CAP_CRC_48B)
3786 crc_val = LCB_CRC_48B;
3787 else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
3788 crc_val = LCB_CRC_12B_16B_PER_LANE;
3789 else
3790 crc_val = LCB_CRC_16B;
3791
3792 dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
3793 write_csr(dd, DC_LCB_CFG_CRC_MODE,
3794 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
3795
3796 /* set (14b only) or clear sideband credit */
3797 reg = read_csr(dd, SEND_CM_CTRL);
3798 if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
3799 write_csr(dd, SEND_CM_CTRL,
3800 reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
3801 } else {
3802 write_csr(dd, SEND_CM_CTRL,
3803 reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
3804 }
3805
3806 ppd->link_speed_active = 0; /* invalid value */
3807 if (dd->dc8051_ver < dc8051_ver(0, 20)) {
3808 /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
3809 switch (remote_tx_rate) {
3810 case 0:
3811 ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
3812 break;
3813 case 1:
3814 ppd->link_speed_active = OPA_LINK_SPEED_25G;
3815 break;
3816 }
3817 } else {
3818 /* actual rate is highest bit of the ANDed rates */
3819 u8 rate = remote_tx_rate & ppd->local_tx_rate;
3820
3821 if (rate & 2)
3822 ppd->link_speed_active = OPA_LINK_SPEED_25G;
3823 else if (rate & 1)
3824 ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
3825 }
3826 if (ppd->link_speed_active == 0) {
3827 dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
3828 __func__, (int)remote_tx_rate);
3829 ppd->link_speed_active = OPA_LINK_SPEED_25G;
3830 }
3831
3832 /*
3833 * Cache the values of the supported, enabled, and active
3834 * LTP CRC modes to return in 'portinfo' queries. But the bit
3835 * flags that are returned in the portinfo query differ from
3836 * what's in the link_crc_mask, crc_sizes, and crc_val
3837 * variables. Convert these here.
3838 */
3839 ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
3840 /* supported crc modes */
3841 ppd->port_ltp_crc_mode |=
3842 cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
3843 /* enabled crc modes */
3844 ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
3845 /* active crc mode */
3846
3847 /* set up the remote credit return table */
3848 assign_remote_cm_au_table(dd, vcu);
3849
3850 /*
3851 * The LCB is reset on entry to handle_verify_cap(), so this must
3852 * be applied on every link up.
3853 *
3854 * Adjust LCB error kill enable to kill the link if
3855 * these RBUF errors are seen:
3856 * REPLAY_BUF_MBE_SMASK
3857 * FLIT_INPUT_BUF_MBE_SMASK
3858 */
3859 if (is_a0(dd)) { /* fixed in B0 */
3860 reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
3861 reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
3862 | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
3863 write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
3864 }
3865
3866 /* pull LCB fifos out of reset - all fifo clocks must be stable */
3867 write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
3868
3869 /* give 8051 access to the LCB CSRs */
3870 write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
3871 set_8051_lcb_access(dd);
3872
3873 ppd->neighbor_guid =
3874 read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
3875 ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
3876 DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
3877 ppd->neighbor_type =
3878 read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
3879 DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
3880 ppd->neighbor_fm_security =
3881 read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
3882 DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
3883 dd_dev_info(dd,
3884 "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
3885 ppd->neighbor_guid, ppd->neighbor_type,
3886 ppd->mgmt_allowed, ppd->neighbor_fm_security);
3887 if (ppd->mgmt_allowed)
3888 add_full_mgmt_pkey(ppd);
3889
3890 /* tell the 8051 to go to LinkUp */
3891 set_link_state(ppd, HLS_GOING_UP);
3892}
3893
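/*
 * Example of the LCB CRC negotiation in handle_verify_cap() above
 * (hypothetical capability masks): if both sides enable 14B and 48B,
 * crc_mask carries both bits and LCB_CRC_14B is chosen because it is
 * tested first ("lowest bit in common"); if the AND leaves none of the
 * 14B, 48B, or per-lane bits set, the code falls back to LCB_CRC_16B.
 */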
3894/*
3895 * Apply the link width downgrade enabled policy against the current active
3896 * link widths.
3897 *
3898 * Called when the enabled policy changes or the active link widths change.
3899 */
3900void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
3901{
3902 int skip = 1;
3903 int do_bounce = 0;
3904 u16 lwde = ppd->link_width_downgrade_enabled;
3905 u16 tx, rx;
3906
3907 mutex_lock(&ppd->hls_lock);
3908 /* only apply if the link is up */
3909 if (ppd->host_link_state & HLS_UP)
3910 skip = 0;
3911 mutex_unlock(&ppd->hls_lock);
3912 if (skip)
3913 return;
3914
3915 if (refresh_widths) {
3916 get_link_widths(ppd->dd, &tx, &rx);
3917 ppd->link_width_downgrade_tx_active = tx;
3918 ppd->link_width_downgrade_rx_active = rx;
3919 }
3920
3921 if (lwde == 0) {
3922 /* downgrade is disabled */
3923
3924 /* bounce if not at starting active width */
3925 if ((ppd->link_width_active !=
3926 ppd->link_width_downgrade_tx_active)
3927 || (ppd->link_width_active !=
3928 ppd->link_width_downgrade_rx_active)) {
3929 dd_dev_err(ppd->dd,
3930 "Link downgrade is disabled and link has downgraded, downing link\n");
3931 dd_dev_err(ppd->dd,
3932 " original 0x%x, tx active 0x%x, rx active 0x%x\n",
3933 ppd->link_width_active,
3934 ppd->link_width_downgrade_tx_active,
3935 ppd->link_width_downgrade_rx_active);
3936 do_bounce = 1;
3937 }
3938 } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
3939 || (lwde & ppd->link_width_downgrade_rx_active) == 0) {
3940 /* Tx or Rx is outside the enabled policy */
3941 dd_dev_err(ppd->dd,
3942 "Link is outside of downgrade allowed, downing link\n");
3943 dd_dev_err(ppd->dd,
3944 " enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
3945 lwde,
3946 ppd->link_width_downgrade_tx_active,
3947 ppd->link_width_downgrade_rx_active);
3948 do_bounce = 1;
3949 }
3950
3951 if (do_bounce) {
3952 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
3953 OPA_LINKDOWN_REASON_WIDTH_POLICY);
3954 set_link_state(ppd, HLS_DN_OFFLINE);
3955 start_link(ppd);
3956 }
3957}
3958
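/*
 * Example of the downgrade policy check above (hypothetical widths):
 * with link_width_downgrade_enabled allowing only OPA_LINK_WIDTH_4X, a
 * link whose transmit side drops to 3x makes
 * (lwde & link_width_downgrade_tx_active) == 0 and the link is bounced
 * with reason OPA_LINKDOWN_REASON_WIDTH_POLICY. With lwde == 0 the
 * policy is disabled and a bounce happens only if the active widths no
 * longer match the width recorded at link up.
 */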
3959/*
3960 * Handle a link downgrade interrupt from the 8051.
3961 *
3962 * This is a work-queue function outside of the interrupt.
3963 */
3964void handle_link_downgrade(struct work_struct *work)
3965{
3966 struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
3967 link_downgrade_work);
3968
3969 dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
3970 apply_link_downgrade_policy(ppd, 1);
3971}
3972
3973static char *dcc_err_string(char *buf, int buf_len, u64 flags)
3974{
3975 return flag_string(buf, buf_len, flags, dcc_err_flags,
3976 ARRAY_SIZE(dcc_err_flags));
3977}
3978
3979static char *lcb_err_string(char *buf, int buf_len, u64 flags)
3980{
3981 return flag_string(buf, buf_len, flags, lcb_err_flags,
3982 ARRAY_SIZE(lcb_err_flags));
3983}
3984
3985static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
3986{
3987 return flag_string(buf, buf_len, flags, dc8051_err_flags,
3988 ARRAY_SIZE(dc8051_err_flags));
3989}
3990
3991static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
3992{
3993 return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
3994 ARRAY_SIZE(dc8051_info_err_flags));
3995}
3996
3997static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
3998{
3999 return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
4000 ARRAY_SIZE(dc8051_info_host_msg_flags));
4001}
4002
4003static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
4004{
4005 struct hfi1_pportdata *ppd = dd->pport;
4006 u64 info, err, host_msg;
4007 int queue_link_down = 0;
4008 char buf[96];
4009
4010 /* look at the flags */
4011 if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
4012 /* 8051 information set by firmware */
4013 /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
4014 info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
4015 err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
4016 & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
4017 host_msg = (info >>
4018 DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
4019 & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
4020
4021 /*
4022 * Handle error flags.
4023 */
4024 if (err & FAILED_LNI) {
4025 /*
4026 * LNI error indications are cleared by the 8051
4027 * only when starting polling. Only pay attention
4028 * to them when in the states that occur during
4029 * LNI.
4030 */
4031 if (ppd->host_link_state
4032 & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
4033 queue_link_down = 1;
4034 dd_dev_info(dd, "Link error: %s\n",
4035 dc8051_info_err_string(buf,
4036 sizeof(buf),
4037 err & FAILED_LNI));
4038 }
4039 err &= ~(u64)FAILED_LNI;
4040 }
4041 if (err) {
4042 /* report remaining errors, but do not do anything */
4043 dd_dev_err(dd, "8051 info error: %s\n",
4044 dc8051_info_err_string(buf, sizeof(buf), err));
4045 }
4046
4047 /*
4048 * Handle host message flags.
4049 */
4050 if (host_msg & HOST_REQ_DONE) {
4051 /*
4052 * Presently, the driver does a busy wait for
4053 * host requests to complete. This is only an
4054 * informational message.
4055 * NOTE: The 8051 clears the host message
4056 * information *on the next 8051 command*.
4057 * Therefore, when linkup is achieved,
4058 * this flag will still be set.
4059 */
4060 host_msg &= ~(u64)HOST_REQ_DONE;
4061 }
4062 if (host_msg & BC_SMA_MSG) {
4063 queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
4064 host_msg &= ~(u64)BC_SMA_MSG;
4065 }
4066 if (host_msg & LINKUP_ACHIEVED) {
4067 dd_dev_info(dd, "8051: Link up\n");
4068 queue_work(ppd->hfi1_wq, &ppd->link_up_work);
4069 host_msg &= ~(u64)LINKUP_ACHIEVED;
4070 }
4071 if (host_msg & EXT_DEVICE_CFG_REQ) {
4072 handle_8051_request(dd);
4073 host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
4074 }
4075 if (host_msg & VERIFY_CAP_FRAME) {
4076 queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
4077 host_msg &= ~(u64)VERIFY_CAP_FRAME;
4078 }
4079 if (host_msg & LINK_GOING_DOWN) {
4080 const char *extra = "";
4081 /* no downgrade action needed if going down */
4082 if (host_msg & LINK_WIDTH_DOWNGRADED) {
4083 host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
4084 extra = " (ignoring downgrade)";
4085 }
4086 dd_dev_info(dd, "8051: Link down%s\n", extra);
4087 queue_link_down = 1;
4088 host_msg &= ~(u64)LINK_GOING_DOWN;
4089 }
4090 if (host_msg & LINK_WIDTH_DOWNGRADED) {
4091 queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
4092 host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
4093 }
4094 if (host_msg) {
4095 /* report remaining messages, but do not do anything */
4096 dd_dev_info(dd, "8051 info host message: %s\n",
4097 dc8051_info_host_msg_string(buf, sizeof(buf),
4098 host_msg));
4099 }
4100
4101 reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
4102 }
4103 if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
4104 /*
4105 * Lost the 8051 heartbeat. If this happens, we
4106 * receive constant interrupts about it. Disable
4107 * the interrupt after the first.
4108 */
4109 dd_dev_err(dd, "Lost 8051 heartbeat\n");
4110 write_csr(dd, DC_DC8051_ERR_EN,
4111 read_csr(dd, DC_DC8051_ERR_EN)
4112 & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
4113
4114 reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
4115 }
4116 if (reg) {
4117 /* report the error, but do not do anything */
4118 dd_dev_err(dd, "8051 error: %s\n",
4119 dc8051_err_string(buf, sizeof(buf), reg));
4120 }
4121
4122 if (queue_link_down) {
4123 /* if the link is already going down or disabled, do not
4124 * queue another */
4125 if ((ppd->host_link_state
4126 & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
4127 || ppd->link_enabled == 0) {
4128 dd_dev_info(dd, "%s: not queuing link down\n",
4129 __func__);
4130 } else {
4131 queue_work(ppd->hfi1_wq, &ppd->link_down_work);
4132 }
4133 }
4134}
4135
4136static const char * const fm_config_txt[] = {
4137[0] =
4138 "BadHeadDist: Distance violation between two head flits",
4139[1] =
4140 "BadTailDist: Distance violation between two tail flits",
4141[2] =
4142 "BadCtrlDist: Distance violation between two credit control flits",
4143[3] =
4144 "BadCrdAck: Credits return for unsupported VL",
4145[4] =
4146 "UnsupportedVLMarker: Received VL Marker",
4147[5] =
4148 "BadPreempt: Exceeded the preemption nesting level",
4149[6] =
4150 "BadControlFlit: Received unsupported control flit",
4151/* no 7 */
4152[8] =
4153 "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
4154};
4155
4156static const char * const port_rcv_txt[] = {
4157[1] =
4158 "BadPktLen: Illegal PktLen",
4159[2] =
4160 "PktLenTooLong: Packet longer than PktLen",
4161[3] =
4162 "PktLenTooShort: Packet shorter than PktLen",
4163[4] =
4164 "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
4165[5] =
4166 "BadDLID: Illegal DLID (0, doesn't match HFI)",
4167[6] =
4168 "BadL2: Illegal L2 opcode",
4169[7] =
4170 "BadSC: Unsupported SC",
4171[9] =
4172 "BadRC: Illegal RC",
4173[11] =
4174 "PreemptError: Preempting with same VL",
4175[12] =
4176 "PreemptVL15: Preempting a VL15 packet",
4177};
4178
4179#define OPA_LDR_FMCONFIG_OFFSET 16
4180#define OPA_LDR_PORTRCV_OFFSET 0
4181static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
4182{
4183 u64 info, hdr0, hdr1;
4184 const char *extra;
4185 char buf[96];
4186 struct hfi1_pportdata *ppd = dd->pport;
4187 u8 lcl_reason = 0;
4188 int do_bounce = 0;
4189
4190 if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
4191 if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
4192 info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
4193 dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
4194 /* set status bit */
4195 dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
4196 }
4197 reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
4198 }
4199
4200 if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
4201 struct hfi1_pportdata *ppd = dd->pport;
4202 /* this counter saturates at (2^32) - 1 */
4203 if (ppd->link_downed < (u32)UINT_MAX)
4204 ppd->link_downed++;
4205 reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
4206 }
4207
4208 if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
4209 u8 reason_valid = 1;
4210
4211 info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
4212 if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
4213 dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
4214 /* set status bit */
4215 dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
4216 }
4217 switch (info) {
4218 case 0:
4219 case 1:
4220 case 2:
4221 case 3:
4222 case 4:
4223 case 5:
4224 case 6:
4225 extra = fm_config_txt[info];
4226 break;
4227 case 8:
4228 extra = fm_config_txt[info];
4229 if (ppd->port_error_action &
4230 OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
4231 do_bounce = 1;
4232 /*
4233 * lcl_reason cannot be derived from info
4234 * for this error
4235 */
4236 lcl_reason =
4237 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
4238 }
4239 break;
4240 default:
4241 reason_valid = 0;
4242 snprintf(buf, sizeof(buf), "reserved%lld", info);
4243 extra = buf;
4244 break;
4245 }
4246
4247 if (reason_valid && !do_bounce) {
4248 do_bounce = ppd->port_error_action &
4249 (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
4250 lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
4251 }
4252
4253 /* just report this */
4254 dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
4255 reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
4256 }
4257
4258 if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
4259 u8 reason_valid = 1;
4260
4261 info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
4262 hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
4263 hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
4264 if (!(dd->err_info_rcvport.status_and_code &
4265 OPA_EI_STATUS_SMASK)) {
4266 dd->err_info_rcvport.status_and_code =
4267 info & OPA_EI_CODE_SMASK;
4268 /* set status bit */
4269 dd->err_info_rcvport.status_and_code |=
4270 OPA_EI_STATUS_SMASK;
4271 /* save first 2 flits in the packet that caused
4272 * the error */
4273 dd->err_info_rcvport.packet_flit1 = hdr0;
4274 dd->err_info_rcvport.packet_flit2 = hdr1;
4275 }
4276 switch (info) {
4277 case 1:
4278 case 2:
4279 case 3:
4280 case 4:
4281 case 5:
4282 case 6:
4283 case 7:
4284 case 9:
4285 case 11:
4286 case 12:
4287 extra = port_rcv_txt[info];
4288 break;
4289 default:
4290 reason_valid = 0;
4291 snprintf(buf, sizeof(buf), "reserved%lld", info);
4292 extra = buf;
4293 break;
4294 }
4295
4296 if (reason_valid && !do_bounce) {
4297 do_bounce = ppd->port_error_action &
4298 (1 << (OPA_LDR_PORTRCV_OFFSET + info));
4299 lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
4300 }
4301
4302 /* just report this */
4303 dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
4304 dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n",
4305 hdr0, hdr1);
4306
4307 reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
4308 }
4309
4310 if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
4311 /* informative only */
4312 dd_dev_info(dd, "8051 access to LCB blocked\n");
4313 reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
4314 }
4315 if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
4316 /* informative only */
4317 dd_dev_info(dd, "host access to LCB blocked\n");
4318 reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
4319 }
4320
4321 /* report any remaining errors */
4322 if (reg)
4323 dd_dev_info(dd, "DCC Error: %s\n",
4324 dcc_err_string(buf, sizeof(buf), reg));
4325
4326 if (lcl_reason == 0)
4327 lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
4328
4329 if (do_bounce) {
4330 dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
4331 set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
4332 queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
4333 }
4334}
4335
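/*
 * Example of the PortErrorAction test used above (hypothetical values):
 * a PortRcv error with info == 2 ("PktLenTooLong") bounces the link only
 * if bit (OPA_LDR_PORTRCV_OFFSET + 2) is set in ppd->port_error_action,
 * and the link down reason becomes OPA_LINKDOWN_REASON_RCV_ERROR_0 + 2.
 * FM config errors use the same scheme offset by OPA_LDR_FMCONFIG_OFFSET.
 */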
4336static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
4337{
4338 char buf[96];
4339
4340 dd_dev_info(dd, "LCB Error: %s\n",
4341 lcb_err_string(buf, sizeof(buf), reg));
4342}
4343
4344/*
4345 * CCE block DC interrupt. Source is < 8.
4346 */
4347static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
4348{
4349 const struct err_reg_info *eri = &dc_errs[source];
4350
4351 if (eri->handler) {
4352 interrupt_clear_down(dd, 0, eri);
4353 } else if (source == 3 /* dc_lbm_int */) {
4354 /*
4355 * This indicates that a parity error has occurred on the
4356 * address/control lines presented to the LBM. The error
4357 * is a single pulse, there is no associated error flag,
4358 * and it is non-maskable. This is because if a parity
4359 * error occurs on the request, the request is dropped.
4360 * This should never occur, but it is nice to know if it
4361 * ever does.
4362 */
4363 dd_dev_err(dd, "Parity error in DC LBM block\n");
4364 } else {
4365 dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
4366 }
4367}
4368
4369/*
4370 * TX block send credit interrupt. Source is < 160.
4371 */
4372static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
4373{
4374 sc_group_release_update(dd, source);
4375}
4376
4377/*
4378 * TX block SDMA interrupt. Source is < 48.
4379 *
4380 * SDMA interrupts are grouped by type:
4381 *
4382 * 0 - N-1 = SDma
4383 * N - 2N-1 = SDmaProgress
4384 * 2N - 3N-1 = SDmaIdle
4385 */
4386static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
4387{
4388 /* what interrupt */
4389 unsigned int what = source / TXE_NUM_SDMA_ENGINES;
4390 /* which engine */
4391 unsigned int which = source % TXE_NUM_SDMA_ENGINES;
4392
4393#ifdef CONFIG_SDMA_VERBOSITY
4394 dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
4395 slashstrip(__FILE__), __LINE__, __func__);
4396 sdma_dumpstate(&dd->per_sdma[which]);
4397#endif
4398
4399 if (likely(what < 3 && which < dd->num_sdma)) {
4400 sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
4401 } else {
4402 /* should not happen */
4403 dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
4404 }
4405}
4406
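/*
 * Example of the source decomposition above: the 0..47 source range is
 * three groups of TXE_NUM_SDMA_ENGINES (16, as implied by the grouping)
 * sources each, so source == 21 decodes to what == 21 / 16 == 1
 * (SDmaProgress) on engine which == 21 % 16 == 5.
 */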
4407/*
4408 * RX block receive available interrupt. Source is < 160.
4409 */
4410static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
4411{
4412 struct hfi1_ctxtdata *rcd;
4413 char *err_detail;
4414
4415 if (likely(source < dd->num_rcv_contexts)) {
4416 rcd = dd->rcd[source];
4417 if (rcd) {
4418 if (source < dd->first_user_ctxt)
4419 rcd->do_interrupt(rcd);
4420 else
4421 handle_user_interrupt(rcd);
4422 return; /* OK */
4423 }
4424 /* received an interrupt, but no rcd */
4425 err_detail = "dataless";
4426 } else {
4427 /* received an interrupt, but are not using that context */
4428 err_detail = "out of range";
4429 }
4430 dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
4431 err_detail, source);
4432}
4433
4434/*
4435 * RX block receive urgent interrupt. Source is < 160.
4436 */
4437static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
4438{
4439 struct hfi1_ctxtdata *rcd;
4440 char *err_detail;
4441
4442 if (likely(source < dd->num_rcv_contexts)) {
4443 rcd = dd->rcd[source];
4444 if (rcd) {
4445 /* only pay attention to user urgent interrupts */
4446 if (source >= dd->first_user_ctxt)
4447 handle_user_interrupt(rcd);
4448 return; /* OK */
4449 }
4450 /* received an interrupt, but no rcd */
4451 err_detail = "dataless";
4452 } else {
4453 /* received an interrupt, but are not using that context */
4454 err_detail = "out of range";
4455 }
4456 dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
4457 err_detail, source);
4458}
4459
4460/*
4461 * Reserved range interrupt. Should not be called in normal operation.
4462 */
4463static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
4464{
4465 char name[64];
4466
4467 dd_dev_err(dd, "unexpected %s interrupt\n",
4468 is_reserved_name(name, sizeof(name), source));
4469}
4470
4471static const struct is_table is_table[] = {
4472/* start end
4473 name func interrupt func */
4474{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END,
4475 is_misc_err_name, is_misc_err_int },
4476{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END,
4477 is_sdma_eng_err_name, is_sdma_eng_err_int },
4478{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
4479 is_sendctxt_err_name, is_sendctxt_err_int },
4480{ IS_SDMA_START, IS_SDMA_END,
4481 is_sdma_eng_name, is_sdma_eng_int },
4482{ IS_VARIOUS_START, IS_VARIOUS_END,
4483 is_various_name, is_various_int },
4484{ IS_DC_START, IS_DC_END,
4485 is_dc_name, is_dc_int },
4486{ IS_RCVAVAIL_START, IS_RCVAVAIL_END,
4487 is_rcv_avail_name, is_rcv_avail_int },
4488{ IS_RCVURGENT_START, IS_RCVURGENT_END,
4489 is_rcv_urgent_name, is_rcv_urgent_int },
4490{ IS_SENDCREDIT_START, IS_SENDCREDIT_END,
4491 is_send_credit_name, is_send_credit_int},
4492{ IS_RESERVED_START, IS_RESERVED_END,
4493 is_reserved_name, is_reserved_int},
4494};
4495
4496/*
4497 * Interrupt source interrupt - called when the given source has an interrupt.
4498 * Source is a bit index into an array of 64-bit integers.
4499 */
4500static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
4501{
4502 const struct is_table *entry;
4503
4504 /* avoids a double compare by walking the table in-order */
4505 for (entry = &is_table[0]; entry->is_name; entry++) {
4506 if (source < entry->end) {
4507 trace_hfi1_interrupt(dd, entry, source);
4508 entry->is_int(dd, source - entry->start);
4509 return;
4510 }
4511 }
4512 /* fell off the end */
4513 dd_dev_err(dd, "invalid interrupt source %u\n", source);
4514}
4515
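/*
 * Example of the is_table walk above: a source falling in
 * [IS_DC_START, IS_DC_END) stops at the IS_DC entry and is_dc_int() is
 * called with the source rebased to that range (source - IS_DC_START),
 * matching the "Source is < 8" style comments on the per-block handlers.
 */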
4516/*
4517 * General interrupt handler. This is able to correctly handle
4518 * all interrupts in case INTx is used.
4519 */
4520static irqreturn_t general_interrupt(int irq, void *data)
4521{
4522 struct hfi1_devdata *dd = data;
4523 u64 regs[CCE_NUM_INT_CSRS];
4524 u32 bit;
4525 int i;
4526
4527 this_cpu_inc(*dd->int_counter);
4528
4529 /* phase 1: scan and clear all handled interrupts */
4530 for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
4531 if (dd->gi_mask[i] == 0) {
4532 regs[i] = 0; /* used later */
4533 continue;
4534 }
4535 regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
4536 dd->gi_mask[i];
4537 /* only clear if anything is set */
4538 if (regs[i])
4539 write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
4540 }
4541
4542 /* phase 2: call the appropriate handler */
4543 for_each_set_bit(bit, (unsigned long *)&regs[0],
4544 CCE_NUM_INT_CSRS*64) {
4545 is_interrupt(dd, bit);
4546 }
4547
4548 return IRQ_HANDLED;
4549}
4550
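/*
 * Note on the bit numbering above: regs[] is scanned as one contiguous
 * field of CCE_NUM_INT_CSRS * 64 bits, so a flag at bit b of status CSR
 * i is passed to is_interrupt() as source i * 64 + b; for example
 * (hypothetical), bit 5 of the second status CSR becomes source 69.
 */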
4551static irqreturn_t sdma_interrupt(int irq, void *data)
4552{
4553 struct sdma_engine *sde = data;
4554 struct hfi1_devdata *dd = sde->dd;
4555 u64 status;
4556
4557#ifdef CONFIG_SDMA_VERBOSITY
4558 dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
4559 slashstrip(__FILE__), __LINE__, __func__);
4560 sdma_dumpstate(sde);
4561#endif
4562
4563 this_cpu_inc(*dd->int_counter);
4564
4565 /* This read_csr is really bad in the hot path */
4566 status = read_csr(dd,
4567 CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
4568 & sde->imask;
4569 if (likely(status)) {
4570 /* clear the interrupt(s) */
4571 write_csr(dd,
4572 CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
4573 status);
4574
4575 /* handle the interrupt(s) */
4576 sdma_engine_interrupt(sde, status);
4577 } else
4578 dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
4579 sde->this_idx);
4580
4581 return IRQ_HANDLED;
4582}
4583
4584/*
4585 * NOTE: this routine expects to be on its own MSI-X interrupt. If
4586 * multiple receive contexts share the same MSI-X interrupt, then this
4587 * routine must check for who received it.
4588 */
4589static irqreturn_t receive_context_interrupt(int irq, void *data)
4590{
4591 struct hfi1_ctxtdata *rcd = data;
4592 struct hfi1_devdata *dd = rcd->dd;
4593
4594 trace_hfi1_receive_interrupt(dd, rcd->ctxt);
4595 this_cpu_inc(*dd->int_counter);
4596
4597 /* clear the interrupt */
4598 write_csr(rcd->dd, CCE_INT_CLEAR + (8*rcd->ireg), rcd->imask);
4599
4600 /* handle the interrupt */
4601 rcd->do_interrupt(rcd);
4602
4603 return IRQ_HANDLED;
4604}
4605
4606/* ========================================================================= */
4607
4608u32 read_physical_state(struct hfi1_devdata *dd)
4609{
4610 u64 reg;
4611
4612 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
4613 return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
4614 & DC_DC8051_STS_CUR_STATE_PORT_MASK;
4615}
4616
4617static u32 read_logical_state(struct hfi1_devdata *dd)
4618{
4619 u64 reg;
4620
4621 reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
4622 return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
4623 & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
4624}
4625
4626static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
4627{
4628 u64 reg;
4629
4630 reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
4631 /* clear current state, set new state */
4632 reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
4633 reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
4634 write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
4635}
4636
4637/*
4638 * Use the 8051 to read a LCB CSR.
4639 */
4640static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
4641{
4642 u32 regno;
4643 int ret;
4644
4645 if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
4646 if (acquire_lcb_access(dd, 0) == 0) {
4647 *data = read_csr(dd, addr);
4648 release_lcb_access(dd, 0);
4649 return 0;
4650 }
4651 return -EBUSY;
4652 }
4653
4654 /* register is an index of LCB registers: (offset - base) / 8 */
4655 regno = (addr - DC_LCB_CFG_RUN) >> 3;
4656 ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
4657 if (ret != HCMD_SUCCESS)
4658 return -EBUSY;
4659 return 0;
4660}
4661
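/*
 * Example of the LCB register index computed above (hypothetical
 * offset): LCB CSRs are 8 bytes apart, so an address 0x18 bytes past
 * DC_LCB_CFG_RUN maps to regno == 0x18 >> 3 == 3, which is what the
 * HCMD_READ_LCB_CSR command expects in place of a raw CSR address.
 */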
4662/*
4663 * Read an LCB CSR. Access may not be in host control, so check.
4664 * Return 0 on success, -EBUSY on failure.
4665 */
4666int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
4667{
4668 struct hfi1_pportdata *ppd = dd->pport;
4669
4670 /* if up, go through the 8051 for the value */
4671 if (ppd->host_link_state & HLS_UP)
4672 return read_lcb_via_8051(dd, addr, data);
4673 /* if going up or down, no access */
4674 if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
4675 return -EBUSY;
4676 /* otherwise, host has access */
4677 *data = read_csr(dd, addr);
4678 return 0;
4679}
4680
4681/*
4682 * Use the 8051 to write a LCB CSR.
4683 */
4684static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
4685{
4686
4687 if (acquire_lcb_access(dd, 0) == 0) {
4688 write_csr(dd, addr, data);
4689 release_lcb_access(dd, 0);
4690 return 0;
4691 }
4692 return -EBUSY;
4693}
4694
4695/*
4696 * Write an LCB CSR. Access may not be in host control, so check.
4697 * Return 0 on success, -EBUSY on failure.
4698 */
4699int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
4700{
4701 struct hfi1_pportdata *ppd = dd->pport;
4702
4703 /* if up, go through the 8051 for the value */
4704 if (ppd->host_link_state & HLS_UP)
4705 return write_lcb_via_8051(dd, addr, data);
4706 /* if going up or down, no access */
4707 if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
4708 return -EBUSY;
4709 /* otherwise, host has access */
4710 write_csr(dd, addr, data);
4711 return 0;
4712}
4713
4714/*
4715 * Returns:
4716 * < 0 = Linux error, not able to get access
4717 * > 0 = 8051 command RETURN_CODE
4718 */
4719static int do_8051_command(
4720 struct hfi1_devdata *dd,
4721 u32 type,
4722 u64 in_data,
4723 u64 *out_data)
4724{
4725 u64 reg, completed;
4726 int return_code;
4727 unsigned long flags;
4728 unsigned long timeout;
4729
4730 hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
4731
4732 /*
4733 * Alternative to holding the lock for a long time:
4734 * - keep the busy wait and have other users bounce off
4735 */
4736 spin_lock_irqsave(&dd->dc8051_lock, flags);
4737
4738 /* We can't send any commands to the 8051 if it's in reset */
4739 if (dd->dc_shutdown) {
4740 return_code = -ENODEV;
4741 goto fail;
4742 }
4743
4744 /*
4745 * If an 8051 host command timed out previously, then the 8051 is
4746 * stuck.
4747 *
4748 * On first timeout, attempt to reset and restart the entire DC
4749 * block (including 8051). (Is this too big of a hammer?)
4750 *
4751 * If the 8051 times out a second time, the reset did not bring it
4752 * back to healthy life. In that case, fail any subsequent commands.
4753 */
4754 if (dd->dc8051_timed_out) {
4755 if (dd->dc8051_timed_out > 1) {
4756 dd_dev_err(dd,
4757 "Previous 8051 host command timed out, skipping command %u\n",
4758 type);
4759 return_code = -ENXIO;
4760 goto fail;
4761 }
4762 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
4763 dc_shutdown(dd);
4764 dc_start(dd);
4765 spin_lock_irqsave(&dd->dc8051_lock, flags);
4766 }
4767
4768 /*
4769 * If there is no timeout, then the 8051 command interface is
4770 * waiting for a command.
4771 */
4772
4773 /*
4774 * Do two writes: the first to stabilize the type and req_data, the
4775 * second to activate.
4776 */
4777 reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
4778 << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
4779 | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
4780 << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
4781 write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
4782 reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
4783 write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
4784
4785 /* wait for completion, alternate: interrupt */
4786 timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
4787 while (1) {
4788 reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
4789 completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
4790 if (completed)
4791 break;
4792 if (time_after(jiffies, timeout)) {
4793 dd->dc8051_timed_out++;
4794 dd_dev_err(dd, "8051 host command %u timeout\n", type);
4795 if (out_data)
4796 *out_data = 0;
4797 return_code = -ETIMEDOUT;
4798 goto fail;
4799 }
4800 udelay(2);
4801 }
4802
4803 if (out_data) {
4804 *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
4805 & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
4806 if (type == HCMD_READ_LCB_CSR) {
4807 /* top 16 bits are in a different register */
4808 *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
4809 & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
4810 << (48
4811 - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
4812 }
4813 }
4814 return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
4815 & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
4816 dd->dc8051_timed_out = 0;
4817 /*
4818 * Clear command for next user.
4819 */
4820 write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
4821
4822fail:
4823 spin_unlock_irqrestore(&dd->dc8051_lock, flags);
4824
4825 return return_code;
4826}
4827
4828static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
4829{
4830 return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
4831}
4832
4833static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
4834 u8 lane_id, u32 config_data)
4835{
4836 u64 data;
4837 int ret;
4838
4839 data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
4840 | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
4841 | (u64)config_data << LOAD_DATA_DATA_SHIFT;
4842 ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
4843 if (ret != HCMD_SUCCESS) {
4844 dd_dev_err(dd,
4845 "load 8051 config: field id %d, lane %d, err %d\n",
4846 (int)field_id, (int)lane_id, ret);
4847 }
4848 return ret;
4849}
4850
4851/*
4852 * Read the 8051 firmware "registers". Use the RAM directly. Always
4853 * set the result, even on error.
4854 * Return 0 on success, -errno on failure
4855 */
4856static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
4857 u32 *result)
4858{
4859 u64 big_data;
4860 u32 addr;
4861 int ret;
4862
4863 /* address start depends on the lane_id */
4864 if (lane_id < 4)
4865 addr = (4 * NUM_GENERAL_FIELDS)
4866 + (lane_id * 4 * NUM_LANE_FIELDS);
4867 else
4868 addr = 0;
4869 addr += field_id * 4;
4870
4871 /* read is in 8-byte chunks, hardware will truncate the address down */
4872 ret = read_8051_data(dd, addr, 8, &big_data);
4873
4874 if (ret == 0) {
4875 /* extract the 4 bytes we want */
4876 if (addr & 0x4)
4877 *result = (u32)(big_data >> 32);
4878 else
4879 *result = (u32)big_data;
4880 } else {
4881 *result = 0;
4882 dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
4883 __func__, lane_id, field_id);
4884 }
4885
4886 return ret;
4887}
4888
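/*
 * Example of the 8051 RAM addressing used above (hypothetical field):
 * for a general field (lane_id >= 4, so the lane base is 0),
 * field_id == 3 gives addr == 12; the 8-byte read is truncated down to
 * offset 8 and, because addr & 0x4 is set, the upper 32 bits of
 * big_data are returned as the result.
 */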
4889static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
4890 u8 continuous)
4891{
4892 u32 frame;
4893
4894 frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
4895 | power_management << POWER_MANAGEMENT_SHIFT;
4896 return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
4897 GENERAL_CONFIG, frame);
4898}
4899
4900static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
4901 u16 vl15buf, u8 crc_sizes)
4902{
4903 u32 frame;
4904
4905 frame = (u32)vau << VAU_SHIFT
4906 | (u32)z << Z_SHIFT
4907 | (u32)vcu << VCU_SHIFT
4908 | (u32)vl15buf << VL15BUF_SHIFT
4909 | (u32)crc_sizes << CRC_SIZES_SHIFT;
4910 return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
4911 GENERAL_CONFIG, frame);
4912}
4913
4914static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
4915 u8 *flag_bits, u16 *link_widths)
4916{
4917 u32 frame;
4918
4919 read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
4920 &frame);
4921 *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
4922 *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
4923 *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
4924}
4925
4926static int write_vc_local_link_width(struct hfi1_devdata *dd,
4927 u8 misc_bits,
4928 u8 flag_bits,
4929 u16 link_widths)
4930{
4931 u32 frame;
4932
4933 frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
4934 | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
4935 | (u32)link_widths << LINK_WIDTH_SHIFT;
4936 return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
4937 frame);
4938}
4939
4940static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
4941 u8 device_rev)
4942{
4943 u32 frame;
4944
4945 frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
4946 | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
4947 return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
4948}
4949
4950static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
4951 u8 *device_rev)
4952{
4953 u32 frame;
4954
4955 read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
4956 *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
4957 *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
4958 & REMOTE_DEVICE_REV_MASK;
4959}
4960
4961void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
4962{
4963 u32 frame;
4964
4965 read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
4966 *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
4967 *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
4968}
4969
4970static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
4971 u8 *continuous)
4972{
4973 u32 frame;
4974
4975 read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
4976 *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
4977 & POWER_MANAGEMENT_MASK;
4978 *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
4979 & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
4980}
4981
4982static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
4983 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
4984{
4985 u32 frame;
4986
4987 read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
4988 *vau = (frame >> VAU_SHIFT) & VAU_MASK;
4989 *z = (frame >> Z_SHIFT) & Z_MASK;
4990 *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
4991 *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
4992 *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
4993}
4994
4995static void read_vc_remote_link_width(struct hfi1_devdata *dd,
4996 u8 *remote_tx_rate,
4997 u16 *link_widths)
4998{
4999 u32 frame;
5000
5001 read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
5002 &frame);
5003 *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
5004 & REMOTE_TX_RATE_MASK;
5005 *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
5006}
5007
5008static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
5009{
5010 u32 frame;
5011
5012 read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
5013 *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
5014}
5015
5016static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
5017{
5018 u32 frame;
5019
5020 read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
5021 *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
5022}
5023
5024static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
5025{
5026 read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
5027}
5028
5029static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
5030{
5031 read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
5032}
5033
5034void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
5035{
5036 u32 frame;
5037 int ret;
5038
5039 *link_quality = 0;
5040 if (dd->pport->host_link_state & HLS_UP) {
5041 ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
5042 &frame);
5043 if (ret == 0)
5044 *link_quality = (frame >> LINK_QUALITY_SHIFT)
5045 & LINK_QUALITY_MASK;
5046 }
5047}
5048
5049static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
5050{
5051 u32 frame;
5052
5053 read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
5054 *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
5055}
5056
5057static int read_tx_settings(struct hfi1_devdata *dd,
5058 u8 *enable_lane_tx,
5059 u8 *tx_polarity_inversion,
5060 u8 *rx_polarity_inversion,
5061 u8 *max_rate)
5062{
5063 u32 frame;
5064 int ret;
5065
5066 ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
5067 *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
5068 & ENABLE_LANE_TX_MASK;
5069 *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
5070 & TX_POLARITY_INVERSION_MASK;
5071 *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
5072 & RX_POLARITY_INVERSION_MASK;
5073 *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
5074 return ret;
5075}
5076
5077static int write_tx_settings(struct hfi1_devdata *dd,
5078 u8 enable_lane_tx,
5079 u8 tx_polarity_inversion,
5080 u8 rx_polarity_inversion,
5081 u8 max_rate)
5082{
5083 u32 frame;
5084
5085 /* no need to mask, all variable sizes match field widths */
5086 frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
5087 | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
5088 | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
5089 | max_rate << MAX_RATE_SHIFT;
5090 return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
5091}
5092
5093static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
5094{
5095 u32 frame, version, prod_id;
5096 int ret, lane;
5097
5098 /* 4 lanes */
5099 for (lane = 0; lane < 4; lane++) {
5100 ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
5101 if (ret) {
5102 dd_dev_err(
5103 dd,
5104 "Unable to read lane %d firmware details\n",
5105 lane);
5106 continue;
5107 }
5108 version = (frame >> SPICO_ROM_VERSION_SHIFT)
5109 & SPICO_ROM_VERSION_MASK;
5110 prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
5111 & SPICO_ROM_PROD_ID_MASK;
5112 dd_dev_info(dd,
5113 "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
5114 lane, version, prod_id);
5115 }
5116}
5117
5118/*
5119 * Read an idle LCB message.
5120 *
5121 * Returns 0 on success, -EINVAL on error
5122 */
5123static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
5124{
5125 int ret;
5126
5127 ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
5128 type, data_out);
5129 if (ret != HCMD_SUCCESS) {
5130 dd_dev_err(dd, "read idle message: type %d, err %d\n",
5131 (u32)type, ret);
5132 return -EINVAL;
5133 }
5134 dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
5135 /* return only the payload as we already know the type */
5136 *data_out >>= IDLE_PAYLOAD_SHIFT;
5137 return 0;
5138}
5139
5140/*
5141 * Read an idle SMA message. To be done in response to a notification from
5142 * the 8051.
5143 *
5144 * Returns 0 on success, -EINVAL on error
5145 */
5146static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
5147{
5148 return read_idle_message(dd,
5149 (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
5150}
5151
5152/*
5153 * Send an idle LCB message.
5154 *
5155 * Returns 0 on success, -EINVAL on error
5156 */
5157static int send_idle_message(struct hfi1_devdata *dd, u64 data)
5158{
5159 int ret;
5160
5161 dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
5162 ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
5163 if (ret != HCMD_SUCCESS) {
5164 dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
5165 data, ret);
5166 return -EINVAL;
5167 }
5168 return 0;
5169}
5170
5171/*
5172 * Send an idle SMA message.
5173 *
5174 * Returns 0 on success, -EINVAL on error
5175 */
5176int send_idle_sma(struct hfi1_devdata *dd, u64 message)
5177{
5178 u64 data;
5179
5180 data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
5181 | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
5182 return send_idle_message(dd, data);
5183}
5184
5185/*
5186 * Initialize the LCB then do a quick link up. This may or may not be
5187 * in loopback.
5188 *
5189 * return 0 on success, -errno on error
5190 */
5191static int do_quick_linkup(struct hfi1_devdata *dd)
5192{
5193 u64 reg;
5194 unsigned long timeout;
5195 int ret;
5196
5197 lcb_shutdown(dd, 0);
5198
5199 if (loopback) {
5200 /* LCB_CFG_LOOPBACK.VAL = 2 */
5201 /* LCB_CFG_LANE_WIDTH.VAL = 0 */
5202 write_csr(dd, DC_LCB_CFG_LOOPBACK,
5203 IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
5204 write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
5205 }
5206
5207 /* start the LCBs */
5208 /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
5209 write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
5210
5211 /* simulator only loopback steps */
5212 if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
5213 /* LCB_CFG_RUN.EN = 1 */
5214 write_csr(dd, DC_LCB_CFG_RUN,
5215 1ull << DC_LCB_CFG_RUN_EN_SHIFT);
5216
5217 /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
5218 timeout = jiffies + msecs_to_jiffies(10);
5219 while (1) {
5220 reg = read_csr(dd,
5221 DC_LCB_STS_LINK_TRANSFER_ACTIVE);
5222 if (reg)
5223 break;
5224 if (time_after(jiffies, timeout)) {
5225 dd_dev_err(dd,
5226 "timeout waiting for LINK_TRANSFER_ACTIVE\n");
5227 return -ETIMEDOUT;
5228 }
5229 udelay(2);
5230 }
5231
5232 write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
5233 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
5234 }
5235
5236 if (!loopback) {
5237 /*
5238 * When doing quick linkup and not in loopback, both
5239 * sides must be done with LCB set-up before either
5240 * starts the quick linkup. Put a delay here so that
5241 * both sides can be started and have a chance to be
5242 * done with LCB set up before resuming.
5243 */
5244 dd_dev_err(dd,
5245 "Pausing for peer to be finished with LCB set up\n");
5246 msleep(5000);
5247 dd_dev_err(dd,
5248 "Continuing with quick linkup\n");
5249 }
5250
5251 write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
5252 set_8051_lcb_access(dd);
5253
5254 /*
5255 * State "quick" LinkUp request sets the physical link state to
5256 * LinkUp without a verify capability sequence.
5257 * This state is in simulator v37 and later.
5258 */
5259 ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
5260 if (ret != HCMD_SUCCESS) {
5261 dd_dev_err(dd,
5262 "%s: set physical link state to quick LinkUp failed with return %d\n",
5263 __func__, ret);
5264
5265 set_host_lcb_access(dd);
5266 write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
5267
5268 if (ret >= 0)
5269 ret = -EINVAL;
5270 return ret;
5271 }
5272
5273 return 0; /* success */
5274}
5275
5276/*
5277 * Set the SerDes to internal loopback mode.
5278 * Returns 0 on success, -errno on error.
5279 */
5280static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
5281{
5282 int ret;
5283
5284 ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
5285 if (ret == HCMD_SUCCESS)
5286 return 0;
5287 dd_dev_err(dd,
5288 "Set physical link state to SerDes Loopback failed with return %d\n",
5289 ret);
5290 if (ret >= 0)
5291 ret = -EINVAL;
5292 return ret;
5293}
5294
5295/*
5296 * Do all special steps to set up loopback.
5297 */
5298static int init_loopback(struct hfi1_devdata *dd)
5299{
5300 dd_dev_info(dd, "Entering loopback mode\n");
5301
5302 /* all loopbacks should disable self GUID check */
5303 write_csr(dd, DC_DC8051_CFG_MODE,
5304 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
5305
5306 /*
5307 * The simulator has only one loopback option - LCB. Switch
5308 * to that option, which includes quick link up.
5309 *
5310 * Accept all valid loopback values.
5311 */
5312 if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
5313 && (loopback == LOOPBACK_SERDES
5314 || loopback == LOOPBACK_LCB
5315 || loopback == LOOPBACK_CABLE)) {
5316 loopback = LOOPBACK_LCB;
5317 quick_linkup = 1;
5318 return 0;
5319 }
5320
5321 /* handle serdes loopback */
5322 if (loopback == LOOPBACK_SERDES) {
5323 /* internal serdes loopback needs quick linkup on RTL */
5324 if (dd->icode == ICODE_RTL_SILICON)
5325 quick_linkup = 1;
5326 return set_serdes_loopback_mode(dd);
5327 }
5328
5329 /* LCB loopback - handled at poll time */
5330 if (loopback == LOOPBACK_LCB) {
5331 quick_linkup = 1; /* LCB is always quick linkup */
5332
5333 /* not supported in emulation due to emulation RTL changes */
5334 if (dd->icode == ICODE_FPGA_EMULATION) {
5335 dd_dev_err(dd,
5336 "LCB loopback not supported in emulation\n");
5337 return -EINVAL;
5338 }
5339 return 0;
5340 }
5341
5342 /* external cable loopback requires no extra steps */
5343 if (loopback == LOOPBACK_CABLE)
5344 return 0;
5345
5346 dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
5347 return -EINVAL;
5348}
5349
5350/*
5351 * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
5352 * used in the Verify Capability link width attribute.
5353 */
5354static u16 opa_to_vc_link_widths(u16 opa_widths)
5355{
5356 int i;
5357 u16 result = 0;
5358
5359 static const struct link_bits {
5360 u16 from;
5361 u16 to;
5362 } opa_link_xlate[] = {
5363 { OPA_LINK_WIDTH_1X, 1 << (1-1) },
5364 { OPA_LINK_WIDTH_2X, 1 << (2-1) },
5365 { OPA_LINK_WIDTH_3X, 1 << (3-1) },
5366 { OPA_LINK_WIDTH_4X, 1 << (4-1) },
5367 };
5368
5369 for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
5370 if (opa_widths & opa_link_xlate[i].from)
5371 result |= opa_link_xlate[i].to;
5372 }
5373 return result;
5374}
5375
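/*
 * Example translation for opa_to_vc_link_widths() (hypothetical FM
 * setting): opa_widths == (OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_4X) maps
 * through the table to (1 << 0) | (1 << 3) == 0x9 in the Verify
 * Capability link width encoding.
 */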
5376/*
5377 * Set link attributes before moving to polling.
5378 */
5379static int set_local_link_attributes(struct hfi1_pportdata *ppd)
5380{
5381 struct hfi1_devdata *dd = ppd->dd;
5382 u8 enable_lane_tx;
5383 u8 tx_polarity_inversion;
5384 u8 rx_polarity_inversion;
5385 int ret;
5386
5387 /* reset our fabric serdes to clear any lingering problems */
5388 fabric_serdes_reset(dd);
5389
5390 /* set the local tx rate - need to read-modify-write */
5391 ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
5392 &rx_polarity_inversion, &ppd->local_tx_rate);
5393 if (ret)
5394 goto set_local_link_attributes_fail;
5395
5396 if (dd->dc8051_ver < dc8051_ver(0, 20)) {
5397 /* set the tx rate to the fastest enabled */
5398 if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
5399 ppd->local_tx_rate = 1;
5400 else
5401 ppd->local_tx_rate = 0;
5402 } else {
5403 /* set the tx rate to all enabled */
5404 ppd->local_tx_rate = 0;
5405 if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
5406 ppd->local_tx_rate |= 2;
5407 if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
5408 ppd->local_tx_rate |= 1;
5409 }
5410 ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
5411 rx_polarity_inversion, ppd->local_tx_rate);
5412 if (ret != HCMD_SUCCESS)
5413 goto set_local_link_attributes_fail;
5414
5415 /*
5416 * DC supports continuous updates.
5417 */
5418 ret = write_vc_local_phy(dd, 0 /* no power management */,
5419 1 /* continuous updates */);
5420 if (ret != HCMD_SUCCESS)
5421 goto set_local_link_attributes_fail;
5422
5423 /* z=1 in the next call: AU of 0 is not supported by the hardware */
5424 ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
5425 ppd->port_crc_mode_enabled);
5426 if (ret != HCMD_SUCCESS)
5427 goto set_local_link_attributes_fail;
5428
5429 ret = write_vc_local_link_width(dd, 0, 0,
5430 opa_to_vc_link_widths(ppd->link_width_enabled));
5431 if (ret != HCMD_SUCCESS)
5432 goto set_local_link_attributes_fail;
5433
5434 /* let peer know who we are */
5435 ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
5436 if (ret == HCMD_SUCCESS)
5437 return 0;
5438
5439set_local_link_attributes_fail:
5440 dd_dev_err(dd,
5441 "Failed to set local link attributes, return 0x%x\n",
5442 ret);
5443 return ret;
5444}
5445
5446/*
5447 * Call this to start the link. Schedule a retry if the cable is not
5448 * present or if unable to start polling. Do not do anything if the
5449 * link is disabled. Returns 0 if link is disabled or moved to polling
5450 */
5451int start_link(struct hfi1_pportdata *ppd)
5452{
5453 if (!ppd->link_enabled) {
5454 dd_dev_info(ppd->dd,
5455 "%s: stopping link start because link is disabled\n",
5456 __func__);
5457 return 0;
5458 }
5459 if (!ppd->driver_link_ready) {
5460 dd_dev_info(ppd->dd,
5461 "%s: stopping link start because driver is not ready\n",
5462 __func__);
5463 return 0;
5464 }
5465
5466 if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
5467 loopback == LOOPBACK_LCB ||
5468 ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
5469 return set_link_state(ppd, HLS_DN_POLL);
5470
5471 dd_dev_info(ppd->dd,
5472 "%s: stopping link start because no cable is present\n",
5473 __func__);
5474 return -EAGAIN;
5475}
5476
5477static void reset_qsfp(struct hfi1_pportdata *ppd)
5478{
5479 struct hfi1_devdata *dd = ppd->dd;
5480 u64 mask, qsfp_mask;
5481
5482 mask = (u64)QSFP_HFI0_RESET_N;
5483 qsfp_mask = read_csr(dd,
5484 dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
5485 qsfp_mask |= mask;
5486 write_csr(dd,
5487 dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
5488 qsfp_mask);
5489
5490 qsfp_mask = read_csr(dd,
5491 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
5492 qsfp_mask &= ~mask;
5493 write_csr(dd,
5494 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
5495 qsfp_mask);
5496
5497 udelay(10);
5498
5499 qsfp_mask |= mask;
5500 write_csr(dd,
5501 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
5502 qsfp_mask);
5503}
5504
5505static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
5506 u8 *qsfp_interrupt_status)
5507{
5508 struct hfi1_devdata *dd = ppd->dd;
5509
5510 if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
5511 (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
5512 dd_dev_info(dd,
5513 "%s: QSFP cable on fire\n",
5514 __func__);
5515
5516 if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
5517 (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
5518 dd_dev_info(dd,
5519 "%s: QSFP cable temperature too low\n",
5520 __func__);
5521
5522 if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
5523 (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
5524 dd_dev_info(dd,
5525 "%s: QSFP supply voltage too high\n",
5526 __func__);
5527
5528 if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
5529 (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
5530 dd_dev_info(dd,
5531 "%s: QSFP supply voltage too low\n",
5532 __func__);
5533
5534 /* Byte 2 is vendor specific */
5535
5536 if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
5537 (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
5538 dd_dev_info(dd,
5539 "%s: Cable RX channel 1/2 power too high\n",
5540 __func__);
5541
5542 if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
5543 (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
5544 dd_dev_info(dd,
5545 "%s: Cable RX channel 1/2 power too low\n",
5546 __func__);
5547
5548 if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
5549 (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
5550 dd_dev_info(dd,
5551 "%s: Cable RX channel 3/4 power too high\n",
5552 __func__);
5553
5554 if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
5555 (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
5556 dd_dev_info(dd,
5557 "%s: Cable RX channel 3/4 power too low\n",
5558 __func__);
5559
5560 if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
5561 (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
5562 dd_dev_info(dd,
5563 "%s: Cable TX channel 1/2 bias too high\n",
5564 __func__);
5565
5566 if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
5567 (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
5568 dd_dev_info(dd,
5569 "%s: Cable TX channel 1/2 bias too low\n",
5570 __func__);
5571
5572 if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
5573 (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
5574 dd_dev_info(dd,
5575 "%s: Cable TX channel 3/4 bias too high\n",
5576 __func__);
5577
5578 if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
5579 (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
5580 dd_dev_info(dd,
5581 "%s: Cable TX channel 3/4 bias too low\n",
5582 __func__);
5583
5584 if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
5585 (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
5586 dd_dev_info(dd,
5587 "%s: Cable TX channel 1/2 power too high\n",
5588 __func__);
5589
5590 if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
5591 (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
5592 dd_dev_info(dd,
5593 "%s: Cable TX channel 1/2 power too low\n",
5594 __func__);
5595
5596 if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
5597 (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
5598 dd_dev_info(dd,
5599 "%s: Cable TX channel 3/4 power too high\n",
5600 __func__);
5601
5602 if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
5603 (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
5604 dd_dev_info(dd,
5605 "%s: Cable TX channel 3/4 power too low\n",
5606 __func__);
5607
5608 /* Bytes 9-10 and 11-12 are reserved */
5609 /* Bytes 13-15 are vendor specific */
5610
5611 return 0;
5612}
5613
5614static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
5615{
5616 refresh_qsfp_cache(ppd, &ppd->qsfp_info);
5617
5618 return 0;
5619}
5620
5621static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
5622{
5623 struct hfi1_devdata *dd = ppd->dd;
5624 u8 qsfp_interrupt_status = 0;
5625
5626 if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
5627 != 1) {
5628 dd_dev_info(dd,
5629 "%s: Failed to read status of QSFP module\n",
5630 __func__);
5631 return -EIO;
5632 }
5633
5634 /* We don't care about alarms & warnings with a non-functional INT_N */
5635 if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
5636 do_pre_lni_host_behaviors(ppd);
5637
5638 return 0;
5639}
5640
5641/* This routine will only be scheduled if the QSFP module is present */
5642static void qsfp_event(struct work_struct *work)
5643{
5644 struct qsfp_data *qd;
5645 struct hfi1_pportdata *ppd;
5646 struct hfi1_devdata *dd;
5647
5648 qd = container_of(work, struct qsfp_data, qsfp_work);
5649 ppd = qd->ppd;
5650 dd = ppd->dd;
5651
5652 /* Sanity check */
5653 if (!qsfp_mod_present(ppd))
5654 return;
5655
5656 /*
5657 * Turn DC back on after the cable has been
5658 * re-inserted. Up until now, the DC has been in
5659 * reset to save power.
5660 */
5661 dc_start(dd);
5662
5663 if (qd->cache_refresh_required) {
5664 msleep(3000);
5665 reset_qsfp(ppd);
5666
5667 /*
5668 * Check for QSFP interrupt after t_init (SFF 8679) plus some extra time.
5669 */
5670 msleep(3000);
5671 if (!qd->qsfp_interrupt_functional) {
5672 if (do_qsfp_intr_fallback(ppd) < 0)
5673 dd_dev_info(dd, "%s: QSFP fallback failed\n",
5674 __func__);
5675 ppd->driver_link_ready = 1;
5676 start_link(ppd);
5677 }
5678 }
5679
5680 if (qd->check_interrupt_flags) {
5681 u8 qsfp_interrupt_status[16] = {0,};
5682
5683 if (qsfp_read(ppd, dd->hfi1_id, 6,
5684 &qsfp_interrupt_status[0], 16) != 16) {
5685 dd_dev_info(dd,
5686 "%s: Failed to read status of QSFP module\n",
5687 __func__);
5688 } else {
5689 unsigned long flags;
5690 u8 data_status;
5691
5692 spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
5693 ppd->qsfp_info.check_interrupt_flags = 0;
5694 spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
5695 flags);
5696
5697 if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
5698 != 1) {
5699 dd_dev_info(dd,
5700 "%s: Failed to read status of QSFP module\n",
5701 __func__);
5702 }
5703 if (!(data_status & QSFP_DATA_NOT_READY)) {
5704 do_pre_lni_host_behaviors(ppd);
5705 start_link(ppd);
5706 } else
5707 handle_qsfp_error_conditions(ppd,
5708 qsfp_interrupt_status);
5709 }
5710 }
5711}
5712
5713void init_qsfp(struct hfi1_pportdata *ppd)
5714{
5715 struct hfi1_devdata *dd = ppd->dd;
5716 u64 qsfp_mask;
5717
5718 if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
5719 ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
5720 !HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
5721 ppd->driver_link_ready = 1;
5722 return;
5723 }
5724
5725 ppd->qsfp_info.ppd = ppd;
5726 INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
5727
5728 qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
5729 /* Clear current status to avoid spurious interrupts */
5730 write_csr(dd,
5731 dd->hfi1_id ?
5732 ASIC_QSFP2_CLEAR :
5733 ASIC_QSFP1_CLEAR,
5734 qsfp_mask);
5735
5736 /* Handle active low nature of INT_N and MODPRST_N pins */
5737 if (qsfp_mod_present(ppd))
5738 qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
5739 write_csr(dd,
5740 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
5741 qsfp_mask);
5742
5743 /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
5744 qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
5745 write_csr(dd,
5746 dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
5747 qsfp_mask);
5748
5749 if (qsfp_mod_present(ppd)) {
5750 msleep(3000);
5751 reset_qsfp(ppd);
5752
5753 /*
5754 * Check for QSFP interrupt after t_init (SFF 8679) plus some extra time.
5755 */
5756 msleep(3000);
5757 if (!ppd->qsfp_info.qsfp_interrupt_functional) {
5758 if (do_qsfp_intr_fallback(ppd) < 0)
5759 dd_dev_info(dd,
5760 "%s: QSFP fallback failed\n",
5761 __func__);
5762 ppd->driver_link_ready = 1;
5763 }
5764 }
5765}
5766
5767int bringup_serdes(struct hfi1_pportdata *ppd)
5768{
5769 struct hfi1_devdata *dd = ppd->dd;
5770 u64 guid;
5771 int ret;
5772
5773 if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
5774 add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
5775
5776 guid = ppd->guid;
5777 if (!guid) {
5778 if (dd->base_guid)
5779 guid = dd->base_guid + ppd->port - 1;
5780 ppd->guid = guid;
5781 }
5782
5783 /* the link defaults to enabled */
5784 ppd->link_enabled = 1;
5785 /* Set linkinit_reason on power up per OPA spec */
5786 ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
5787
5788 if (loopback) {
5789 ret = init_loopback(dd);
5790 if (ret < 0)
5791 return ret;
5792 }
5793
5794 return start_link(ppd);
5795}
5796
5797void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
5798{
5799 struct hfi1_devdata *dd = ppd->dd;
5800
5801 /*
5802 * Shut down the link and keep it down. First clear the flag that
5803 * says the driver wants to allow the link to be up (driver_link_ready).
5804 * Then make sure the link is not automatically restarted
5805 * (link_enabled). Cancel any pending restart. And finally
5806 * go offline.
5807 */
5808 ppd->driver_link_ready = 0;
5809 ppd->link_enabled = 0;
5810
5811 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
5812 OPA_LINKDOWN_REASON_SMA_DISABLED);
5813 set_link_state(ppd, HLS_DN_OFFLINE);
5814
5815 /* disable the port */
5816 clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
5817}
5818
5819static inline int init_cpu_counters(struct hfi1_devdata *dd)
5820{
5821 struct hfi1_pportdata *ppd;
5822 int i;
5823
5824 ppd = (struct hfi1_pportdata *)(dd + 1);
5825 for (i = 0; i < dd->num_pports; i++, ppd++) {
5826 ppd->ibport_data.rc_acks = NULL;
5827 ppd->ibport_data.rc_qacks = NULL;
5828 ppd->ibport_data.rc_acks = alloc_percpu(u64);
5829 ppd->ibport_data.rc_qacks = alloc_percpu(u64);
5830 ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
5831 if ((ppd->ibport_data.rc_acks == NULL) ||
5832 (ppd->ibport_data.rc_delayed_comp == NULL) ||
5833 (ppd->ibport_data.rc_qacks == NULL))
5834 return -ENOMEM;
5835 }
5836
5837 return 0;
5838}
5839
5840static const char * const pt_names[] = {
5841 "expected",
5842 "eager",
5843 "invalid"
5844};
5845
5846static const char *pt_name(u32 type)
5847{
5848 return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
5849}
5850
5851/*
5852 * index is the index into the receive array
5853 */
5854void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
5855 u32 type, unsigned long pa, u16 order)
5856{
5857 u64 reg;
5858 void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
5859 (dd->kregbase + RCV_ARRAY));
5860
5861 if (!(dd->flags & HFI1_PRESENT))
5862 goto done;
5863
5864 if (type == PT_INVALID) {
5865 pa = 0;
5866 } else if (type > PT_INVALID) {
5867 dd_dev_err(dd,
5868 "unexpected receive array type %u for index %u, not handled\n",
5869 type, index);
5870 goto done;
5871 }
5872
5873 hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
5874 pt_name(type), index, pa, (unsigned long)order);
5875
5876#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */
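	/*
	 * The DMA address is programmed as a 4 KB-aligned index: pa is
	 * shifted down by RT_ADDR_SHIFT before being masked into the
	 * address field of the receive array entry below.
	 */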
5877 reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
5878 | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
5879 | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
5880 << RCV_ARRAY_RT_ADDR_SHIFT;
5881 writeq(reg, base + (index * 8));
5882
5883 if (type == PT_EAGER)
5884 /*
5885 * Eager entries are written one-by-one so we have to push them
5886 * after we write the entry.
5887 */
5888 flush_wc();
5889done:
5890 return;
5891}
5892
5893void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
5894{
5895 struct hfi1_devdata *dd = rcd->dd;
5896 u32 i;
5897
5898 /* this could be optimized */
5899 for (i = rcd->eager_base; i < rcd->eager_base +
5900 rcd->egrbufs.alloced; i++)
5901 hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
5902
5903 for (i = rcd->expected_base;
5904 i < rcd->expected_base + rcd->expected_count; i++)
5905 hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
5906}
5907
5908int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
5909 struct hfi1_ctxt_info *kinfo)
5910{
5911 kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
5912 HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
5913 return 0;
5914}
5915
5916struct hfi1_message_header *hfi1_get_msgheader(
5917 struct hfi1_devdata *dd, __le32 *rhf_addr)
5918{
5919 u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
5920
5921 return (struct hfi1_message_header *)
5922 (rhf_addr - dd->rhf_offset + offset);
5923}
5924
5925static const char * const ib_cfg_name_strings[] = {
5926 "HFI1_IB_CFG_LIDLMC",
5927 "HFI1_IB_CFG_LWID_DG_ENB",
5928 "HFI1_IB_CFG_LWID_ENB",
5929 "HFI1_IB_CFG_LWID",
5930 "HFI1_IB_CFG_SPD_ENB",
5931 "HFI1_IB_CFG_SPD",
5932 "HFI1_IB_CFG_RXPOL_ENB",
5933 "HFI1_IB_CFG_LREV_ENB",
5934 "HFI1_IB_CFG_LINKLATENCY",
5935 "HFI1_IB_CFG_HRTBT",
5936 "HFI1_IB_CFG_OP_VLS",
5937 "HFI1_IB_CFG_VL_HIGH_CAP",
5938 "HFI1_IB_CFG_VL_LOW_CAP",
5939 "HFI1_IB_CFG_OVERRUN_THRESH",
5940 "HFI1_IB_CFG_PHYERR_THRESH",
5941 "HFI1_IB_CFG_LINKDEFAULT",
5942 "HFI1_IB_CFG_PKEYS",
5943 "HFI1_IB_CFG_MTU",
5944 "HFI1_IB_CFG_LSTATE",
5945 "HFI1_IB_CFG_VL_HIGH_LIMIT",
5946 "HFI1_IB_CFG_PMA_TICKS",
5947 "HFI1_IB_CFG_PORT"
5948};
5949
5950static const char *ib_cfg_name(int which)
5951{
5952 if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
5953 return "invalid";
5954 return ib_cfg_name_strings[which];
5955}
5956
5957int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
5958{
5959 struct hfi1_devdata *dd = ppd->dd;
5960 int val = 0;
5961
5962 switch (which) {
5963 case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
5964 val = ppd->link_width_enabled;
5965 break;
5966 case HFI1_IB_CFG_LWID: /* currently active Link-width */
5967 val = ppd->link_width_active;
5968 break;
5969 case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
5970 val = ppd->link_speed_enabled;
5971 break;
5972 case HFI1_IB_CFG_SPD: /* current Link speed */
5973 val = ppd->link_speed_active;
5974 break;
5975
5976 case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
5977 case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
5978 case HFI1_IB_CFG_LINKLATENCY:
5979 goto unimplemented;
5980
5981 case HFI1_IB_CFG_OP_VLS:
5982 val = ppd->vls_operational;
5983 break;
5984 case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
5985 val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
5986 break;
5987 case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
5988 val = VL_ARB_LOW_PRIO_TABLE_SIZE;
5989 break;
5990 case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
5991 val = ppd->overrun_threshold;
5992 break;
5993 case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
5994 val = ppd->phy_error_threshold;
5995 break;
5996 case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
5997 val = dd->link_default;
5998 break;
5999
6000 case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
6001 case HFI1_IB_CFG_PMA_TICKS:
6002 default:
6003unimplemented:
6004 if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
6005 dd_dev_info(
6006 dd,
6007 "%s: which %s: not implemented\n",
6008 __func__,
6009 ib_cfg_name(which));
6010 break;
6011 }
6012
6013 return val;
6014}
6015
6016/*
6017 * The largest MAD packet size.
6018 */
6019#define MAX_MAD_PACKET 2048
6020
6021/*
6022 * Return the maximum header bytes that can go on the _wire_
6023 * for this device. This count includes the ICRC which is
6024 * not part of the packet held in memory but it is appended
6025 * by the HW.
6026 * This is dependent on the device's receive header entry size.
6027 * HFI allows this to be set per-receive context, but the
6028 * driver presently enforces a global value.
6029 */
6030u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
6031{
6032 /*
6033 * The maximum non-payload (MTU) bytes in LRH.PktLen are
6034 * the Receive Header Entry Size minus the PBC (or RHF) size
6035 * plus one DW for the ICRC appended by HW.
6036 *
6037 * dd->rcd[0].rcvhdrqentsize is in DW.
6038 * We use rcd[0] as all contexts will have the same value. Also,
6039 * the first kernel context would have been allocated by now so
6040 * we are guaranteed a valid value.
6041 */
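	/*
	 * Illustration only (hypothetical entry size): a 32 DW receive
	 * header entry yields (32 - 2 + 1) * 4 = 124 bytes on the wire.
	 */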
6042 return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
6043}
6044
6045/*
6046 * Set Send Length
6047 * @ppd - per port data
6048 *
6049 * Set the MTU by limiting how many DWs may be sent. The SendLenCheck*
6050 * registers compare against LRH.PktLen, so use the max bytes included
6051 * in the LRH.
6052 *
6053 * This routine changes all VL values except VL15, which it maintains at
6054 * the same value.
6055 */
6056static void set_send_length(struct hfi1_pportdata *ppd)
6057{
6058 struct hfi1_devdata *dd = ppd->dd;
6059 u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu;
6060 u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
6061 & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
6062 SEND_LEN_CHECK1_LEN_VL15_SHIFT;
6063 int i;
6064
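	/*
	 * SEND_LEN_CHECK0 (len1) collects VL0-VL3; SEND_LEN_CHECK1 (len2)
	 * collects the remaining data VLs plus VL15, which was folded into
	 * its initial value above.
	 */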
6065 for (i = 0; i < ppd->vls_supported; i++) {
6066 if (dd->vld[i].mtu > maxvlmtu)
6067 maxvlmtu = dd->vld[i].mtu;
6068 if (i <= 3)
6069 len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
6070 & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
6071 ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
6072 else
6073 len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
6074 & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
6075 ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
6076 }
6077 write_csr(dd, SEND_LEN_CHECK0, len1);
6078 write_csr(dd, SEND_LEN_CHECK1, len2);
6079 /* adjust kernel credit return thresholds based on new MTUs */
6080 /* all kernel receive contexts have the same hdrqentsize */
6081 for (i = 0; i < ppd->vls_supported; i++) {
6082 sc_set_cr_threshold(dd->vld[i].sc,
6083 sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
6084 dd->rcd[0]->rcvhdrqentsize));
6085 }
6086 sc_set_cr_threshold(dd->vld[15].sc,
6087 sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
6088 dd->rcd[0]->rcvhdrqentsize));
6089
6090 /* Adjust maximum MTU for the port in DC */
6091 dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
6092 (ilog2(maxvlmtu >> 8) + 1);
6093 len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
6094 len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
6095 len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
6096 DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
6097 write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
6098}
6099
6100static void set_lidlmc(struct hfi1_pportdata *ppd)
6101{
6102 int i;
6103 u64 sreg = 0;
6104 struct hfi1_devdata *dd = ppd->dd;
6105 u32 mask = ~((1U << ppd->lmc) - 1);
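	/* e.g., an LMC of 2 yields mask 0xfffffffc, masking off the low 2 LID bits */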
6106 u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
6107
6108 if (dd->hfi1_snoop.mode_flag)
6109 dd_dev_info(dd, "Set lid/lmc while snooping");
6110
6111 c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
6112 | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
6113 c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
6114 << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
6115 ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
6116 << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
6117 write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
6118
6119 /*
6120 * Iterate over all the send contexts and set their SLID check
6121 */
6122 sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
6123 SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
6124 (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
6125 SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
6126
6127 for (i = 0; i < dd->chip_send_contexts; i++) {
6128 hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
6129 i, (u32)sreg);
6130 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
6131 }
6132
6133 /* Now we have to do the same thing for the sdma engines */
6134 sdma_update_lmc(dd, mask, ppd->lid);
6135}
6136
6137static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
6138{
6139 unsigned long timeout;
6140 u32 curr_state;
6141
6142 timeout = jiffies + msecs_to_jiffies(msecs);
6143 while (1) {
6144 curr_state = read_physical_state(dd);
6145 if (curr_state == state)
6146 break;
6147 if (time_after(jiffies, timeout)) {
6148 dd_dev_err(dd,
6149 "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
6150 state, curr_state);
6151 return -ETIMEDOUT;
6152 }
6153 usleep_range(1950, 2050); /* sleep 2ms-ish */
6154 }
6155
6156 return 0;
6157}
6158
6159/*
6160 * Helper for set_link_state(). Do not call except from that routine.
6161 * Expects ppd->hls_mutex to be held.
6162 *
6163 * @rem_reason value to be sent to the neighbor
6164 *
6165 * LinkDownReasons only set if transition succeeds.
6166 */
6167static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
6168{
6169 struct hfi1_devdata *dd = ppd->dd;
6170 u32 pstate, previous_state;
6171 u32 last_local_state;
6172 u32 last_remote_state;
6173 int ret;
6174 int do_transition;
6175 int do_wait;
6176
6177 previous_state = ppd->host_link_state;
6178 ppd->host_link_state = HLS_GOING_OFFLINE;
6179 pstate = read_physical_state(dd);
6180 if (pstate == PLS_OFFLINE) {
6181 do_transition = 0; /* in right state */
6182 do_wait = 0; /* ...no need to wait */
6183 } else if ((pstate & 0xff) == PLS_OFFLINE) {
6184 do_transition = 0; /* in an offline transient state */
6185 do_wait = 1; /* ...wait for it to settle */
6186 } else {
6187 do_transition = 1; /* need to move to offline */
6188 do_wait = 1; /* ...will need to wait */
6189 }
6190
6191 if (do_transition) {
6192 ret = set_physical_link_state(dd,
6193 PLS_OFFLINE | (rem_reason << 8));
6194
6195 if (ret != HCMD_SUCCESS) {
6196 dd_dev_err(dd,
6197 "Failed to transition to Offline link state, return %d\n",
6198 ret);
6199 return -EINVAL;
6200 }
6201 if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
6202 ppd->offline_disabled_reason =
6203 OPA_LINKDOWN_REASON_TRANSIENT;
6204 }
6205
6206 if (do_wait) {
6207 /* it can take a while for the link to go down */
6208 ret = wait_phy_linkstate(dd, PLS_OFFLINE, 5000);
6209 if (ret < 0)
6210 return ret;
6211 }
6212
6213 /* make sure the logical state is also down */
6214 wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
6215
6216 /*
6217 * Now in charge of LCB - must be after the physical state is
6218 * offline.quiet and before host_link_state is changed.
6219 */
6220 set_host_lcb_access(dd);
6221 write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
6222 ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
6223
6224 /*
6225 * The LNI has a mandatory wait time after the physical state
6226 * moves to Offline.Quiet. The wait time may be different
6227 * depending on how the link went down. The 8051 firmware
6228 * will observe the needed wait time and only move to ready
6229 * when that is completed. The largest of the quiet timeouts
6230 * is 2.5s, so wait that long and then a bit more.
6231 */
6232 ret = wait_fm_ready(dd, 3000);
6233 if (ret) {
6234 dd_dev_err(dd,
6235 "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
6236 /* state is really offline, so make it so */
6237 ppd->host_link_state = HLS_DN_OFFLINE;
6238 return ret;
6239 }
6240
6241 /*
6242 * The state is now offline and the 8051 is ready to accept host
6243 * requests.
6244 * - change our state
6245 * - notify others if we were previously in a linkup state
6246 */
6247 ppd->host_link_state = HLS_DN_OFFLINE;
6248 if (previous_state & HLS_UP) {
6249 /* went down while link was up */
6250 handle_linkup_change(dd, 0);
6251 } else if (previous_state
6252 & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
6253 /* went down while attempting link up */
6254 /* byte 1 of last_*_state is the failure reason */
6255 read_last_local_state(dd, &last_local_state);
6256 read_last_remote_state(dd, &last_remote_state);
6257 dd_dev_err(dd,
6258 "LNI failure last states: local 0x%08x, remote 0x%08x\n",
6259 last_local_state, last_remote_state);
6260 }
6261
6262 /* the active link width (downgrade) is 0 on link down */
6263 ppd->link_width_active = 0;
6264 ppd->link_width_downgrade_tx_active = 0;
6265 ppd->link_width_downgrade_rx_active = 0;
6266 ppd->current_egress_rate = 0;
6267 return 0;
6268}
6269
6270/* return the link state name */
6271static const char *link_state_name(u32 state)
6272{
6273 const char *name;
6274 int n = ilog2(state);
6275 static const char * const names[] = {
6276 [__HLS_UP_INIT_BP] = "INIT",
6277 [__HLS_UP_ARMED_BP] = "ARMED",
6278 [__HLS_UP_ACTIVE_BP] = "ACTIVE",
6279 [__HLS_DN_DOWNDEF_BP] = "DOWNDEF",
6280 [__HLS_DN_POLL_BP] = "POLL",
6281 [__HLS_DN_DISABLE_BP] = "DISABLE",
6282 [__HLS_DN_OFFLINE_BP] = "OFFLINE",
6283 [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP",
6284 [__HLS_GOING_UP_BP] = "GOING_UP",
6285 [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
6286 [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
6287 };
6288
6289 name = n < ARRAY_SIZE(names) ? names[n] : NULL;
6290 return name ? name : "unknown";
6291}
6292
6293/* return the link state reason name */
6294static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
6295{
6296 if (state == HLS_UP_INIT) {
6297 switch (ppd->linkinit_reason) {
6298 case OPA_LINKINIT_REASON_LINKUP:
6299 return "(LINKUP)";
6300 case OPA_LINKINIT_REASON_FLAPPING:
6301 return "(FLAPPING)";
6302 case OPA_LINKINIT_OUTSIDE_POLICY:
6303 return "(OUTSIDE_POLICY)";
6304 case OPA_LINKINIT_QUARANTINED:
6305 return "(QUARANTINED)";
6306 case OPA_LINKINIT_INSUFIC_CAPABILITY:
6307 return "(INSUFIC_CAPABILITY)";
6308 default:
6309 break;
6310 }
6311 }
6312 return "";
6313}
6314
6315/*
6316 * driver_physical_state - convert the driver's notion of a port's
6317 * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
6318 * Return -1 (converted to a u32) to indicate error.
6319 */
6320u32 driver_physical_state(struct hfi1_pportdata *ppd)
6321{
6322 switch (ppd->host_link_state) {
6323 case HLS_UP_INIT:
6324 case HLS_UP_ARMED:
6325 case HLS_UP_ACTIVE:
6326 return IB_PORTPHYSSTATE_LINKUP;
6327 case HLS_DN_POLL:
6328 return IB_PORTPHYSSTATE_POLLING;
6329 case HLS_DN_DISABLE:
6330 return IB_PORTPHYSSTATE_DISABLED;
6331 case HLS_DN_OFFLINE:
6332 return OPA_PORTPHYSSTATE_OFFLINE;
6333 case HLS_VERIFY_CAP:
6334 return IB_PORTPHYSSTATE_POLLING;
6335 case HLS_GOING_UP:
6336 return IB_PORTPHYSSTATE_POLLING;
6337 case HLS_GOING_OFFLINE:
6338 return OPA_PORTPHYSSTATE_OFFLINE;
6339 case HLS_LINK_COOLDOWN:
6340 return OPA_PORTPHYSSTATE_OFFLINE;
6341 case HLS_DN_DOWNDEF:
6342 default:
6343 dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
6344 ppd->host_link_state);
6345 return -1;
6346 }
6347}
6348
6349/*
6350 * driver_logical_state - convert the driver's notion of a port's
6351 * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
6352 * (converted to a u32) to indicate error.
6353 */
6354u32 driver_logical_state(struct hfi1_pportdata *ppd)
6355{
6356 if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
6357 return IB_PORT_DOWN;
6358
6359 switch (ppd->host_link_state & HLS_UP) {
6360 case HLS_UP_INIT:
6361 return IB_PORT_INIT;
6362 case HLS_UP_ARMED:
6363 return IB_PORT_ARMED;
6364 case HLS_UP_ACTIVE:
6365 return IB_PORT_ACTIVE;
6366 default:
6367 dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
6368 ppd->host_link_state);
6369 return -1;
6370 }
6371}
6372
6373void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
6374 u8 neigh_reason, u8 rem_reason)
6375{
6376 if (ppd->local_link_down_reason.latest == 0 &&
6377 ppd->neigh_link_down_reason.latest == 0) {
6378 ppd->local_link_down_reason.latest = lcl_reason;
6379 ppd->neigh_link_down_reason.latest = neigh_reason;
6380 ppd->remote_link_down_reason = rem_reason;
6381 }
6382}
6383
6384/*
6385 * Change the physical and/or logical link state.
6386 *
6387 * Do not call this routine while inside an interrupt. It contains
6388 * calls to routines that can take multiple seconds to finish.
6389 *
6390 * Returns 0 on success, -errno on failure.
6391 */
6392int set_link_state(struct hfi1_pportdata *ppd, u32 state)
6393{
6394 struct hfi1_devdata *dd = ppd->dd;
6395 struct ib_event event = {.device = NULL};
6396 int ret1, ret = 0;
6397 int was_up, is_down;
6398 int orig_new_state, poll_bounce;
6399
6400 mutex_lock(&ppd->hls_lock);
6401
6402 orig_new_state = state;
6403 if (state == HLS_DN_DOWNDEF)
6404 state = dd->link_default;
6405
6406 /* interpret poll -> poll as a link bounce */
6407 poll_bounce = ppd->host_link_state == HLS_DN_POLL
6408 && state == HLS_DN_POLL;
6409
6410 dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
6411 link_state_name(ppd->host_link_state),
6412 link_state_name(orig_new_state),
6413 poll_bounce ? "(bounce) " : "",
6414 link_state_reason_name(ppd, state));
6415
6416 was_up = !!(ppd->host_link_state & HLS_UP);
6417
6418 /*
6419 * If we're going to a (HLS_*) link state that implies the logical
6420 * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
6421 * reset is_sm_config_started to 0.
6422 */
6423 if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
6424 ppd->is_sm_config_started = 0;
6425
6426 /*
6427 * Do nothing if the states match. Let a poll to poll link bounce
6428 * go through.
6429 */
6430 if (ppd->host_link_state == state && !poll_bounce)
6431 goto done;
6432
6433 switch (state) {
6434 case HLS_UP_INIT:
6435 if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
6436 || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
6437 /*
6438 * Quick link up jumps from polling to here.
6439 *
6440 * Whether in normal or loopback mode, the
6441 * simulator jumps from polling to link up.
6442 * Accept that here.
6443 */
6444 /* OK */;
6445 } else if (ppd->host_link_state != HLS_GOING_UP) {
6446 goto unexpected;
6447 }
6448
6449 ppd->host_link_state = HLS_UP_INIT;
6450 ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
6451 if (ret) {
6452 /* logical state didn't change, stay at going_up */
6453 ppd->host_link_state = HLS_GOING_UP;
6454 dd_dev_err(dd,
6455 "%s: logical state did not change to INIT\n",
6456 __func__);
6457 } else {
6458 /* clear old transient LINKINIT_REASON code */
6459 if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
6460 ppd->linkinit_reason =
6461 OPA_LINKINIT_REASON_LINKUP;
6462
6463 /* enable the port */
6464 add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
6465
6466 handle_linkup_change(dd, 1);
6467 }
6468 break;
6469 case HLS_UP_ARMED:
6470 if (ppd->host_link_state != HLS_UP_INIT)
6471 goto unexpected;
6472
6473 ppd->host_link_state = HLS_UP_ARMED;
6474 set_logical_state(dd, LSTATE_ARMED);
6475 ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
6476 if (ret) {
6477 /* logical state didn't change, stay at init */
6478 ppd->host_link_state = HLS_UP_INIT;
6479 dd_dev_err(dd,
6480 "%s: logical state did not change to ARMED\n",
6481 __func__);
6482 }
6483 /*
6484 * The simulator does not currently implement SMA messages,
6485 * so neighbor_normal is not set. Set it here when we first
6486 * move to Armed.
6487 */
6488 if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
6489 ppd->neighbor_normal = 1;
6490 break;
6491 case HLS_UP_ACTIVE:
6492 if (ppd->host_link_state != HLS_UP_ARMED)
6493 goto unexpected;
6494
6495 ppd->host_link_state = HLS_UP_ACTIVE;
6496 set_logical_state(dd, LSTATE_ACTIVE);
6497 ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
6498 if (ret) {
6499 /* logical state didn't change, stay at armed */
6500 ppd->host_link_state = HLS_UP_ARMED;
6501 dd_dev_err(dd,
6502 "%s: logical state did not change to ACTIVE\n",
6503 __func__);
6504 } else {
6505
6506 /* tell all engines to go running */
6507 sdma_all_running(dd);
6508
6509 /* Signal the IB layer that the port has gone active */
6510 event.device = &dd->verbs_dev.ibdev;
6511 event.element.port_num = ppd->port;
6512 event.event = IB_EVENT_PORT_ACTIVE;
6513 }
6514 break;
6515 case HLS_DN_POLL:
6516 if ((ppd->host_link_state == HLS_DN_DISABLE ||
6517 ppd->host_link_state == HLS_DN_OFFLINE) &&
6518 dd->dc_shutdown)
6519 dc_start(dd);
6520 /* Hand LED control to the DC */
6521 write_csr(dd, DCC_CFG_LED_CNTRL, 0);
6522
6523 if (ppd->host_link_state != HLS_DN_OFFLINE) {
6524 u8 tmp = ppd->link_enabled;
6525
6526 ret = goto_offline(ppd, ppd->remote_link_down_reason);
6527 if (ret) {
6528 ppd->link_enabled = tmp;
6529 break;
6530 }
6531 ppd->remote_link_down_reason = 0;
6532
6533 if (ppd->driver_link_ready)
6534 ppd->link_enabled = 1;
6535 }
6536
6537 ret = set_local_link_attributes(ppd);
6538 if (ret)
6539 break;
6540
6541 ppd->port_error_action = 0;
6542 ppd->host_link_state = HLS_DN_POLL;
6543
6544 if (quick_linkup) {
6545 /* quick linkup does not go into polling */
6546 ret = do_quick_linkup(dd);
6547 } else {
6548 ret1 = set_physical_link_state(dd, PLS_POLLING);
6549 if (ret1 != HCMD_SUCCESS) {
6550 dd_dev_err(dd,
6551 "Failed to transition to Polling link state, return 0x%x\n",
6552 ret1);
6553 ret = -EINVAL;
6554 }
6555 }
6556 ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
6557 /*
6558 * If an error occurred above, go back to offline. The
6559 * caller may reschedule another attempt.
6560 */
6561 if (ret)
6562 goto_offline(ppd, 0);
6563 break;
6564 case HLS_DN_DISABLE:
6565 /* link is disabled */
6566 ppd->link_enabled = 0;
6567
6568 /* allow any state to transition to disabled */
6569
6570 /* must transition to offline first */
6571 if (ppd->host_link_state != HLS_DN_OFFLINE) {
6572 ret = goto_offline(ppd, ppd->remote_link_down_reason);
6573 if (ret)
6574 break;
6575 ppd->remote_link_down_reason = 0;
6576 }
6577
6578 ret1 = set_physical_link_state(dd, PLS_DISABLED);
6579 if (ret1 != HCMD_SUCCESS) {
6580 dd_dev_err(dd,
6581 "Failed to transition to Disabled link state, return 0x%x\n",
6582 ret1);
6583 ret = -EINVAL;
6584 break;
6585 }
6586 ppd->host_link_state = HLS_DN_DISABLE;
6587 dc_shutdown(dd);
6588 break;
6589 case HLS_DN_OFFLINE:
6590 if (ppd->host_link_state == HLS_DN_DISABLE)
6591 dc_start(dd);
6592
6593 /* allow any state to transition to offline */
6594 ret = goto_offline(ppd, ppd->remote_link_down_reason);
6595 if (!ret)
6596 ppd->remote_link_down_reason = 0;
6597 break;
6598 case HLS_VERIFY_CAP:
6599 if (ppd->host_link_state != HLS_DN_POLL)
6600 goto unexpected;
6601 ppd->host_link_state = HLS_VERIFY_CAP;
6602 break;
6603 case HLS_GOING_UP:
6604 if (ppd->host_link_state != HLS_VERIFY_CAP)
6605 goto unexpected;
6606
6607 ret1 = set_physical_link_state(dd, PLS_LINKUP);
6608 if (ret1 != HCMD_SUCCESS) {
6609 dd_dev_err(dd,
6610 "Failed to transition to link up state, return 0x%x\n",
6611 ret1);
6612 ret = -EINVAL;
6613 break;
6614 }
6615 ppd->host_link_state = HLS_GOING_UP;
6616 break;
6617
6618 case HLS_GOING_OFFLINE: /* transient within goto_offline() */
6619 case HLS_LINK_COOLDOWN: /* transient within goto_offline() */
6620 default:
6621 dd_dev_info(dd, "%s: state 0x%x: not supported\n",
6622 __func__, state);
6623 ret = -EINVAL;
6624 break;
6625 }
6626
6627 is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
6628 HLS_DN_DISABLE | HLS_DN_OFFLINE));
6629
6630 if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
6631 ppd->neigh_link_down_reason.sma == 0) {
6632 ppd->local_link_down_reason.sma =
6633 ppd->local_link_down_reason.latest;
6634 ppd->neigh_link_down_reason.sma =
6635 ppd->neigh_link_down_reason.latest;
6636 }
6637
6638 goto done;
6639
6640unexpected:
6641 dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
6642 __func__, link_state_name(ppd->host_link_state),
6643 link_state_name(state));
6644 ret = -EINVAL;
6645
6646done:
6647 mutex_unlock(&ppd->hls_lock);
6648
6649 if (event.device)
6650 ib_dispatch_event(&event);
6651
6652 return ret;
6653}
6654
6655int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
6656{
6657 u64 reg;
6658 int ret = 0;
6659
6660 switch (which) {
6661 case HFI1_IB_CFG_LIDLMC:
6662 set_lidlmc(ppd);
6663 break;
6664 case HFI1_IB_CFG_VL_HIGH_LIMIT:
6665 /*
6666 * The VL Arbitrator high limit is sent in units of 4k
6667 * bytes, while HFI stores it in units of 64 bytes.
6668 */
6669 val *= 4096/64;
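		/* e.g., an FM value of 2 (2 * 4 KB) becomes 2 * 64 = 128 here */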
6670 reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
6671 << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
6672 write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
6673 break;
6674 case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
6675 /* HFI only supports POLL as the default link down state */
6676 if (val != HLS_DN_POLL)
6677 ret = -EINVAL;
6678 break;
6679 case HFI1_IB_CFG_OP_VLS:
6680 if (ppd->vls_operational != val) {
6681 ppd->vls_operational = val;
6682 if (!ppd->port)
6683 ret = -EINVAL;
6684 else
6685 ret = sdma_map_init(
6686 ppd->dd,
6687 ppd->port - 1,
6688 val,
6689 NULL);
6690 }
6691 break;
6692 /*
6693 * For link width, link width downgrade, and speed enable, always AND
6694 * the setting with what is actually supported. This has two benefits.
6695 * First, enabled can't have unsupported values, no matter what the
6696 * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean
6697 * "fill in with your supported value" have all the bits in the
6698 * field set, so simply ANDing with supported has the desired result.
6699 */
6700 case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
6701 ppd->link_width_enabled = val & ppd->link_width_supported;
6702 break;
6703 case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
6704 ppd->link_width_downgrade_enabled =
6705 val & ppd->link_width_downgrade_supported;
6706 break;
6707 case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
6708 ppd->link_speed_enabled = val & ppd->link_speed_supported;
6709 break;
6710 case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
6711 /*
6712 * HFI does not follow IB specs, save this value
6713 * so we can report it, if asked.
6714 */
6715 ppd->overrun_threshold = val;
6716 break;
6717 case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
6718 /*
6719 * HFI does not follow IB specs, save this value
6720 * so we can report it, if asked.
6721 */
6722 ppd->phy_error_threshold = val;
6723 break;
6724
6725 case HFI1_IB_CFG_MTU:
6726 set_send_length(ppd);
6727 break;
6728
6729 case HFI1_IB_CFG_PKEYS:
6730 if (HFI1_CAP_IS_KSET(PKEY_CHECK))
6731 set_partition_keys(ppd);
6732 break;
6733
6734 default:
6735 if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
6736 dd_dev_info(ppd->dd,
6737 "%s: which %s, val 0x%x: not implemented\n",
6738 __func__, ib_cfg_name(which), val);
6739 break;
6740 }
6741 return ret;
6742}
6743
6744/* begin functions related to vl arbitration table caching */
6745static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
6746{
6747 int i;
6748
6749 BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
6750 VL_ARB_LOW_PRIO_TABLE_SIZE);
6751 BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
6752 VL_ARB_HIGH_PRIO_TABLE_SIZE);
6753
6754 /*
6755 * Note that we always return values directly from the
6756 * 'vl_arb_cache' (and do no CSR reads) in response to a
6757 * 'Get(VLArbTable)'. This is obviously correct after a
6758 * 'Set(VLArbTable)', since the cache will then be up to
6759 * date. But it's also correct prior to any 'Set(VLArbTable)'
6760 * since then both the cache, and the relevant h/w registers
6761 * will be zeroed.
6762 */
6763
6764 for (i = 0; i < MAX_PRIO_TABLE; i++)
6765 spin_lock_init(&ppd->vl_arb_cache[i].lock);
6766}
6767
6768/*
6769 * vl_arb_lock_cache
6770 *
6771 * All other vl_arb_* functions should be called only after locking
6772 * the cache.
6773 */
6774static inline struct vl_arb_cache *
6775vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
6776{
6777 if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
6778 return NULL;
6779 spin_lock(&ppd->vl_arb_cache[idx].lock);
6780 return &ppd->vl_arb_cache[idx];
6781}
6782
6783static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
6784{
6785 spin_unlock(&ppd->vl_arb_cache[idx].lock);
6786}
6787
6788static void vl_arb_get_cache(struct vl_arb_cache *cache,
6789 struct ib_vl_weight_elem *vl)
6790{
6791 memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
6792}
6793
6794static void vl_arb_set_cache(struct vl_arb_cache *cache,
6795 struct ib_vl_weight_elem *vl)
6796{
6797 memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
6798}
6799
6800static int vl_arb_match_cache(struct vl_arb_cache *cache,
6801 struct ib_vl_weight_elem *vl)
6802{
6803 return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
6804}
6805/* end functions related to vl arbitration table caching */
6806
6807static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
6808 u32 size, struct ib_vl_weight_elem *vl)
6809{
6810 struct hfi1_devdata *dd = ppd->dd;
6811 u64 reg;
6812 unsigned int i, is_up = 0;
6813 int drain, ret = 0;
6814
6815 mutex_lock(&ppd->hls_lock);
6816
6817 if (ppd->host_link_state & HLS_UP)
6818 is_up = 1;
6819
6820 drain = !is_ax(dd) && is_up;
6821
6822 if (drain)
6823 /*
6824 * Before adjusting VL arbitration weights, empty per-VL
6825 * FIFOs, otherwise a packet whose VL weight is being
6826 * set to 0 could get stuck in a FIFO with no chance to
6827 * egress.
6828 */
6829 ret = stop_drain_data_vls(dd);
6830
6831 if (ret) {
6832 dd_dev_err(
6833 dd,
6834 "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
6835 __func__);
6836 goto err;
6837 }
6838
6839 for (i = 0; i < size; i++, vl++) {
6840 /*
6841 * NOTE: The low priority shift and mask are used here, but
6842 * they are the same for both the low and high registers.
6843 */
6844 reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
6845 << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
6846 | (((u64)vl->weight
6847 & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
6848 << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
6849 write_csr(dd, target + (i * 8), reg);
6850 }
6851 pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
6852
6853 if (drain)
6854 open_fill_data_vls(dd); /* reopen all VLs */
6855
6856err:
6857 mutex_unlock(&ppd->hls_lock);
6858
6859 return ret;
6860}
6861
6862/*
6863 * Read one credit merge VL register.
6864 */
6865static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
6866 struct vl_limit *vll)
6867{
6868 u64 reg = read_csr(dd, csr);
6869
6870 vll->dedicated = cpu_to_be16(
6871 (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
6872 & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
6873 vll->shared = cpu_to_be16(
6874 (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
6875 & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
6876}
6877
6878/*
6879 * Read the current credit merge limits.
6880 */
6881static int get_buffer_control(struct hfi1_devdata *dd,
6882 struct buffer_control *bc, u16 *overall_limit)
6883{
6884 u64 reg;
6885 int i;
6886
6887 /* not all entries are filled in */
6888 memset(bc, 0, sizeof(*bc));
6889
6890 /* OPA and HFI have a 1-1 mapping */
6891 for (i = 0; i < TXE_NUM_DATA_VL; i++)
6892 read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
6893
6894 /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
6895 read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
6896
6897 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
6898 bc->overall_shared_limit = cpu_to_be16(
6899 (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
6900 & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
6901 if (overall_limit)
6902 *overall_limit = (reg
6903 >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
6904 & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
6905 return sizeof(struct buffer_control);
6906}
6907
6908static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
6909{
6910 u64 reg;
6911 int i;
6912
6913 /* each register contains 16 SC->VLnt mappings, 4 bits each */
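	/*
	 * Illustration: a byte value of 0x21 maps the lower-numbered SC of
	 * that byte pair to VLnt 1 (low nibble) and the higher-numbered SC
	 * to VLnt 2 (high nibble).
	 */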
6914 reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
6915 for (i = 0; i < sizeof(u64); i++) {
6916 u8 byte = *(((u8 *)&reg) + i);
6917
6918 dp->vlnt[2 * i] = byte & 0xf;
6919 dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
6920 }
6921
6922 reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
6923 for (i = 0; i < sizeof(u64); i++) {
6924 u8 byte = *(((u8 *)&reg) + i);
6925
6926 dp->vlnt[16 + (2 * i)] = byte & 0xf;
6927 dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
6928 }
6929 return sizeof(struct sc2vlnt);
6930}
6931
6932static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
6933 struct ib_vl_weight_elem *vl)
6934{
6935 unsigned int i;
6936
6937 for (i = 0; i < nelems; i++, vl++) {
6938 vl->vl = 0xf;
6939 vl->weight = 0;
6940 }
6941}
6942
6943static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
6944{
6945 write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
6946 DC_SC_VL_VAL(15_0,
6947 0, dp->vlnt[0] & 0xf,
6948 1, dp->vlnt[1] & 0xf,
6949 2, dp->vlnt[2] & 0xf,
6950 3, dp->vlnt[3] & 0xf,
6951 4, dp->vlnt[4] & 0xf,
6952 5, dp->vlnt[5] & 0xf,
6953 6, dp->vlnt[6] & 0xf,
6954 7, dp->vlnt[7] & 0xf,
6955 8, dp->vlnt[8] & 0xf,
6956 9, dp->vlnt[9] & 0xf,
6957 10, dp->vlnt[10] & 0xf,
6958 11, dp->vlnt[11] & 0xf,
6959 12, dp->vlnt[12] & 0xf,
6960 13, dp->vlnt[13] & 0xf,
6961 14, dp->vlnt[14] & 0xf,
6962 15, dp->vlnt[15] & 0xf));
6963 write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
6964 DC_SC_VL_VAL(31_16,
6965 16, dp->vlnt[16] & 0xf,
6966 17, dp->vlnt[17] & 0xf,
6967 18, dp->vlnt[18] & 0xf,
6968 19, dp->vlnt[19] & 0xf,
6969 20, dp->vlnt[20] & 0xf,
6970 21, dp->vlnt[21] & 0xf,
6971 22, dp->vlnt[22] & 0xf,
6972 23, dp->vlnt[23] & 0xf,
6973 24, dp->vlnt[24] & 0xf,
6974 25, dp->vlnt[25] & 0xf,
6975 26, dp->vlnt[26] & 0xf,
6976 27, dp->vlnt[27] & 0xf,
6977 28, dp->vlnt[28] & 0xf,
6978 29, dp->vlnt[29] & 0xf,
6979 30, dp->vlnt[30] & 0xf,
6980 31, dp->vlnt[31] & 0xf));
6981}
6982
6983static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
6984 u16 limit)
6985{
6986 if (limit != 0)
6987 dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
6988 what, (int)limit, idx);
6989}
6990
6991/* change only the shared limit portion of SendCmGlobalCredit */
6992static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
6993{
6994 u64 reg;
6995
6996 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
6997 reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
6998 reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
6999 write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
7000}
7001
7002/* change only the total credit limit portion of SendCmGlobalCredit */
7003static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
7004{
7005 u64 reg;
7006
7007 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
7008 reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
7009 reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
7010 write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
7011}
7012
7013/* set the given per-VL shared limit */
7014static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
7015{
7016 u64 reg;
7017 u32 addr;
7018
7019 if (vl < TXE_NUM_DATA_VL)
7020 addr = SEND_CM_CREDIT_VL + (8 * vl);
7021 else
7022 addr = SEND_CM_CREDIT_VL15;
7023
7024 reg = read_csr(dd, addr);
7025 reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
7026 reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
7027 write_csr(dd, addr, reg);
7028}
7029
7030/* set the given per-VL dedicated limit */
7031static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
7032{
7033 u64 reg;
7034 u32 addr;
7035
7036 if (vl < TXE_NUM_DATA_VL)
7037 addr = SEND_CM_CREDIT_VL + (8 * vl);
7038 else
7039 addr = SEND_CM_CREDIT_VL15;
7040
7041 reg = read_csr(dd, addr);
7042 reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
7043 reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
7044 write_csr(dd, addr, reg);
7045}
7046
7047/* spin until the given per-VL status mask bits clear */
7048static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
7049 const char *which)
7050{
7051 unsigned long timeout;
7052 u64 reg;
7053
7054 timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
7055 while (1) {
7056 reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
7057
7058 if (reg == 0)
7059 return; /* success */
7060 if (time_after(jiffies, timeout))
7061 break; /* timed out */
7062 udelay(1);
7063 }
7064
7065 dd_dev_err(dd,
7066 "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
7067 which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
7068 /*
7069 * If this occurs, it is likely there was a credit loss on the link.
7070 * The only recovery from that is a link bounce.
7071 */
7072 dd_dev_err(dd,
7073 "Continuing anyway. A credit loss may occur. Suggest a link bounce\n");
7074}
7075
7076/*
7077 * The number of credits on the VLs may be changed while everything
7078 * is "live", but the following algorithm must be followed due to
7079 * how the hardware is actually implemented. In particular,
7080 * Return_Credit_Status[] is the only correct status check.
7081 *
7082 * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
7083 * set Global_Shared_Credit_Limit = 0
7084 * use_all_vl = 1
7085 * mask0 = all VLs that are changing either dedicated or shared limits
7086 * set Shared_Limit[mask0] = 0
7087 * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
7088 * if (changing any dedicated limit)
7089 * mask1 = all VLs that are lowering dedicated limits
7090 * lower Dedicated_Limit[mask1]
7091 * spin until Return_Credit_Status[mask1] == 0
7092 * raise Dedicated_Limits
7093 * raise Shared_Limits
7094 * raise Global_Shared_Credit_Limit
7095 *
7096 * lower = if the new limit is lower, set the limit to the new value
7097 * raise = if the new limit is higher than the current value (may be changed
7098 * earlier in the algorithm), set the limit to the new value
7099 */
7100static int set_buffer_control(struct hfi1_devdata *dd,
7101 struct buffer_control *new_bc)
7102{
7103 u64 changing_mask, ld_mask, stat_mask;
7104 int change_count;
7105 int i, use_all_mask;
7106 int this_shared_changing;
7107 /*
7108 * A0: add the variable any_shared_limit_changing below and in the
7109 * algorithm above. If removing A0 support, it can be removed.
7110 */
7111 int any_shared_limit_changing;
7112 struct buffer_control cur_bc;
7113 u8 changing[OPA_MAX_VLS];
7114 u8 lowering_dedicated[OPA_MAX_VLS];
7115 u16 cur_total;
7116 u32 new_total = 0;
7117 const u64 all_mask =
7118 SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
7119 | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
7120 | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
7121 | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
7122 | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
7123 | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
7124 | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
7125 | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
7126 | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
7127
7128#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
7129#define NUM_USABLE_VLS 16 /* look at VL15 and less */
7130
7132 /* find the new total credits, do sanity check on unused VLs */
7133 for (i = 0; i < OPA_MAX_VLS; i++) {
7134 if (valid_vl(i)) {
7135 new_total += be16_to_cpu(new_bc->vl[i].dedicated);
7136 continue;
7137 }
7138 nonzero_msg(dd, i, "dedicated",
7139 be16_to_cpu(new_bc->vl[i].dedicated));
7140 nonzero_msg(dd, i, "shared",
7141 be16_to_cpu(new_bc->vl[i].shared));
7142 new_bc->vl[i].dedicated = 0;
7143 new_bc->vl[i].shared = 0;
7144 }
7145 new_total += be16_to_cpu(new_bc->overall_shared_limit);
7146 if (new_total > (u32)dd->link_credits)
7147 return -EINVAL;
7148 /* fetch the current values */
7149 get_buffer_control(dd, &cur_bc, &cur_total);
7150
7151 /*
7152 * Create the masks we will use.
7153 */
7154 memset(changing, 0, sizeof(changing));
7155 memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
7156 /* NOTE: Assumes that the individual VL bits are adjacent and in
7157 * increasing order */
7158 stat_mask =
7159 SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
7160 changing_mask = 0;
7161 ld_mask = 0;
7162 change_count = 0;
7163 any_shared_limit_changing = 0;
7164 for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
7165 if (!valid_vl(i))
7166 continue;
7167 this_shared_changing = new_bc->vl[i].shared
7168 != cur_bc.vl[i].shared;
7169 if (this_shared_changing)
7170 any_shared_limit_changing = 1;
7171 if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
7172 || this_shared_changing) {
7173 changing[i] = 1;
7174 changing_mask |= stat_mask;
7175 change_count++;
7176 }
7177 if (be16_to_cpu(new_bc->vl[i].dedicated) <
7178 be16_to_cpu(cur_bc.vl[i].dedicated)) {
7179 lowering_dedicated[i] = 1;
7180 ld_mask |= stat_mask;
7181 }
7182 }
7183
7184 /* bracket the credit change with a total adjustment */
7185 if (new_total > cur_total)
7186 set_global_limit(dd, new_total);
7187
7188 /*
7189 * Start the credit change algorithm.
7190 */
7191 use_all_mask = 0;
7192 if ((be16_to_cpu(new_bc->overall_shared_limit) <
7193 be16_to_cpu(cur_bc.overall_shared_limit))
7194 || (is_a0(dd) && any_shared_limit_changing)) {
7195 set_global_shared(dd, 0);
7196 cur_bc.overall_shared_limit = 0;
7197 use_all_mask = 1;
7198 }
7199
7200 for (i = 0; i < NUM_USABLE_VLS; i++) {
7201 if (!valid_vl(i))
7202 continue;
7203
7204 if (changing[i]) {
7205 set_vl_shared(dd, i, 0);
7206 cur_bc.vl[i].shared = 0;
7207 }
7208 }
7209
7210 wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
7211 "shared");
7212
7213 if (change_count > 0) {
7214 for (i = 0; i < NUM_USABLE_VLS; i++) {
7215 if (!valid_vl(i))
7216 continue;
7217
7218 if (lowering_dedicated[i]) {
7219 set_vl_dedicated(dd, i,
7220 be16_to_cpu(new_bc->vl[i].dedicated));
7221 cur_bc.vl[i].dedicated =
7222 new_bc->vl[i].dedicated;
7223 }
7224 }
7225
7226 wait_for_vl_status_clear(dd, ld_mask, "dedicated");
7227
7228 /* now raise all dedicated that are going up */
7229 for (i = 0; i < NUM_USABLE_VLS; i++) {
7230 if (!valid_vl(i))
7231 continue;
7232
7233 if (be16_to_cpu(new_bc->vl[i].dedicated) >
7234 be16_to_cpu(cur_bc.vl[i].dedicated))
7235 set_vl_dedicated(dd, i,
7236 be16_to_cpu(new_bc->vl[i].dedicated));
7237 }
7238 }
7239
7240 /* next raise all shared that are going up */
7241 for (i = 0; i < NUM_USABLE_VLS; i++) {
7242 if (!valid_vl(i))
7243 continue;
7244
7245 if (be16_to_cpu(new_bc->vl[i].shared) >
7246 be16_to_cpu(cur_bc.vl[i].shared))
7247 set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
7248 }
7249
7250 /* finally raise the global shared */
7251 if (be16_to_cpu(new_bc->overall_shared_limit) >
7252 be16_to_cpu(cur_bc.overall_shared_limit))
7253 set_global_shared(dd,
7254 be16_to_cpu(new_bc->overall_shared_limit));
7255
7256 /* bracket the credit change with a total adjustment */
7257 if (new_total < cur_total)
7258 set_global_limit(dd, new_total);
7259 return 0;
7260}
7261
7262/*
7263 * Read the given fabric manager table. Return the size of the
7264 * table (in bytes) on success, and a negative error code on
7265 * failure.
7266 */
7267int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
7269{
7270 int size;
7271 struct vl_arb_cache *vlc;
7272
7273 switch (which) {
7274 case FM_TBL_VL_HIGH_ARB:
7275 size = 256;
7276 /*
7277 * OPA specifies 128 elements (of 2 bytes each), though
7278 * HFI supports only 16 elements in h/w.
7279 */
7280 vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
7281 vl_arb_get_cache(vlc, t);
7282 vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
7283 break;
7284 case FM_TBL_VL_LOW_ARB:
7285 size = 256;
7286 /*
7287 * OPA specifies 128 elements (of 2 bytes each), though
7288 * HFI supports only 16 elements in h/w.
7289 */
7290 vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
7291 vl_arb_get_cache(vlc, t);
7292 vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
7293 break;
7294 case FM_TBL_BUFFER_CONTROL:
7295 size = get_buffer_control(ppd->dd, t, NULL);
7296 break;
7297 case FM_TBL_SC2VLNT:
7298 size = get_sc2vlnt(ppd->dd, t);
7299 break;
7300 case FM_TBL_VL_PREEMPT_ELEMS:
7301 size = 256;
7302 /* OPA specifies 128 elements, of 2 bytes each */
7303 get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
7304 break;
7305 case FM_TBL_VL_PREEMPT_MATRIX:
7306 size = 256;
7307 /*
7308 * OPA specifies that this is the same size as the VL
7309 * arbitration tables (i.e., 256 bytes).
7310 */
7311 break;
7312 default:
7313 return -EINVAL;
7314 }
7315 return size;
7316}
7317
7318/*
7319 * Write the given fabric manager table.
7320 */
7321int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
7322{
7323 int ret = 0;
7324 struct vl_arb_cache *vlc;
7325
7326 switch (which) {
7327 case FM_TBL_VL_HIGH_ARB:
7328 vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
7329 if (vl_arb_match_cache(vlc, t)) {
7330 vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
7331 break;
7332 }
7333 vl_arb_set_cache(vlc, t);
7334 vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
7335 ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
7336 VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
7337 break;
7338 case FM_TBL_VL_LOW_ARB:
7339 vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
7340 if (vl_arb_match_cache(vlc, t)) {
7341 vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
7342 break;
7343 }
7344 vl_arb_set_cache(vlc, t);
7345 vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
7346 ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
7347 VL_ARB_LOW_PRIO_TABLE_SIZE, t);
7348 break;
7349 case FM_TBL_BUFFER_CONTROL:
7350 ret = set_buffer_control(ppd->dd, t);
7351 break;
7352 case FM_TBL_SC2VLNT:
7353 set_sc2vlnt(ppd->dd, t);
7354 break;
7355 default:
7356 ret = -EINVAL;
7357 }
7358 return ret;
7359}
7360
7361/*
7362 * Disable all data VLs.
7363 *
7364 * Return 0 if disabled, non-zero if the VLs cannot be disabled.
7365 */
7366static int disable_data_vls(struct hfi1_devdata *dd)
7367{
7368 if (is_a0(dd))
7369 return 1;
7370
7371 pio_send_control(dd, PSC_DATA_VL_DISABLE);
7372
7373 return 0;
7374}
7375
7376/*
7377 * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
7378 * Just re-enables all data VLs (the "fill" part happens
7379 * automatically - the name was chosen for symmetry with
7380 * stop_drain_data_vls()).
7381 *
7382 * Return 0 if successful, non-zero if the VLs cannot be enabled.
7383 */
7384int open_fill_data_vls(struct hfi1_devdata *dd)
7385{
7386 if (is_a0(dd))
7387 return 1;
7388
7389 pio_send_control(dd, PSC_DATA_VL_ENABLE);
7390
7391 return 0;
7392}
7393
7394/*
7395 * drain_data_vls() - assumes that disable_data_vls() has been called;
7396 * waits for the occupancy of the per-VL FIFOs for all contexts, and of
7397 * the SDMA engines, to drop to 0.
7398 */
7399static void drain_data_vls(struct hfi1_devdata *dd)
7400{
7401 sc_wait(dd);
7402 sdma_wait(dd);
7403 pause_for_credit_return(dd);
7404}
7405
7406/*
7407 * stop_drain_data_vls() - disable, then drain all per-VL fifos.
7408 *
7409 * Use open_fill_data_vls() to resume using data VLs. This pair is
7410 * meant to be used like this:
7411 *
7412 * stop_drain_data_vls(dd);
7413 * // do things with per-VL resources
7414 * open_fill_data_vls(dd);
7415 */
7416int stop_drain_data_vls(struct hfi1_devdata *dd)
7417{
7418 int ret;
7419
7420 ret = disable_data_vls(dd);
7421 if (ret == 0)
7422 drain_data_vls(dd);
7423
7424 return ret;
7425}
7426
7427/*
7428 * Convert a nanosecond time to a cclock count. No matter how slow
7429 * the cclock, a non-zero ns will always have a non-zero result.
7430 */
7431u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
7432{
7433 u32 cclocks;
7434
7435 if (dd->icode == ICODE_FPGA_EMULATION)
7436 cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
7437 else /* simulation pretends to be ASIC */
7438 cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
7439 if (ns && !cclocks) /* if ns nonzero, must be at least 1 */
7440 cclocks = 1;
7441 return cclocks;
7442}
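/*
 * Worked example (with a hypothetical cclock period): if ASIC_CCLOCK_PS
 * were 2000 ps (a 500 MHz cclock), then ns_to_cclock(dd, 5) would be
 * (5 * 1000) / 2000 = 2 cclocks, and ns_to_cclock(dd, 1) would compute
 * (1 * 1000) / 2000 = 0 and then be rounded up to the minimum of 1.
 */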
7443
7444/*
7445 * Convert a cclock count to nanoseconds. No matter how slow
7446 * the cclock, a non-zero cclocks will always have a non-zero result.
7447 */
7448u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
7449{
7450 u32 ns;
7451
7452 if (dd->icode == ICODE_FPGA_EMULATION)
7453 ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
7454 else /* simulation pretends to be ASIC */
7455 ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
7456 if (cclocks && !ns)
7457 ns = 1;
7458 return ns;
7459}
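/*
 * Note that the pair above uses integer division, so the conversions do
 * not round-trip exactly: with the hypothetical 2000 ps cclock period
 * from the example above, cclock_to_ns(dd, ns_to_cclock(dd, 5)) would be
 * (2 * 2000) / 1000 = 4 ns, not 5 ns. Both helpers only guarantee that a
 * non-zero input yields a non-zero result.
 */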
7460
7461/*
7462 * Dynamically adjust the receive interrupt timeout for a context based on
7463 * incoming packet rate.
7464 *
7465 * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
7466 */
7467static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
7468{
7469 struct hfi1_devdata *dd = rcd->dd;
7470 u32 timeout = rcd->rcvavail_timeout;
7471
7472 /*
7473 * This algorithm doubles or halves the timeout depending on whether
7474 * the number of packets received in this interrupt is less than or
7475 * greater than or equal to the interrupt count.
7476 *
7477 * The calculations below do not allow a steady state to be achieved.
7478 * Only at the endpoints is it possible to have an unchanging
7479 * timeout.
7480 */
7481 if (npkts < rcv_intr_count) {
7482 /*
7483 * Not enough packets arrived before the timeout, adjust
7484 * timeout downward.
7485 */
7486 if (timeout < 2) /* already at minimum? */
7487 return;
7488 timeout >>= 1;
7489 } else {
7490 /*
7491 * More than enough packets arrived before the timeout, adjust
7492 * timeout upward.
7493 */
7494 if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
7495 return;
7496 timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
7497 }
7498
7499 rcd->rcvavail_timeout = timeout;
7500 /* the timeout cannot be larger than rcv_intr_timeout_csr, which has
7501 already been verified to be in range */
7502 write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
7503 (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
7504}
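/*
 * Example of the doubling/halving above, with illustrative values of
 * rcv_intr_count = 16 and rcv_intr_timeout_csr = 840: starting from a
 * timeout of 840, interrupts that see fewer than 16 packets halve it
 * (840 -> 420 -> 210 -> ...) down to a floor of 1, while interrupts
 * that see 16 or more packets double it back up, capped at 840. There
 * is no steady state in between; the timeout only stops changing at
 * the two endpoints.
 */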
7505
7506void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
7507 u32 intr_adjust, u32 npkts)
7508{
7509 struct hfi1_devdata *dd = rcd->dd;
7510 u64 reg;
7511 u32 ctxt = rcd->ctxt;
7512
7513 /*
7514 * Need to write timeout register before updating RcvHdrHead to ensure
7515 * that a new value is used when the HW decides to restart counting.
7516 */
7517 if (intr_adjust)
7518 adjust_rcv_timeout(rcd, npkts);
7519 if (updegr) {
7520 reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
7521 << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
7522 write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
7523 }
7524 mmiowb();
7525 reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
7526 (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
7527 << RCV_HDR_HEAD_HEAD_SHIFT);
7528 write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
7529 mmiowb();
7530}
7531
7532u32 hdrqempty(struct hfi1_ctxtdata *rcd)
7533{
7534 u32 head, tail;
7535
7536 head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
7537 & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
7538
7539 if (rcd->rcvhdrtail_kvaddr)
7540 tail = get_rcvhdrtail(rcd);
7541 else
7542 tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
7543
7544 return head == tail;
7545}
7546
7547/*
7548 * Context Control and Receive Array encoding for buffer size:
7549 * 0x0 invalid
7550 * 0x1 4 KB
7551 * 0x2 8 KB
7552 * 0x3 16 KB
7553 * 0x4 32 KB
7554 * 0x5 64 KB
7555 * 0x6 128 KB
7556 * 0x7 256 KB
7557 * 0x8 512 KB (Receive Array only)
7558 * 0x9 1 MB (Receive Array only)
7559 * 0xa 2 MB (Receive Array only)
7560 *
7561 * 0xB-0xF - reserved (Receive Array only)
7562 *
7563 *
7564 * This routine assumes that the value has already been sanity checked.
7565 */
7566static u32 encoded_size(u32 size)
7567{
7568 switch (size) {
7569 case 4*1024: return 0x1;
7570 case 8*1024: return 0x2;
7571 case 16*1024: return 0x3;
7572 case 32*1024: return 0x4;
7573 case 64*1024: return 0x5;
7574 case 128*1024: return 0x6;
7575 case 256*1024: return 0x7;
7576 case 512*1024: return 0x8;
7577 case 1*1024*1024: return 0x9;
7578 case 2*1024*1024: return 0xa;
7579 }
7580 return 0x1; /* if invalid, go with the minimum size */
7581}
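/*
 * The encoding above follows a simple pattern: for sizes from 4 KB up
 * to 2 MB, the encoding is log2(size / 4 KB) + 1, e.g. 64 KB ->
 * log2(16) + 1 = 0x5; any value not in the table falls back to the
 * 4 KB encoding.
 */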
7582
7583void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
7584{
7585 struct hfi1_ctxtdata *rcd;
7586 u64 rcvctrl, reg;
7587 int did_enable = 0;
7588
7589 rcd = dd->rcd[ctxt];
7590 if (!rcd)
7591 return;
7592
7593 hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
7594
7595 rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
7596 /* if the context is already enabled, don't do the extra steps */
7597 if ((op & HFI1_RCVCTRL_CTXT_ENB)
7598 && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
7599 /* reset the tail and hdr addresses, and sequence count */
7600 write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
7601 rcd->rcvhdrq_phys);
7602 if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
7603 write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
7604 rcd->rcvhdrqtailaddr_phys);
7605 rcd->seq_cnt = 1;
7606
7607 /* reset the cached receive header queue head value */
7608 rcd->head = 0;
7609
7610 /*
7611 * Zero the receive header queue so we don't get false
7612 * positives when checking the sequence number. The
7613 * sequence numbers could land exactly on the same spot.
7614 * E.g., an rcd restart before the receive header queue wrapped.
7615 */
7616 memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
7617
7618 /* starting timeout */
7619 rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
7620
7621 /* enable the context */
7622 rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
7623
7624 /* clean the egr buffer size first */
7625 rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
7626 rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
7627 & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
7628 << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
7629
7630 /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
7631 write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
7632 did_enable = 1;
7633
7634 /* zero RcvEgrIndexHead */
7635 write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
7636
7637 /* set eager count and base index */
7638 reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
7639 & RCV_EGR_CTRL_EGR_CNT_MASK)
7640 << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
7641 (((rcd->eager_base >> RCV_SHIFT)
7642 & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
7643 << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
7644 write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
7645
7646 /*
7647 * Set TID (expected) count and base index.
7648 * rcd->expected_count is set to individual RcvArray entries,
7649 * not pairs, and the CSR takes a pair-count in groups of
7650 * four, so divide by 8.
7651 */
7652 reg = (((rcd->expected_count >> RCV_SHIFT)
7653 & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
7654 << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
7655 (((rcd->expected_base >> RCV_SHIFT)
7656 & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
7657 << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
7658 write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
7659 if (ctxt == VL15CTXT)
7660 write_csr(dd, RCV_VL15, VL15CTXT);
7661 }
7662 if (op & HFI1_RCVCTRL_CTXT_DIS) {
7663 write_csr(dd, RCV_VL15, 0);
7664 rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
7665 }
7666 if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
7667 rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
7668 if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
7669 rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
7670 if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
7671 rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
7672 if (op & HFI1_RCVCTRL_TAILUPD_DIS)
7673 rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
7674 if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
7675 rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
7676 if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
7677 rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
7678 if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
7679 /* In one-packet-per-eager mode, the size comes from
7680 the RcvArray entry. */
7681 rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
7682 rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
7683 }
7684 if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
7685 rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
7686 if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
7687 rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
7688 if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
7689 rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
7690 if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
7691 rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
7692 if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
7693 rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
7694 rcd->rcvctrl = rcvctrl;
7695 hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
7696 write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
7697
7698 /* work around sticky RcvCtxtStatus.BlockedRHQFull */
7699 if (did_enable
7700 && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
7701 reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
7702 if (reg != 0) {
7703 dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
7704 ctxt, reg);
7705 read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
7706 write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
7707 write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
7708 read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
7709 reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
7710 dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
7711 ctxt, reg, reg == 0 ? "not" : "still");
7712 }
7713 }
7714
7715 if (did_enable) {
7716 /*
7717 * The interrupt timeout and count must be set after
7718 * the context is enabled to take effect.
7719 */
7720 /* set interrupt timeout */
7721 write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
7722 (u64)rcd->rcvavail_timeout <<
7723 RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
7724
7725 /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
7726 reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
7727 write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
7728 }
7729
7730 if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
7731 /*
7732 * If the context has been disabled and the Tail Update has
7733 * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so
7734 * it doesn't contain an address that is invalid.
7735 */
7736 write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0);
7737}
7738
7739u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
7740 u64 **cntrp)
7741{
7742 int ret;
7743 u64 val = 0;
7744
7745 if (namep) {
7746 ret = dd->cntrnameslen;
7747 if (pos != 0) {
7748 dd_dev_err(dd, "read_cntrs does not support indexing");
7749 return 0;
7750 }
7751 *namep = dd->cntrnames;
7752 } else {
7753 const struct cntr_entry *entry;
7754 int i, j;
7755
7756 ret = (dd->ndevcntrs) * sizeof(u64);
7757 if (pos != 0) {
7758 dd_dev_err(dd, "read_cntrs does not support indexing");
7759 return 0;
7760 }
7761
7762 /* Get the start of the block of counters */
7763 *cntrp = dd->cntrs;
7764
7765 /*
7766 * Now go and fill in each counter in the block.
7767 */
7768 for (i = 0; i < DEV_CNTR_LAST; i++) {
7769 entry = &dev_cntrs[i];
7770 hfi1_cdbg(CNTR, "reading %s", entry->name);
7771 if (entry->flags & CNTR_DISABLED) {
7772 /* Nothing */
7773 hfi1_cdbg(CNTR, "\tDisabled\n");
7774 } else {
7775 if (entry->flags & CNTR_VL) {
7776 hfi1_cdbg(CNTR, "\tPer VL\n");
7777 for (j = 0; j < C_VL_COUNT; j++) {
7778 val = entry->rw_cntr(entry,
7779 dd, j,
7780 CNTR_MODE_R,
7781 0);
7782 hfi1_cdbg(
7783 CNTR,
7784 "\t\tRead 0x%llx for %d\n",
7785 val, j);
7786 dd->cntrs[entry->offset + j] =
7787 val;
7788 }
7789 } else {
7790 val = entry->rw_cntr(entry, dd,
7791 CNTR_INVALID_VL,
7792 CNTR_MODE_R, 0);
7793 dd->cntrs[entry->offset] = val;
7794 hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
7795 }
7796 }
7797 }
7798 }
7799 return ret;
7800}
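/*
 * A typical caller (e.g. a debugfs/sysfs read handler) uses the function
 * above in two steps, first for the newline-separated name strings and
 * then for the values; sketch only:
 *
 *	char *names;
 *	u64 *vals;
 *	u32 name_bytes = hfi1_read_cntrs(dd, 0, &names, NULL);
 *	u32 val_bytes  = hfi1_read_cntrs(dd, 0, NULL, &vals);
 *
 * vals then holds one u64 per counter, indexed by each entry's offset
 * (per-VL counters occupy consecutive slots).
 */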
7801
7802/*
7803 * Used by sysfs to create files for hfi stats to read
7804 */
7805u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
7806 char **namep, u64 **cntrp)
7807{
7808 int ret;
7809 u64 val = 0;
7810
7811 if (namep) {
7812 ret = dd->portcntrnameslen;
7813 if (pos != 0) {
7814 dd_dev_err(dd, "index not supported");
7815 return 0;
7816 }
7817 *namep = dd->portcntrnames;
7818 } else {
7819 const struct cntr_entry *entry;
7820 struct hfi1_pportdata *ppd;
7821 int i, j;
7822
7823 ret = (dd->nportcntrs) * sizeof(u64);
7824 if (pos != 0) {
7825 dd_dev_err(dd, "indexing not supported");
7826 return 0;
7827 }
7828 ppd = (struct hfi1_pportdata *)(dd + 1 + port);
7829 *cntrp = ppd->cntrs;
7830
7831 for (i = 0; i < PORT_CNTR_LAST; i++) {
7832 entry = &port_cntrs[i];
7833 hfi1_cdbg(CNTR, "reading %s", entry->name);
7834 if (entry->flags & CNTR_DISABLED) {
7835 /* Nothing */
7836 hfi1_cdbg(CNTR, "\tDisabled\n");
7837 continue;
7838 }
7839
7840 if (entry->flags & CNTR_VL) {
7841 hfi1_cdbg(CNTR, "\tPer VL");
7842 for (j = 0; j < C_VL_COUNT; j++) {
7843 val = entry->rw_cntr(entry, ppd, j,
7844 CNTR_MODE_R,
7845 0);
7846 hfi1_cdbg(
7847 CNTR,
7848 "\t\tRead 0x%llx for %d",
7849 val, j);
7850 ppd->cntrs[entry->offset + j] = val;
7851 }
7852 } else {
7853 val = entry->rw_cntr(entry, ppd,
7854 CNTR_INVALID_VL,
7855 CNTR_MODE_R,
7856 0);
7857 ppd->cntrs[entry->offset] = val;
7858 hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
7859 }
7860 }
7861 }
7862 return ret;
7863}
7864
7865static void free_cntrs(struct hfi1_devdata *dd)
7866{
7867 struct hfi1_pportdata *ppd;
7868 int i;
7869
7870 if (dd->synth_stats_timer.data)
7871 del_timer_sync(&dd->synth_stats_timer);
7872 dd->synth_stats_timer.data = 0;
7873 ppd = (struct hfi1_pportdata *)(dd + 1);
7874 for (i = 0; i < dd->num_pports; i++, ppd++) {
7875 kfree(ppd->cntrs);
7876 kfree(ppd->scntrs);
7877 free_percpu(ppd->ibport_data.rc_acks);
7878 free_percpu(ppd->ibport_data.rc_qacks);
7879 free_percpu(ppd->ibport_data.rc_delayed_comp);
7880 ppd->cntrs = NULL;
7881 ppd->scntrs = NULL;
7882 ppd->ibport_data.rc_acks = NULL;
7883 ppd->ibport_data.rc_qacks = NULL;
7884 ppd->ibport_data.rc_delayed_comp = NULL;
7885 }
7886 kfree(dd->portcntrnames);
7887 dd->portcntrnames = NULL;
7888 kfree(dd->cntrs);
7889 dd->cntrs = NULL;
7890 kfree(dd->scntrs);
7891 dd->scntrs = NULL;
7892 kfree(dd->cntrnames);
7893 dd->cntrnames = NULL;
7894}
7895
7896#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
7897#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
7898
7899static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
7900 u64 *psval, void *context, int vl)
7901{
7902 u64 val;
7903 u64 sval = *psval;
7904
7905 if (entry->flags & CNTR_DISABLED) {
7906 dd_dev_err(dd, "Counter %s not enabled", entry->name);
7907 return 0;
7908 }
7909
7910 hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
7911
7912 val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
7913
7914 /* If it's a synthetic counter there is more work we need to do */
7915 if (entry->flags & CNTR_SYNTH) {
7916 if (sval == CNTR_MAX) {
7917 /* No need to read already saturated */
7918 return CNTR_MAX;
7919 }
7920
7921 if (entry->flags & CNTR_32BIT) {
7922 /* 32bit counters can wrap multiple times */
7923 u64 upper = sval >> 32;
7924 u64 lower = (sval << 32) >> 32;
7925
7926 if (lower > val) { /* hw wrapped */
7927 if (upper == CNTR_32BIT_MAX)
7928 val = CNTR_MAX;
7929 else
7930 upper++;
7931 }
7932
7933 if (val != CNTR_MAX)
7934 val = (upper << 32) | val;
7935
7936 } else {
7937 /* If we rolled over, we are saturated */
7938 if ((val < sval) || (val > CNTR_MAX))
7939 val = CNTR_MAX;
7940 }
7941 }
7942
7943 *psval = val;
7944
7945 hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
7946
7947 return val;
7948}
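/*
 * Worked example of the 32-bit handling above: suppose the saved
 * software value is sval = 0x1FFFFFFF0 (upper 32 bits = 0x1, lower
 * 32 bits = 0xFFFFFFF0) and the hardware now reads val = 0x10. Since
 * lower > val the hardware must have wrapped, so upper becomes 0x2 and
 * the returned value is (0x2 << 32) | 0x10 = 0x200000010. Once a
 * synthetic counter reaches CNTR_MAX it sticks there (saturates) rather
 * than wrapping.
 */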
7949
7950static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
7951 struct cntr_entry *entry,
7952 u64 *psval, void *context, int vl, u64 data)
7953{
7954 u64 val;
7955
7956 if (entry->flags & CNTR_DISABLED) {
7957 dd_dev_err(dd, "Counter %s not enabled", entry->name);
7958 return 0;
7959 }
7960
7961 hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
7962
7963 if (entry->flags & CNTR_SYNTH) {
7964 *psval = data;
7965 if (entry->flags & CNTR_32BIT) {
7966 val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
7967 (data << 32) >> 32);
7968 val = data; /* return the full 64bit value */
7969 } else {
7970 val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
7971 data);
7972 }
7973 } else {
7974 val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
7975 }
7976
7977 *psval = val;
7978
7979 hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
7980
7981 return val;
7982}
7983
7984u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
7985{
7986 struct cntr_entry *entry;
7987 u64 *sval;
7988
7989 entry = &dev_cntrs[index];
7990 sval = dd->scntrs + entry->offset;
7991
7992 if (vl != CNTR_INVALID_VL)
7993 sval += vl;
7994
7995 return read_dev_port_cntr(dd, entry, sval, dd, vl);
7996}
7997
7998u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
7999{
8000 struct cntr_entry *entry;
8001 u64 *sval;
8002
8003 entry = &dev_cntrs[index];
8004 sval = dd->scntrs + entry->offset;
8005
8006 if (vl != CNTR_INVALID_VL)
8007 sval += vl;
8008
8009 return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
8010}
8011
8012u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
8013{
8014 struct cntr_entry *entry;
8015 u64 *sval;
8016
8017 entry = &port_cntrs[index];
8018 sval = ppd->scntrs + entry->offset;
8019
8020 if (vl != CNTR_INVALID_VL)
8021 sval += vl;
8022
8023 if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
8024 (index <= C_RCV_HDR_OVF_LAST)) {
8025 /* We do not want to bother for disabled contexts */
8026 return 0;
8027 }
8028
8029 return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
8030}
8031
8032u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
8033{
8034 struct cntr_entry *entry;
8035 u64 *sval;
8036
8037 entry = &port_cntrs[index];
8038 sval = ppd->scntrs + entry->offset;
8039
8040 if (vl != CNTR_INVALID_VL)
8041 sval += vl;
8042
8043 if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
8044 (index <= C_RCV_HDR_OVF_LAST)) {
8045 /* We do not want to bother for disabled contexts */
8046 return 0;
8047 }
8048
8049 return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
8050}
8051
8052static void update_synth_timer(unsigned long opaque)
8053{
8054 u64 cur_tx;
8055 u64 cur_rx;
8056 u64 total_flits;
8057 u8 update = 0;
8058 int i, j, vl;
8059 struct hfi1_pportdata *ppd;
8060 struct cntr_entry *entry;
8061
8062 struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
8063
8064 /*
8065 * Rather than keep beating on the CSRs, pick a minimal set that we can
8066 * check to watch for a potential rollover. We do this by looking at
8067 * the number of flits sent/received. If the total flits exceed 32 bits,
8068 * then we have to iterate over all the counters and update them.
8069 */
8070 entry = &dev_cntrs[C_DC_RCV_FLITS];
8071 cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
8072
8073 entry = &dev_cntrs[C_DC_XMIT_FLITS];
8074 cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
8075
8076 hfi1_cdbg(
8077 CNTR,
8078 "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
8079 dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
8080
8081 if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
8082 /*
8083 * Updating may not be strictly necessary, but it won't hurt and it
8084 * simplifies the logic here.
8085 */
8086 update = 1;
8087 hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
8088 dd->unit);
8089 } else {
8090 total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
8091 hfi1_cdbg(CNTR,
8092 "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
8093 total_flits, (u64)CNTR_32BIT_MAX);
8094 if (total_flits >= CNTR_32BIT_MAX) {
8095 hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
8096 dd->unit);
8097 update = 1;
8098 }
8099 }
8100
8101 if (update) {
8102 hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
8103 for (i = 0; i < DEV_CNTR_LAST; i++) {
8104 entry = &dev_cntrs[i];
8105 if (entry->flags & CNTR_VL) {
8106 for (vl = 0; vl < C_VL_COUNT; vl++)
8107 read_dev_cntr(dd, i, vl);
8108 } else {
8109 read_dev_cntr(dd, i, CNTR_INVALID_VL);
8110 }
8111 }
8112 ppd = (struct hfi1_pportdata *)(dd + 1);
8113 for (i = 0; i < dd->num_pports; i++, ppd++) {
8114 for (j = 0; j < PORT_CNTR_LAST; j++) {
8115 entry = &port_cntrs[j];
8116 if (entry->flags & CNTR_VL) {
8117 for (vl = 0; vl < C_VL_COUNT; vl++)
8118 read_port_cntr(ppd, j, vl);
8119 } else {
8120 read_port_cntr(ppd, j, CNTR_INVALID_VL);
8121 }
8122 }
8123 }
8124
8125 /*
8126 * We want the value in the register. The goal is to keep track
8127 * of the number of "ticks", not the counter value. In other
8128 * words, if the register rolls over we want to notice it and go ahead
8129 * and force an update.
8130 */
8131 entry = &dev_cntrs[C_DC_XMIT_FLITS];
8132 dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
8133 CNTR_MODE_R, 0);
8134
8135 entry = &dev_cntrs[C_DC_RCV_FLITS];
8136 dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
8137 CNTR_MODE_R, 0);
8138
8139 hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
8140 dd->unit, dd->last_tx, dd->last_rx);
8141
8142 } else {
8143 hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
8144 }
8145
8146	mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
8147}
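/*
 * The two flit counters above act as a "tripwire": if fewer than
 * CNTR_32BIT_MAX (2^32 - 1) flits have moved since the last pass, the
 * code assumes no other 32-bit counter has advanced far enough to wrap,
 * and the full per-counter update is skipped. The timer re-arms itself
 * every SYNTH_CNT_TIME seconds (HZ * SYNTH_CNT_TIME jiffies).
 */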
8148
8149 #define C_MAX_NAME 13 /* 12 chars + one for \0 */
8150static int init_cntrs(struct hfi1_devdata *dd)
8151{
8152 int i, rcv_ctxts, index, j;
8153 size_t sz;
8154 char *p;
8155 char name[C_MAX_NAME];
8156 struct hfi1_pportdata *ppd;
8157
8158 /* set up the stats timer; the add_timer is done at the end */
8159 init_timer(&dd->synth_stats_timer);
8160 dd->synth_stats_timer.function = update_synth_timer;
8161 dd->synth_stats_timer.data = (unsigned long) dd;
8162
8163 /***********************/
8164 /* per device counters */
8165 /***********************/
8166
8167 /* size names and determine how many we have */
8168 dd->ndevcntrs = 0;
8169 sz = 0;
8170 index = 0;
8171
8172 for (i = 0; i < DEV_CNTR_LAST; i++) {
8173 hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
8174 if (dev_cntrs[i].flags & CNTR_DISABLED) {
8175 hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
8176 continue;
8177 }
8178
8179 if (dev_cntrs[i].flags & CNTR_VL) {
8180 hfi1_dbg_early("\tProcessing VL cntr\n");
8181 dev_cntrs[i].offset = index;
8182 for (j = 0; j < C_VL_COUNT; j++) {
8183 memset(name, '\0', C_MAX_NAME);
8184 snprintf(name, C_MAX_NAME, "%s%d",
8185 dev_cntrs[i].name,
8186 vl_from_idx(j));
8187 sz += strlen(name);
8188 sz++;
8189 hfi1_dbg_early("\t\t%s\n", name);
8190 dd->ndevcntrs++;
8191 index++;
8192 }
8193 } else {
8194 /* +1 for newline */
8195 sz += strlen(dev_cntrs[i].name) + 1;
8196 dd->ndevcntrs++;
8197 dev_cntrs[i].offset = index;
8198 index++;
8199 hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
8200 }
8201 }
8202
8203 /* allocate space for the counter values */
8204 dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
8205 if (!dd->cntrs)
8206 goto bail;
8207
8208 dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
8209 if (!dd->scntrs)
8210 goto bail;
8211
8212
8213 /* allocate space for the counter names */
8214 dd->cntrnameslen = sz;
8215 dd->cntrnames = kmalloc(sz, GFP_KERNEL);
8216 if (!dd->cntrnames)
8217 goto bail;
8218
8219 /* fill in the names */
8220 for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
8221 if (dev_cntrs[i].flags & CNTR_DISABLED) {
8222 /* Nothing */
8223 } else {
8224 if (dev_cntrs[i].flags & CNTR_VL) {
8225 for (j = 0; j < C_VL_COUNT; j++) {
8226 memset(name, '\0', C_MAX_NAME);
8227 snprintf(name, C_MAX_NAME, "%s%d",
8228 dev_cntrs[i].name,
8229 vl_from_idx(j));
8230 memcpy(p, name, strlen(name));
8231 p += strlen(name);
8232 *p++ = '\n';
8233 }
8234 } else {
8235 memcpy(p, dev_cntrs[i].name,
8236 strlen(dev_cntrs[i].name));
8237 p += strlen(dev_cntrs[i].name);
8238 *p++ = '\n';
8239 }
8240 index++;
8241 }
8242 }
8243
8244 /*********************/
8245 /* per port counters */
8246 /*********************/
8247
8248 /*
8249 * Go through the counters for the overflows and disable the ones we
8250 * don't need. This varies based on platform so we need to do it
8251 * dynamically here.
8252 */
8253 rcv_ctxts = dd->num_rcv_contexts;
8254 for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
8255 i <= C_RCV_HDR_OVF_LAST; i++) {
8256 port_cntrs[i].flags |= CNTR_DISABLED;
8257 }
8258
8259 /* size port counter names and determine how many we have */
8260 sz = 0;
8261 dd->nportcntrs = 0;
8262 for (i = 0; i < PORT_CNTR_LAST; i++) {
8263 hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
8264 if (port_cntrs[i].flags & CNTR_DISABLED) {
8265 hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
8266 continue;
8267 }
8268
8269 if (port_cntrs[i].flags & CNTR_VL) {
8270 hfi1_dbg_early("\tProcessing VL cntr\n");
8271 port_cntrs[i].offset = dd->nportcntrs;
8272 for (j = 0; j < C_VL_COUNT; j++) {
8273 memset(name, '\0', C_MAX_NAME);
8274 snprintf(name, C_MAX_NAME, "%s%d",
8275 port_cntrs[i].name,
8276 vl_from_idx(j));
8277 sz += strlen(name);
8278 sz++;
8279 hfi1_dbg_early("\t\t%s\n", name);
8280 dd->nportcntrs++;
8281 }
8282 } else {
8283 /* +1 for newline */
8284 sz += strlen(port_cntrs[i].name) + 1;
8285 port_cntrs[i].offset = dd->nportcntrs;
8286 dd->nportcntrs++;
8287 hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
8288 }
8289 }
8290
8291 /* allocate space for the counter names */
8292 dd->portcntrnameslen = sz;
8293 dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
8294 if (!dd->portcntrnames)
8295 goto bail;
8296
8297 /* fill in port cntr names */
8298 for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
8299 if (port_cntrs[i].flags & CNTR_DISABLED)
8300 continue;
8301
8302 if (port_cntrs[i].flags & CNTR_VL) {
8303 for (j = 0; j < C_VL_COUNT; j++) {
8304 memset(name, '\0', C_MAX_NAME);
8305 snprintf(name, C_MAX_NAME, "%s%d",
8306 port_cntrs[i].name,
8307 vl_from_idx(j));
8308 memcpy(p, name, strlen(name));
8309 p += strlen(name);
8310 *p++ = '\n';
8311 }
8312 } else {
8313 memcpy(p, port_cntrs[i].name,
8314 strlen(port_cntrs[i].name));
8315 p += strlen(port_cntrs[i].name);
8316 *p++ = '\n';
8317 }
8318 }
8319
8320 /* allocate per port storage for counter values */
8321 ppd = (struct hfi1_pportdata *)(dd + 1);
8322 for (i = 0; i < dd->num_pports; i++, ppd++) {
8323 ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
8324 if (!ppd->cntrs)
8325 goto bail;
8326
8327 ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
8328 if (!ppd->scntrs)
8329 goto bail;
8330 }
8331
8332 /* CPU counters need to be allocated and zeroed */
8333 if (init_cpu_counters(dd))
8334 goto bail;
8335
8336 mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
8337 return 0;
8338bail:
8339 free_cntrs(dd);
8340 return -ENOMEM;
8341}
8342
8343
8344static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
8345{
8346 switch (chip_lstate) {
8347 default:
8348 dd_dev_err(dd,
8349 "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
8350 chip_lstate);
8351 /* fall through */
8352 case LSTATE_DOWN:
8353 return IB_PORT_DOWN;
8354 case LSTATE_INIT:
8355 return IB_PORT_INIT;
8356 case LSTATE_ARMED:
8357 return IB_PORT_ARMED;
8358 case LSTATE_ACTIVE:
8359 return IB_PORT_ACTIVE;
8360 }
8361}
8362
8363u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
8364{
8365 /* look at the HFI meta-states only */
8366 switch (chip_pstate & 0xf0) {
8367 default:
8368 dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
8369 chip_pstate);
8370 /* fall through */
8371 case PLS_DISABLED:
8372 return IB_PORTPHYSSTATE_DISABLED;
8373 case PLS_OFFLINE:
8374 return OPA_PORTPHYSSTATE_OFFLINE;
8375 case PLS_POLLING:
8376 return IB_PORTPHYSSTATE_POLLING;
8377 case PLS_CONFIGPHY:
8378 return IB_PORTPHYSSTATE_TRAINING;
8379 case PLS_LINKUP:
8380 return IB_PORTPHYSSTATE_LINKUP;
8381 case PLS_PHYTEST:
8382 return IB_PORTPHYSSTATE_PHY_TEST;
8383 }
8384}
8385
8386/* return the OPA port logical state name */
8387const char *opa_lstate_name(u32 lstate)
8388{
8389 static const char * const port_logical_names[] = {
8390 "PORT_NOP",
8391 "PORT_DOWN",
8392 "PORT_INIT",
8393 "PORT_ARMED",
8394 "PORT_ACTIVE",
8395 "PORT_ACTIVE_DEFER",
8396 };
8397 if (lstate < ARRAY_SIZE(port_logical_names))
8398 return port_logical_names[lstate];
8399 return "unknown";
8400}
8401
8402/* return the OPA port physical state name */
8403const char *opa_pstate_name(u32 pstate)
8404{
8405 static const char * const port_physical_names[] = {
8406 "PHYS_NOP",
8407 "reserved1",
8408 "PHYS_POLL",
8409 "PHYS_DISABLED",
8410 "PHYS_TRAINING",
8411 "PHYS_LINKUP",
8412 "PHYS_LINK_ERR_RECOVER",
8413 "PHYS_PHY_TEST",
8414 "reserved8",
8415 "PHYS_OFFLINE",
8416 "PHYS_GANGED",
8417 "PHYS_TEST",
8418 };
8419 if (pstate < ARRAY_SIZE(port_physical_names))
8420 return port_physical_names[pstate];
8421 return "unknown";
8422}
8423
8424/*
8425 * Read the hardware link state and set the driver's cached value of it.
8426 * Return the (new) current value.
8427 */
8428u32 get_logical_state(struct hfi1_pportdata *ppd)
8429{
8430 u32 new_state;
8431
8432 new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
8433 if (new_state != ppd->lstate) {
8434 dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
8435 opa_lstate_name(new_state), new_state);
8436 ppd->lstate = new_state;
8437 }
8438 /*
8439 * Set port status flags in the page mapped into userspace
8440 * memory. Do it here to ensure a reliable state - this is
8441 * the only function called by all state handling code.
8442 * Always set the flags because the cached value
8443 * might have been changed explicitly outside of this
8444 * function.
8445 */
8446 if (ppd->statusp) {
8447 switch (ppd->lstate) {
8448 case IB_PORT_DOWN:
8449 case IB_PORT_INIT:
8450 *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
8451 HFI1_STATUS_IB_READY);
8452 break;
8453 case IB_PORT_ARMED:
8454 *ppd->statusp |= HFI1_STATUS_IB_CONF;
8455 break;
8456 case IB_PORT_ACTIVE:
8457 *ppd->statusp |= HFI1_STATUS_IB_READY;
8458 break;
8459 }
8460 }
8461 return ppd->lstate;
8462}
8463
8464/**
8465 * wait_logical_linkstate - wait for an IB link state change to occur
8466 * @ppd: port device
8467 * @state: the state to wait for
8468 * @msecs: the number of milliseconds to wait
8469 *
8470 * Wait up to msecs milliseconds for IB link state change to occur.
8471 * For now, take the easy polling route.
8472 * Returns 0 if state reached, otherwise -ETIMEDOUT.
8473 */
8474static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
8475 int msecs)
8476{
8477 unsigned long timeout;
8478
8479 timeout = jiffies + msecs_to_jiffies(msecs);
8480 while (1) {
8481 if (get_logical_state(ppd) == state)
8482 return 0;
8483 if (time_after(jiffies, timeout))
8484 break;
8485 msleep(20);
8486 }
8487 dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
8488
8489 return -ETIMEDOUT;
8490}
8491
8492u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
8493{
8494 static u32 remembered_state = 0xff;
8495 u32 pstate;
8496 u32 ib_pstate;
8497
8498 pstate = read_physical_state(ppd->dd);
8499 ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
8500 if (remembered_state != ib_pstate) {
8501 dd_dev_info(ppd->dd,
8502 "%s: physical state changed to %s (0x%x), phy 0x%x\n",
8503 __func__, opa_pstate_name(ib_pstate), ib_pstate,
8504 pstate);
8505 remembered_state = ib_pstate;
8506 }
8507 return ib_pstate;
8508}
8509
8510/*
8511 * Read/modify/write ASIC_QSFP register bits as selected by mask
8512 * data: 0 or 1 in the positions depending on what needs to be written
8513 * dir: 0 for read, 1 for write
8514 * mask: select by setting
8515 * I2CCLK (bit 0)
8516 * I2CDATA (bit 1)
8517 */
8518u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
8519 u32 mask)
8520{
8521 u64 qsfp_oe, target_oe;
8522
8523 target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
8524 if (mask) {
8525 /* We are writing register bits, so lock access */
8526 dir &= mask;
8527 data &= mask;
8528
8529 qsfp_oe = read_csr(dd, target_oe);
8530 qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
8531 write_csr(dd, target_oe, qsfp_oe);
8532 }
8533 /* We are exclusively reading bits here, but it is unlikely
8534 * we'll get valid data when we set the direction of the pin
8535 * in the same call, so the reader should call this function again
8536 * to get valid data
8537 */
8538 return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
8539}
8540
8541#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
8542(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
8543
8544#define SET_STATIC_RATE_CONTROL_SMASK(r) \
8545(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
8546
8547int hfi1_init_ctxt(struct send_context *sc)
8548{
8549 if (sc != NULL) {
8550 struct hfi1_devdata *dd = sc->dd;
8551 u64 reg;
8552 u8 set = (sc->type == SC_USER ?
8553 HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
8554 HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
8555 reg = read_kctxt_csr(dd, sc->hw_context,
8556 SEND_CTXT_CHECK_ENABLE);
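		/*
		 * The CSR bit is a "disallow" bit: if static rate control
		 * is enabled for this context type, clear the
		 * DISALLOW_PBC_STATIC_RATE_CONTROL bit; otherwise set it.
		 */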
8557 if (set)
8558 CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
8559 else
8560 SET_STATIC_RATE_CONTROL_SMASK(reg);
8561 write_kctxt_csr(dd, sc->hw_context,
8562 SEND_CTXT_CHECK_ENABLE, reg);
8563 }
8564 return 0;
8565}
8566
8567int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
8568{
8569 int ret = 0;
8570 u64 reg;
8571
8572 if (dd->icode != ICODE_RTL_SILICON) {
8573 if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
8574 dd_dev_info(dd, "%s: tempsense not supported by HW\n",
8575 __func__);
8576 return -EINVAL;
8577 }
8578 reg = read_csr(dd, ASIC_STS_THERM);
8579 temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
8580 ASIC_STS_THERM_CURR_TEMP_MASK);
8581 temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
8582 ASIC_STS_THERM_LO_TEMP_MASK);
8583 temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
8584 ASIC_STS_THERM_HI_TEMP_MASK);
8585 temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
8586 ASIC_STS_THERM_CRIT_TEMP_MASK);
8587 /* triggers is a 3-bit value - 1 bit per trigger. */
8588 temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
8589
8590 return ret;
8591}
8592
8593/* ========================================================================= */
8594
8595/*
8596 * Enable/disable chip from delivering interrupts.
8597 */
8598void set_intr_state(struct hfi1_devdata *dd, u32 enable)
8599{
8600 int i;
8601
8602 /*
8603 * In HFI, the mask needs to be 1 to allow interrupts.
8604 */
8605 if (enable) {
8606 u64 cce_int_mask;
8607 const int qsfp1_int_smask = QSFP1_INT % 64;
8608 const int qsfp2_int_smask = QSFP2_INT % 64;
8609
8610 /* enable all interrupts */
8611 for (i = 0; i < CCE_NUM_INT_CSRS; i++)
8612 write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
8613
8614 /*
8615 * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
8616 * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
8617 * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
8618 * the index of the appropriate CSR in the CCEIntMask CSR array
8619 */
8620 cce_int_mask = read_csr(dd, CCE_INT_MASK +
8621 (8*(QSFP1_INT/64)));
8622 if (dd->hfi1_id) {
8623 cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
8624 write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
8625 cce_int_mask);
8626 } else {
8627 cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
8628 write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
8629 cce_int_mask);
8630 }
8631 } else {
8632 for (i = 0; i < CCE_NUM_INT_CSRS; i++)
8633 write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
8634 }
8635}
8636
8637/*
8638 * Clear all interrupt sources on the chip.
8639 */
8640static void clear_all_interrupts(struct hfi1_devdata *dd)
8641{
8642 int i;
8643
8644 for (i = 0; i < CCE_NUM_INT_CSRS; i++)
8645 write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
8646
8647 write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
8648 write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
8649 write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
8650 write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
8651 write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
8652 write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
8653 write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
8654 for (i = 0; i < dd->chip_send_contexts; i++)
8655 write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
8656 for (i = 0; i < dd->chip_sdma_engines; i++)
8657 write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
8658
8659 write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
8660 write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
8661 write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
8662}
8663
8664/* Move to pcie.c? */
8665static void disable_intx(struct pci_dev *pdev)
8666{
8667 pci_intx(pdev, 0);
8668}
8669
8670static void clean_up_interrupts(struct hfi1_devdata *dd)
8671{
8672 int i;
8673
8674 /* remove irqs - must happen before disabling/turning off */
8675 if (dd->num_msix_entries) {
8676 /* MSI-X */
8677 struct hfi1_msix_entry *me = dd->msix_entries;
8678
8679 for (i = 0; i < dd->num_msix_entries; i++, me++) {
8680 if (me->arg == NULL) /* => no irq, no affinity */
8681 break;
8682 irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
8683 NULL);
8684 free_irq(me->msix.vector, me->arg);
8685 }
8686 } else {
8687 /* INTx */
8688 if (dd->requested_intx_irq) {
8689 free_irq(dd->pcidev->irq, dd);
8690 dd->requested_intx_irq = 0;
8691 }
8692 }
8693
8694 /* turn off interrupts */
8695 if (dd->num_msix_entries) {
8696 /* MSI-X */
8697 hfi1_nomsix(dd);
8698 } else {
8699 /* INTx */
8700 disable_intx(dd->pcidev);
8701 }
8702
8703 /* clean structures */
8704 for (i = 0; i < dd->num_msix_entries; i++)
8705 free_cpumask_var(dd->msix_entries[i].mask);
8706 kfree(dd->msix_entries);
8707 dd->msix_entries = NULL;
8708 dd->num_msix_entries = 0;
8709}
8710
8711/*
8712 * Remap the interrupt source from the general handler to the given MSI-X
8713 * interrupt.
8714 */
8715static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
8716{
8717 u64 reg;
8718 int m, n;
8719
8720 /* clear from the handled mask of the general interrupt */
8721 m = isrc / 64;
8722 n = isrc % 64;
8723 dd->gi_mask[m] &= ~((u64)1 << n);
8724
8725 /* direct the chip source to the given MSI-X interrupt */
8726 m = isrc / 8;
8727 n = isrc % 8;
8728 reg = read_csr(dd, CCE_INT_MAP + (8*m));
8729 reg &= ~((u64)0xff << (8*n));
8730 reg |= ((u64)msix_intr & 0xff) << (8*n);
8731 write_csr(dd, CCE_INT_MAP + (8*m), reg);
8732}
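/*
 * Indexing example for the remapping above: chip interrupt source 75
 * lives in general-interrupt mask word 75 / 64 = 1, bit 75 % 64 = 11,
 * and in CceIntMap CSR 75 / 8 = 9, byte lane 75 % 8 = 3 (each 64-bit
 * map CSR holds eight 8-bit MSI-X vector numbers).
 */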
8733
8734static void remap_sdma_interrupts(struct hfi1_devdata *dd,
8735 int engine, int msix_intr)
8736{
8737 /*
8738 * SDMA engine interrupt sources grouped by type, rather than
8739 * engine. Per-engine interrupts are as follows:
8740 * SDMA
8741 * SDMAProgress
8742 * SDMAIdle
8743 */
8744 remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
8745 msix_intr);
8746 remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
8747 msix_intr);
8748 remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
8749 msix_intr);
8750}
8751
8752static void remap_receive_available_interrupt(struct hfi1_devdata *dd,
8753 int rx, int msix_intr)
8754{
8755 remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr);
8756}
8757
8758static int request_intx_irq(struct hfi1_devdata *dd)
8759{
8760 int ret;
8761
8762 snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d",
8763 dd->unit);
8764 ret = request_irq(dd->pcidev->irq, general_interrupt,
8765 IRQF_SHARED, dd->intx_name, dd);
8766 if (ret)
8767 dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
8768 ret);
8769 else
8770 dd->requested_intx_irq = 1;
8771 return ret;
8772}
8773
8774static int request_msix_irqs(struct hfi1_devdata *dd)
8775{
8776 const struct cpumask *local_mask;
8777 cpumask_var_t def, rcv;
8778 bool def_ret, rcv_ret;
8779 int first_general, last_general;
8780 int first_sdma, last_sdma;
8781 int first_rx, last_rx;
8782 int first_cpu, restart_cpu, curr_cpu;
8783 int rcv_cpu, sdma_cpu;
8784 int i, ret = 0, possible;
8785 int ht;
8786
8787 /* calculate the ranges we are going to use */
8788 first_general = 0;
8789 first_sdma = last_general = first_general + 1;
8790 first_rx = last_sdma = first_sdma + dd->num_sdma;
8791 last_rx = first_rx + dd->n_krcv_queues;
8792
8793 /*
8794 * Interrupt affinity.
8795 *
8796 * Non-rcv-avail interrupts get a default mask that starts as all
8797 * possible CPUs with the hyperthread siblings removed and each
8798 * rcv-avail CPU removed.
8799 *
8800 * Rcv-avail interrupts get CPUs starting at node-relative 1,
8801 * wrapping back to node-relative 1 as necessary.
8802 *
8803 */
8804 local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8805 /* if first cpu is invalid, use NUMA 0 */
8806 if (cpumask_first(local_mask) >= nr_cpu_ids)
8807 local_mask = topology_core_cpumask(0);
8808
8809 def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
8810 rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
8811 if (!def_ret || !rcv_ret)
8812 goto bail;
8813 /* use local mask as default */
8814 cpumask_copy(def, local_mask);
8815 possible = cpumask_weight(def);
8816 /* remove hyperthread siblings from the default mask */
8817 ht = cpumask_weight(
8818 topology_sibling_cpumask(cpumask_first(local_mask)));
8819 for (i = possible/ht; i < possible; i++)
8820 cpumask_clear_cpu(i, def);
8821 /* reset possible */
8822 possible = cpumask_weight(def);
8823 /* def now has full cores on the chosen node */
8824 first_cpu = cpumask_first(def);
8825 if (nr_cpu_ids >= first_cpu)
8826 first_cpu++;
8827 restart_cpu = first_cpu;
8828 curr_cpu = restart_cpu;
8829
8830 for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) {
8831 cpumask_clear_cpu(curr_cpu, def);
8832 cpumask_set_cpu(curr_cpu, rcv);
8833 if (curr_cpu >= possible)
8834 curr_cpu = restart_cpu;
8835 else
8836 curr_cpu++;
8837 }
8838 /* def mask has non-rcv, rcv has recv mask */
8839 rcv_cpu = cpumask_first(rcv);
8840 sdma_cpu = cpumask_first(def);
8841
8842 /*
8843 * Sanity check - the code expects all SDMA chip source
8844 * interrupts to be in the same CSR, starting at bit 0. Verify
8845 * that this is true by checking the bit location of the start.
8846 */
8847 BUILD_BUG_ON(IS_SDMA_START % 64);
8848
8849 for (i = 0; i < dd->num_msix_entries; i++) {
8850 struct hfi1_msix_entry *me = &dd->msix_entries[i];
8851 const char *err_info;
8852 irq_handler_t handler;
8853 void *arg;
8854 int idx;
8855 struct hfi1_ctxtdata *rcd = NULL;
8856 struct sdma_engine *sde = NULL;
8857
8858 /* obtain the arguments to request_irq */
8859 if (first_general <= i && i < last_general) {
8860 idx = i - first_general;
8861 handler = general_interrupt;
8862 arg = dd;
8863 snprintf(me->name, sizeof(me->name),
8864 DRIVER_NAME"_%d", dd->unit);
8865 err_info = "general";
8866 } else if (first_sdma <= i && i < last_sdma) {
8867 idx = i - first_sdma;
8868 sde = &dd->per_sdma[idx];
8869 handler = sdma_interrupt;
8870 arg = sde;
8871 snprintf(me->name, sizeof(me->name),
8872 DRIVER_NAME"_%d sdma%d", dd->unit, idx);
8873 err_info = "sdma";
8874 remap_sdma_interrupts(dd, idx, i);
8875 } else if (first_rx <= i && i < last_rx) {
8876 idx = i - first_rx;
8877 rcd = dd->rcd[idx];
8878 /* no interrupt if no rcd */
8879 if (!rcd)
8880 continue;
8881 /*
8882 * Set the interrupt register and mask for this
8883 * context's interrupt.
8884 */
8885 rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
8886 rcd->imask = ((u64)1) <<
8887 ((IS_RCVAVAIL_START+idx) % 64);
8888 handler = receive_context_interrupt;
8889 arg = rcd;
8890 snprintf(me->name, sizeof(me->name),
8891 DRIVER_NAME"_%d kctxt%d", dd->unit, idx);
8892 err_info = "receive context";
8893 remap_receive_available_interrupt(dd, idx, i);
8894 } else {
8895 /* not in our expected range - complain, then
8896 ignore it */
8897 dd_dev_err(dd,
8898 "Unexpected extra MSI-X interrupt %d\n", i);
8899 continue;
8900 }
8901 /* no argument, no interrupt */
8902 if (arg == NULL)
8903 continue;
8904 /* make sure the name is terminated */
8905 me->name[sizeof(me->name)-1] = 0;
8906
8907 ret = request_irq(me->msix.vector, handler, 0, me->name, arg);
8908 if (ret) {
8909 dd_dev_err(dd,
8910 "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
8911 err_info, me->msix.vector, idx, ret);
8912 return ret;
8913 }
8914 /*
8915 * assign arg after request_irq call, so it will be
8916 * cleaned up
8917 */
8918 me->arg = arg;
8919
8920 if (!zalloc_cpumask_var(
8921 &dd->msix_entries[i].mask,
8922 GFP_KERNEL))
8923 goto bail;
8924 if (handler == sdma_interrupt) {
8925 dd_dev_info(dd, "sdma engine %d cpu %d\n",
8926 sde->this_idx, sdma_cpu);
8927 cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
8928 sdma_cpu = cpumask_next(sdma_cpu, def);
8929 if (sdma_cpu >= nr_cpu_ids)
8930 sdma_cpu = cpumask_first(def);
8931 } else if (handler == receive_context_interrupt) {
8932 dd_dev_info(dd, "rcv ctxt %d cpu %d\n",
8933 rcd->ctxt, rcv_cpu);
8934 cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask);
8935 rcv_cpu = cpumask_next(rcv_cpu, rcv);
8936 if (rcv_cpu >= nr_cpu_ids)
8937 rcv_cpu = cpumask_first(rcv);
8938 } else {
8939 /* otherwise first def */
8940 dd_dev_info(dd, "%s cpu %d\n",
8941 err_info, cpumask_first(def));
8942 cpumask_set_cpu(
8943 cpumask_first(def), dd->msix_entries[i].mask);
8944 }
8945 irq_set_affinity_hint(
8946 dd->msix_entries[i].msix.vector,
8947 dd->msix_entries[i].mask);
8948 }
8949
8950out:
8951 free_cpumask_var(def);
8952 free_cpumask_var(rcv);
8953 return ret;
8954bail:
8955 ret = -ENOMEM;
8956 goto out;
8957}
8958
8959/*
8960 * Set the general handler to accept all interrupts, remap all
8961 * chip interrupts back to MSI-X 0.
8962 */
8963static void reset_interrupts(struct hfi1_devdata *dd)
8964{
8965 int i;
8966
8967 /* all interrupts handled by the general handler */
8968 for (i = 0; i < CCE_NUM_INT_CSRS; i++)
8969 dd->gi_mask[i] = ~(u64)0;
8970
8971 /* all chip interrupts map to MSI-X 0 */
8972 for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
8973 write_csr(dd, CCE_INT_MAP + (8*i), 0);
8974}
8975
8976static int set_up_interrupts(struct hfi1_devdata *dd)
8977{
8978 struct hfi1_msix_entry *entries;
8979 u32 total, request;
8980 int i, ret;
8981 int single_interrupt = 0; /* we expect to have all the interrupts */
8982
8983 /*
8984 * Interrupt count:
8985 * 1 general, "slow path" interrupt (includes the SDMA engines
8986 * slow source, SDMACleanupDone)
8987 * N interrupts - one per used SDMA engine
8988 * M interrupt - one per kernel receive context
8989 * M interrupts - one per kernel receive context
8990 total = 1 + dd->num_sdma + dd->n_krcv_queues;
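	/*
	 * For example, a (hypothetical) configuration with 16 SDMA engines
	 * and 8 kernel receive contexts would ask for 1 + 16 + 8 = 25
	 * MSI-X vectors here.
	 */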
8991
8992 entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
8993 if (!entries) {
8994 dd_dev_err(dd, "cannot allocate msix table\n");
8995 ret = -ENOMEM;
8996 goto fail;
8997 }
8998 /* 1-1 MSI-X entry assignment */
8999 for (i = 0; i < total; i++)
9000 entries[i].msix.entry = i;
9001
9002 /* ask for MSI-X interrupts */
9003 request = total;
9004 request_msix(dd, &request, entries);
9005
9006 if (request == 0) {
9007 /* using INTx */
9008 /* dd->num_msix_entries already zero */
9009 kfree(entries);
9010 single_interrupt = 1;
9011 dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
9012 } else {
9013 /* using MSI-X */
9014 dd->num_msix_entries = request;
9015 dd->msix_entries = entries;
9016
9017 if (request != total) {
9018 /* using MSI-X, with reduced interrupts */
9019 dd_dev_err(
9020 dd,
9021 "cannot handle reduced interrupt case, want %u, got %u\n",
9022 total, request);
9023 ret = -EINVAL;
9024 goto fail;
9025 }
9026 dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
9027 }
9028
9029 /* mask all interrupts */
9030 set_intr_state(dd, 0);
9031 /* clear all pending interrupts */
9032 clear_all_interrupts(dd);
9033
9034 /* reset general handler mask, chip MSI-X mappings */
9035 reset_interrupts(dd);
9036
9037 if (single_interrupt)
9038 ret = request_intx_irq(dd);
9039 else
9040 ret = request_msix_irqs(dd);
9041 if (ret)
9042 goto fail;
9043
9044 return 0;
9045
9046fail:
9047 clean_up_interrupts(dd);
9048 return ret;
9049}
9050
9051/*
9052 * Set up context values in dd. Sets:
9053 *
9054 * num_rcv_contexts - number of contexts being used
9055 * n_krcv_queues - number of kernel contexts
9056 * first_user_ctxt - first non-kernel context in array of contexts
9057 * freectxts - number of free user contexts
9058 * num_send_contexts - number of PIO send contexts being used
9059 */
9060static int set_up_context_variables(struct hfi1_devdata *dd)
9061{
9062 int num_kernel_contexts;
9063 int num_user_contexts;
9064 int total_contexts;
9065 int ret;
9066 unsigned ngroups;
9067
9068 /*
9069 * Kernel contexts: (to be fixed later):
9070 * - a minimum of 2, or 1 context per NUMA node, whichever is larger
9071 * - Context 0 - default/errors
9072 * - Context 1 - VL15
9073 */
9074 if (n_krcvqs)
9075 num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS;
9076 else
9077 num_kernel_contexts = num_online_nodes();
9078 num_kernel_contexts =
9079 max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
9080 /*
9081 * Every kernel receive context needs an ACK send context.
9082 * One send context is allocated for each VL{0-7} and VL15.
9083 */
9084 if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
9085 dd_dev_err(dd,
9086 "Reducing # kernel rcv contexts to: %d, from %d\n",
9087 (int)(dd->chip_send_contexts - num_vls - 1),
9088 (int)num_kernel_contexts);
9089 num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
9090 }
9091 /*
9092 * User contexts: (to be fixed later)
9093 * - set to num_rcv_contexts if non-zero
9094 * - default to 1 user context per CPU
9095 */
9096 if (num_rcv_contexts)
9097 num_user_contexts = num_rcv_contexts;
9098 else
9099 num_user_contexts = num_online_cpus();
9100
9101 total_contexts = num_kernel_contexts + num_user_contexts;
9102
9103 /*
9104 * Adjust the counts given a global max.
9105 */
9106 if (total_contexts > dd->chip_rcv_contexts) {
9107 dd_dev_err(dd,
9108 "Reducing # user receive contexts to: %d, from %d\n",
9109 (int)(dd->chip_rcv_contexts - num_kernel_contexts),
9110 (int)num_user_contexts);
9111 num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
9112 /* recalculate */
9113 total_contexts = num_kernel_contexts + num_user_contexts;
9114 }
9115
9116 /* the first N are kernel contexts, the rest are user contexts */
9117 dd->num_rcv_contexts = total_contexts;
9118 dd->n_krcv_queues = num_kernel_contexts;
9119 dd->first_user_ctxt = num_kernel_contexts;
9120 dd->freectxts = num_user_contexts;
9121 dd_dev_info(dd,
9122 "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
9123 (int)dd->chip_rcv_contexts,
9124 (int)dd->num_rcv_contexts,
9125 (int)dd->n_krcv_queues,
9126 (int)dd->num_rcv_contexts - dd->n_krcv_queues);
9127
9128 /*
9129 * Receive array allocation:
9130 * All RcvArray entries are divided into groups of 8. This
9131 * is required by the hardware and will speed up writes to
9132 * consecutive entries by using write-combining of the entire
9133 * cacheline.
9134 *
9135 * The number of groups is evenly divided among all contexts;
9136 * any leftover groups are given to the first N user
9137 * contexts.
9138 */
9139 dd->rcv_entries.group_size = RCV_INCREMENT;
9140 ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
9141 dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
9142 dd->rcv_entries.nctxt_extra = ngroups -
9143 (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
9144 dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
9145 dd->rcv_entries.ngroups,
9146 dd->rcv_entries.nctxt_extra);
9147 if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
9148 MAX_EAGER_ENTRIES * 2) {
9149 dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
9150 dd->rcv_entries.group_size;
9151 dd_dev_info(dd,
9152 "RcvArray group count too high, change to %u\n",
9153 dd->rcv_entries.ngroups);
9154 dd->rcv_entries.nctxt_extra = 0;
9155 }
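	/*
	 * Example of the split above, with illustrative numbers: if the
	 * chip had 2048 RcvArray entries in groups of 8 and 10 receive
	 * contexts were in use, ngroups would be 2048 / 8 = 256, each
	 * context would get 256 / 10 = 25 groups, and the 256 - 250 = 6
	 * leftover groups would go to the first 6 user contexts.
	 */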
9156 /*
9157 * PIO send contexts
9158 */
9159 ret = init_sc_pools_and_sizes(dd);
9160 if (ret >= 0) { /* success */
9161 dd->num_send_contexts = ret;
9162 dd_dev_info(
9163 dd,
9164 "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
9165 dd->chip_send_contexts,
9166 dd->num_send_contexts,
9167 dd->sc_sizes[SC_KERNEL].count,
9168 dd->sc_sizes[SC_ACK].count,
9169 dd->sc_sizes[SC_USER].count);
9170 ret = 0; /* success */
9171 }
9172
9173 return ret;
9174}
9175
9176/*
9177 * Set the device/port partition key table. The MAD code
9178 * will ensure that, at least, the partial management
9179 * partition key is present in the table.
9180 */
9181static void set_partition_keys(struct hfi1_pportdata *ppd)
9182{
9183 struct hfi1_devdata *dd = ppd->dd;
9184 u64 reg = 0;
9185 int i;
9186
9187 dd_dev_info(dd, "Setting partition keys\n");
9188 for (i = 0; i < hfi1_get_npkeys(dd); i++) {
9189 reg |= (ppd->pkeys[i] &
9190 RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
9191 ((i % 4) *
9192 RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
9193 /* Each register holds 4 PKey values. */
9194 if ((i % 4) == 3) {
9195 write_csr(dd, RCV_PARTITION_KEY +
9196 ((i - 3) * 2), reg);
9197 reg = 0;
9198 }
9199 }
9200
9201 /* Always enable HW pkeys check when pkeys table is set */
9202 add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
9203}
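/*
 * Packing example for the loop above: pkeys[0..3] (16 bits each) share
 * the first RcvPartitionKey register at bit offsets 0, 16, 32 and 48;
 * the register is written once i reaches 3, at byte offset
 * (3 - 3) * 2 = 0, and the next four keys are written at byte offset
 * (7 - 3) * 2 = 8, i.e. the next 64-bit CSR.
 */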
9204
9205/*
9206 * These CSRs and memories are uninitialized on reset and must be
9207 * written before reading to set the ECC/parity bits.
9208 *
9209 * NOTE: All user context CSRs that are not mmaped write-only
9210 * (e.g. the TID flows) must be initialized even if the driver never
9211 * reads them.
9212 */
9213static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
9214{
9215 int i, j;
9216
9217 /* CceIntMap */
9218 for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
9219 write_csr(dd, CCE_INT_MAP+(8*i), 0);
9220
9221 /* SendCtxtCreditReturnAddr */
9222 for (i = 0; i < dd->chip_send_contexts; i++)
9223 write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
9224
9225 /* PIO Send buffers */
9226 /* SDMA Send buffers */
9227 /* These are not normally read, and (presently) have no method
9228 to be read, so are not pre-initialized */
9229
9230 /* RcvHdrAddr */
9231 /* RcvHdrTailAddr */
9232 /* RcvTidFlowTable */
9233 for (i = 0; i < dd->chip_rcv_contexts; i++) {
9234 write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
9235 write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
9236 for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
9237 write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
9238 }
9239
9240 /* RcvArray */
9241 for (i = 0; i < dd->chip_rcv_array_count; i++)
9242 write_csr(dd, RCV_ARRAY + (8*i),
9243 RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
9244
9245 /* RcvQPMapTable */
9246 for (i = 0; i < 32; i++)
9247 write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
9248}
9249
9250/*
9251 * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
9252 */
9253static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
9254 u64 ctrl_bits)
9255{
9256 unsigned long timeout;
9257 u64 reg;
9258
9259 /* is the condition present? */
9260 reg = read_csr(dd, CCE_STATUS);
9261 if ((reg & status_bits) == 0)
9262 return;
9263
9264 /* clear the condition */
9265 write_csr(dd, CCE_CTRL, ctrl_bits);
9266
9267 /* wait for the condition to clear */
9268 timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
9269 while (1) {
9270 reg = read_csr(dd, CCE_STATUS);
9271 if ((reg & status_bits) == 0)
9272 return;
9273 if (time_after(jiffies, timeout)) {
9274 dd_dev_err(dd,
9275 "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
9276 status_bits, reg & status_bits);
9277 return;
9278 }
9279 udelay(1);
9280 }
9281}
9282
9283/* set CCE CSRs to chip reset defaults */
9284static void reset_cce_csrs(struct hfi1_devdata *dd)
9285{
9286 int i;
9287
9288 /* CCE_REVISION read-only */
9289 /* CCE_REVISION2 read-only */
9290 /* CCE_CTRL - bits clear automatically */
9291 /* CCE_STATUS read-only, use CceCtrl to clear */
9292 clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
9293 clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
9294 clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
9295 for (i = 0; i < CCE_NUM_SCRATCH; i++)
9296 write_csr(dd, CCE_SCRATCH + (8 * i), 0);
9297 /* CCE_ERR_STATUS read-only */
9298 write_csr(dd, CCE_ERR_MASK, 0);
9299 write_csr(dd, CCE_ERR_CLEAR, ~0ull);
9300 /* CCE_ERR_FORCE leave alone */
9301 for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
9302 write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
9303 write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
9304 /* CCE_PCIE_CTRL leave alone */
9305 for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
9306 write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
9307 write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
9308 CCE_MSIX_TABLE_UPPER_RESETCSR);
9309 }
9310 for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
9311 /* CCE_MSIX_PBA read-only */
9312 write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
9313 write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
9314 }
9315 for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
9316 write_csr(dd, CCE_INT_MAP, 0);
9317 for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
9318 /* CCE_INT_STATUS read-only */
9319 write_csr(dd, CCE_INT_MASK + (8 * i), 0);
9320 write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
9321 /* CCE_INT_FORCE leave alone */
9322 /* CCE_INT_BLOCKED read-only */
9323 }
9324 for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
9325 write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
9326}
9327
9328/* set ASIC CSRs to chip reset defaults */
9329static void reset_asic_csrs(struct hfi1_devdata *dd)
9330{
9331 static DEFINE_MUTEX(asic_mutex);
9332 static int called;
9333 int i;
9334
9335 /*
9336 * If the HFIs are shared between separate nodes or VMs,
9337 * then more will need to be done here. One idea is a module
9338	 * parameter that makes non-first loads return early, letting the
9339	 * first power-on or a known first load do the reset and blocking all others.
9340 */
9341
9342 /*
9343 * These CSRs should only be reset once - the first one here will
9344 * do the work. Use a mutex so that a non-first caller waits until
9345 * the first is finished before it can proceed.
9346 */
9347 mutex_lock(&asic_mutex);
9348 if (called)
9349 goto done;
9350 called = 1;
9351
9352 if (dd->icode != ICODE_FPGA_EMULATION) {
9353 /* emulation does not have an SBus - leave these alone */
9354 /*
9355 * All writes to ASIC_CFG_SBUS_REQUEST do something.
9356 * Notes:
9357 * o The reset is not zero if aimed at the core. See the
9358 * SBus documentation for details.
9359 * o If the SBus firmware has been updated (e.g. by the BIOS),
9360 * will the reset revert that?
9361 */
9362 /* ASIC_CFG_SBUS_REQUEST leave alone */
9363 write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
9364 }
9365 /* ASIC_SBUS_RESULT read-only */
9366 write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
9367 for (i = 0; i < ASIC_NUM_SCRATCH; i++)
9368 write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
9369 write_csr(dd, ASIC_CFG_MUTEX, 0); /* this will clear it */
9370 write_csr(dd, ASIC_CFG_DRV_STR, 0);
9371 write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0);
9372 /* ASIC_STS_THERM read-only */
9373 /* ASIC_CFG_RESET leave alone */
9374
9375 write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
9376 /* ASIC_PCIE_SD_HOST_STATUS read-only */
9377 write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
9378 write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
9379 /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
9380 write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
9381 /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
9382 /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
9383 for (i = 0; i < 16; i++)
9384 write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
9385
9386 /* ASIC_GPIO_IN read-only */
9387 write_csr(dd, ASIC_GPIO_OE, 0);
9388 write_csr(dd, ASIC_GPIO_INVERT, 0);
9389 write_csr(dd, ASIC_GPIO_OUT, 0);
9390 write_csr(dd, ASIC_GPIO_MASK, 0);
9391 /* ASIC_GPIO_STATUS read-only */
9392 write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
9393 /* ASIC_GPIO_FORCE leave alone */
9394
9395 /* ASIC_QSFP1_IN read-only */
9396 write_csr(dd, ASIC_QSFP1_OE, 0);
9397 write_csr(dd, ASIC_QSFP1_INVERT, 0);
9398 write_csr(dd, ASIC_QSFP1_OUT, 0);
9399 write_csr(dd, ASIC_QSFP1_MASK, 0);
9400 /* ASIC_QSFP1_STATUS read-only */
9401 write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
9402 /* ASIC_QSFP1_FORCE leave alone */
9403
9404 /* ASIC_QSFP2_IN read-only */
9405 write_csr(dd, ASIC_QSFP2_OE, 0);
9406 write_csr(dd, ASIC_QSFP2_INVERT, 0);
9407 write_csr(dd, ASIC_QSFP2_OUT, 0);
9408 write_csr(dd, ASIC_QSFP2_MASK, 0);
9409 /* ASIC_QSFP2_STATUS read-only */
9410 write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
9411 /* ASIC_QSFP2_FORCE leave alone */
9412
9413 write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
9414 /* this also writes a NOP command, clearing paging mode */
9415 write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
9416 write_csr(dd, ASIC_EEP_DATA, 0);
9417
9418done:
9419 mutex_unlock(&asic_mutex);
9420}
9421
9422/* set MISC CSRs to chip reset defaults */
9423static void reset_misc_csrs(struct hfi1_devdata *dd)
9424{
9425 int i;
9426
9427 for (i = 0; i < 32; i++) {
9428 write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
9429 write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
9430 write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
9431 }
9432 /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
9433	   only be written in 128-byte chunks */
9434 /* init RSA engine to clear lingering errors */
9435 write_csr(dd, MISC_CFG_RSA_CMD, 1);
9436 write_csr(dd, MISC_CFG_RSA_MU, 0);
9437 write_csr(dd, MISC_CFG_FW_CTRL, 0);
9438 /* MISC_STS_8051_DIGEST read-only */
9439 /* MISC_STS_SBM_DIGEST read-only */
9440 /* MISC_STS_PCIE_DIGEST read-only */
9441 /* MISC_STS_FAB_DIGEST read-only */
9442 /* MISC_ERR_STATUS read-only */
9443 write_csr(dd, MISC_ERR_MASK, 0);
9444 write_csr(dd, MISC_ERR_CLEAR, ~0ull);
9445 /* MISC_ERR_FORCE leave alone */
9446}
9447
9448/* set TXE CSRs to chip reset defaults */
9449static void reset_txe_csrs(struct hfi1_devdata *dd)
9450{
9451 int i;
9452
9453 /*
9454 * TXE Kernel CSRs
9455 */
9456 write_csr(dd, SEND_CTRL, 0);
9457 __cm_reset(dd, 0); /* reset CM internal state */
9458 /* SEND_CONTEXTS read-only */
9459 /* SEND_DMA_ENGINES read-only */
9460 /* SEND_PIO_MEM_SIZE read-only */
9461 /* SEND_DMA_MEM_SIZE read-only */
9462 write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
9463 pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */
9464 /* SEND_PIO_ERR_STATUS read-only */
9465 write_csr(dd, SEND_PIO_ERR_MASK, 0);
9466 write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
9467 /* SEND_PIO_ERR_FORCE leave alone */
9468 /* SEND_DMA_ERR_STATUS read-only */
9469 write_csr(dd, SEND_DMA_ERR_MASK, 0);
9470 write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
9471 /* SEND_DMA_ERR_FORCE leave alone */
9472 /* SEND_EGRESS_ERR_STATUS read-only */
9473 write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
9474 write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
9475 /* SEND_EGRESS_ERR_FORCE leave alone */
9476 write_csr(dd, SEND_BTH_QP, 0);
9477 write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
9478 write_csr(dd, SEND_SC2VLT0, 0);
9479 write_csr(dd, SEND_SC2VLT1, 0);
9480 write_csr(dd, SEND_SC2VLT2, 0);
9481 write_csr(dd, SEND_SC2VLT3, 0);
9482 write_csr(dd, SEND_LEN_CHECK0, 0);
9483 write_csr(dd, SEND_LEN_CHECK1, 0);
9484 /* SEND_ERR_STATUS read-only */
9485 write_csr(dd, SEND_ERR_MASK, 0);
9486 write_csr(dd, SEND_ERR_CLEAR, ~0ull);
9487 /* SEND_ERR_FORCE read-only */
9488 for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
9489 write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
9490 for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
9491 write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
9492 for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
9493 write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
9494 for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
9495 write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
9496 for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
9497 write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
9498 write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
9499 write_csr(dd, SEND_CM_GLOBAL_CREDIT,
9500 SEND_CM_GLOBAL_CREDIT_RESETCSR);
9501 /* SEND_CM_CREDIT_USED_STATUS read-only */
9502 write_csr(dd, SEND_CM_TIMER_CTRL, 0);
9503 write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
9504 write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
9505 write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
9506 write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
9507 for (i = 0; i < TXE_NUM_DATA_VL; i++)
9508 write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
9509 write_csr(dd, SEND_CM_CREDIT_VL15, 0);
9510 /* SEND_CM_CREDIT_USED_VL read-only */
9511 /* SEND_CM_CREDIT_USED_VL15 read-only */
9512 /* SEND_EGRESS_CTXT_STATUS read-only */
9513 /* SEND_EGRESS_SEND_DMA_STATUS read-only */
9514 write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
9515 /* SEND_EGRESS_ERR_INFO read-only */
9516 /* SEND_EGRESS_ERR_SOURCE read-only */
9517
9518 /*
9519 * TXE Per-Context CSRs
9520 */
9521 for (i = 0; i < dd->chip_send_contexts; i++) {
9522 write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
9523 write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
9524 write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
9525 write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
9526 write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
9527 write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
9528 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
9529 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
9530 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
9531 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
9532 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
9533 write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
9534 }
9535
9536 /*
9537 * TXE Per-SDMA CSRs
9538 */
9539 for (i = 0; i < dd->chip_sdma_engines; i++) {
9540 write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
9541 /* SEND_DMA_STATUS read-only */
9542 write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
9543 write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
9544 write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
9545 /* SEND_DMA_HEAD read-only */
9546 write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
9547 write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
9548 /* SEND_DMA_IDLE_CNT read-only */
9549 write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
9550 write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
9551 /* SEND_DMA_DESC_FETCHED_CNT read-only */
9552 /* SEND_DMA_ENG_ERR_STATUS read-only */
9553 write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
9554 write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
9555 /* SEND_DMA_ENG_ERR_FORCE leave alone */
9556 write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
9557 write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
9558 write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
9559 write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
9560 write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
9561 write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
9562 write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
9563 }
9564}
9565
9566/*
9567 * Expect on entry:
9568 * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
9569 */
9570static void init_rbufs(struct hfi1_devdata *dd)
9571{
9572 u64 reg;
9573 int count;
9574
9575 /*
9576 * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
9577 * clear.
9578 */
9579 count = 0;
9580 while (1) {
9581 reg = read_csr(dd, RCV_STATUS);
9582 if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
9583 | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
9584 break;
9585 /*
9586 * Give up after 1ms - maximum wait time.
9587 *
9588 * RBuf size is 148KiB. Slowest possible is PCIe Gen1 x1 at
9589 * 250MB/s bandwidth. Lower rate to 66% for overhead to get:
9590 * 148 KB / (66% * 250MB/s) = 920us
9591 */
9592 if (count++ > 500) {
9593 dd_dev_err(dd,
9594 "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
9595 __func__, reg);
9596 break;
9597 }
9598 udelay(2); /* do not busy-wait the CSR */
9599 }
9600
9601 /* start the init - expect RcvCtrl to be 0 */
9602 write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
9603
9604 /*
9605	 * Read to force the write of RcvCtrl.RxRbufInit. There is a brief
9606 * period after the write before RcvStatus.RxRbufInitDone is valid.
9607 * The delay in the first run through the loop below is sufficient and
9608	 * required before the first read of RcvStatus.RxRbufInitDone.
9609 */
9610 read_csr(dd, RCV_CTRL);
9611
9612 /* wait for the init to finish */
9613 count = 0;
9614 while (1) {
9615 /* delay is required first time through - see above */
9616 udelay(2); /* do not busy-wait the CSR */
9617 reg = read_csr(dd, RCV_STATUS);
9618 if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
9619 break;
9620
9621 /* give up after 100us - slowest possible at 33MHz is 73us */
9622 if (count++ > 50) {
9623 dd_dev_err(dd,
9624 "%s: RcvStatus.RxRbufInit not set, continuing\n",
9625 __func__);
9626 break;
9627 }
9628 }
9629}
9630
9631/* set RXE CSRs to chip reset defaults */
9632static void reset_rxe_csrs(struct hfi1_devdata *dd)
9633{
9634 int i, j;
9635
9636 /*
9637 * RXE Kernel CSRs
9638 */
9639 write_csr(dd, RCV_CTRL, 0);
9640 init_rbufs(dd);
9641 /* RCV_STATUS read-only */
9642 /* RCV_CONTEXTS read-only */
9643 /* RCV_ARRAY_CNT read-only */
9644 /* RCV_BUF_SIZE read-only */
9645 write_csr(dd, RCV_BTH_QP, 0);
9646 write_csr(dd, RCV_MULTICAST, 0);
9647 write_csr(dd, RCV_BYPASS, 0);
9648 write_csr(dd, RCV_VL15, 0);
9649 /* this is a clear-down */
9650 write_csr(dd, RCV_ERR_INFO,
9651 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
9652 /* RCV_ERR_STATUS read-only */
9653 write_csr(dd, RCV_ERR_MASK, 0);
9654 write_csr(dd, RCV_ERR_CLEAR, ~0ull);
9655 /* RCV_ERR_FORCE leave alone */
9656 for (i = 0; i < 32; i++)
9657 write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
9658 for (i = 0; i < 4; i++)
9659 write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
9660 for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
9661 write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
9662 for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
9663 write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
9664 for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
9665 write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
9666 write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
9667 write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
9668 }
9669 for (i = 0; i < 32; i++)
9670 write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
9671
9672 /*
9673 * RXE Kernel and User Per-Context CSRs
9674 */
9675 for (i = 0; i < dd->chip_rcv_contexts; i++) {
9676 /* kernel */
9677 write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
9678 /* RCV_CTXT_STATUS read-only */
9679 write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
9680 write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
9681 write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
9682 write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
9683 write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
9684 write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
9685 write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
9686 write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
9687 write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
9688 write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
9689
9690 /* user */
9691 /* RCV_HDR_TAIL read-only */
9692 write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
9693 /* RCV_EGR_INDEX_TAIL read-only */
9694 write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
9695 /* RCV_EGR_OFFSET_TAIL read-only */
9696 for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
9697 write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
9698 0);
9699 }
9700 }
9701}
9702
9703/*
9704 * Set sc2vl tables.
9705 *
9706 * They power on to zeros, so to avoid send context errors
9707 * they need to be set:
9708 *
9709 * SC 0-7 -> VL 0-7 (respectively)
9710 * SC 15 -> VL 15
9711 * otherwise
9712 * -> VL 0
9713 */
9714static void init_sc2vl_tables(struct hfi1_devdata *dd)
9715{
9716 int i;
9717 /* init per architecture spec, constrained by hardware capability */
9718
9719 /* HFI maps sent packets */
9720 write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
9721 0,
9722 0, 0, 1, 1,
9723 2, 2, 3, 3,
9724 4, 4, 5, 5,
9725 6, 6, 7, 7));
9726 write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
9727 1,
9728 8, 0, 9, 0,
9729 10, 0, 11, 0,
9730 12, 0, 13, 0,
9731 14, 0, 15, 15));
9732 write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
9733 2,
9734 16, 0, 17, 0,
9735 18, 0, 19, 0,
9736 20, 0, 21, 0,
9737 22, 0, 23, 0));
9738 write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
9739 3,
9740 24, 0, 25, 0,
9741 26, 0, 27, 0,
9742 28, 0, 29, 0,
9743 30, 0, 31, 0));
9744
9745 /* DC maps received packets */
9746 write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
9747 15_0,
9748 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
9749 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
9750 write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
9751 31_16,
9752 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
9753 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
9754
9755 /* initialize the cached sc2vl values consistently with h/w */
9756 for (i = 0; i < 32; i++) {
9757 if (i < 8 || i == 15)
9758 *((u8 *)(dd->sc2vl) + i) = (u8)i;
9759 else
9760 *((u8 *)(dd->sc2vl) + i) = 0;
9761 }
9762}
9763
9764/*
9765 * Read chip sizes and then reset parts to sane, disabled, values. We cannot
9766 * depend on the chip going through a power-on reset - a driver may be loaded
9767 * and unloaded many times.
9768 *
9769 * Do not write any CSR values to the chip in this routine - there may be
9770 * a reset following the (possible) FLR in this routine.
9771 *
9772 */
9773static void init_chip(struct hfi1_devdata *dd)
9774{
9775 int i;
9776
9777 /*
9778 * Put the HFI CSRs in a known state.
9779 * Combine this with a DC reset.
9780 *
9781 * Stop the device from doing anything while we do a
9782 * reset. We know there are no other active users of
9783 * the device since we are now in charge. Turn off
9784	 * all outbound and inbound traffic and make sure
9785 * the device does not generate any interrupts.
9786 */
9787
9788 /* disable send contexts and SDMA engines */
9789 write_csr(dd, SEND_CTRL, 0);
9790 for (i = 0; i < dd->chip_send_contexts; i++)
9791 write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
9792 for (i = 0; i < dd->chip_sdma_engines; i++)
9793 write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
9794 /* disable port (turn off RXE inbound traffic) and contexts */
9795 write_csr(dd, RCV_CTRL, 0);
9796 for (i = 0; i < dd->chip_rcv_contexts; i++)
9797 write_csr(dd, RCV_CTXT_CTRL, 0);
9798 /* mask all interrupt sources */
9799 for (i = 0; i < CCE_NUM_INT_CSRS; i++)
9800 write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
9801
9802 /*
9803 * DC Reset: do a full DC reset before the register clear.
9804 * A recommended length of time to hold is one CSR read,
9805 * so reread the CceDcCtrl. Then, hold the DC in reset
9806 * across the clear.
9807 */
9808 write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
9809 (void) read_csr(dd, CCE_DC_CTRL);
9810
9811 if (use_flr) {
9812 /*
9813 * A FLR will reset the SPC core and part of the PCIe.
9814 * The parts that need to be restored have already been
9815 * saved.
9816 */
9817 dd_dev_info(dd, "Resetting CSRs with FLR\n");
9818
9819 /* do the FLR, the DC reset will remain */
9820 hfi1_pcie_flr(dd);
9821
9822 /* restore command and BARs */
9823 restore_pci_variables(dd);
9824
9825 if (is_a0(dd)) {
9826 dd_dev_info(dd, "Resetting CSRs with FLR\n");
9827 hfi1_pcie_flr(dd);
9828 restore_pci_variables(dd);
9829 }
9830
9831 } else {
9832 dd_dev_info(dd, "Resetting CSRs with writes\n");
9833 reset_cce_csrs(dd);
9834 reset_txe_csrs(dd);
9835 reset_rxe_csrs(dd);
9836 reset_asic_csrs(dd);
9837 reset_misc_csrs(dd);
9838 }
9839 /* clear the DC reset */
9840 write_csr(dd, CCE_DC_CTRL, 0);
9841 /* Set the LED off */
9842 if (is_a0(dd))
9843 setextled(dd, 0);
9844 /*
9845 * Clear the QSFP reset.
9846 * A0 leaves the out lines floating on power on, then on an FLR
9847 * enforces a 0 on all out pins. The driver does not touch
9848 * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and
9849	 * anything plugged in constantly held in reset, if it pays attention
9850 * to RESET_N.
9851 * A prime example of this is SiPh. For now, set all pins high.
9852 * I2CCLK and I2CDAT will change per direction, and INT_N and
9853 * MODPRS_N are input only and their value is ignored.
9854 */
9855 if (is_a0(dd)) {
9856 write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
9857 write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
9858 }
9859}
9860
9861static void init_early_variables(struct hfi1_devdata *dd)
9862{
9863 int i;
9864
9865 /* assign link credit variables */
9866 dd->vau = CM_VAU;
9867 dd->link_credits = CM_GLOBAL_CREDITS;
9868 if (is_a0(dd))
9869 dd->link_credits--;
9870 dd->vcu = cu_to_vcu(hfi1_cu);
9871 /* enough room for 8 MAD packets plus header - 17K */
9872 dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
9873 if (dd->vl15_init > dd->link_credits)
9874 dd->vl15_init = dd->link_credits;
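	/*
	 * Illustrative arithmetic: with vau == CM_VAU == 3 the allocation
	 * unit is 8 * 2^3 == 64 bytes (see chip.h), so vl15_init is
	 * 8 * (2048 + 128) / 64 == 272, well under the 0x940 global
	 * credit limit.
	 */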
9875
9876 write_uninitialized_csrs_and_memories(dd);
9877
9878 if (HFI1_CAP_IS_KSET(PKEY_CHECK))
9879 for (i = 0; i < dd->num_pports; i++) {
9880 struct hfi1_pportdata *ppd = &dd->pport[i];
9881
9882 set_partition_keys(ppd);
9883 }
9884 init_sc2vl_tables(dd);
9885}
9886
9887static void init_kdeth_qp(struct hfi1_devdata *dd)
9888{
9889 /* user changed the KDETH_QP */
9890 if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
9891 /* out of range or illegal value */
9892 dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
9893 kdeth_qp = 0;
9894 }
9895 if (kdeth_qp == 0) /* not set, or failed range check */
9896 kdeth_qp = DEFAULT_KDETH_QP;
9897
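	/*
	 * Program the same 8-bit KDETH QP prefix into both the TXE and RXE
	 * CSRs below so the send and receive sides classify KDETH packets
	 * consistently; the masks presumably keep only the low prefix bits.
	 */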
9898 write_csr(dd, SEND_BTH_QP,
9899 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
9900 << SEND_BTH_QP_KDETH_QP_SHIFT);
9901
9902 write_csr(dd, RCV_BTH_QP,
9903 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
9904 << RCV_BTH_QP_KDETH_QP_SHIFT);
9905}
9906
9907/**
9908 * init_qpmap_table
9909 * @dd - device data
9910 * @first_ctxt - first context
9911 * @last_ctxt - last context
9912 *
9913 * This routine sets the qpn mapping table that
9914 * is indexed by qpn[8:1].
9915 *
9916 * The routine will round robin the 256 settings
9917 * from first_ctxt to last_ctxt.
9918 *
9919 * The first/last looks ahead to having specialized
9920 * receive contexts for mgmt and bypass. Normal
9921 * verbs traffic is assumed to be on a range
9922 * of receive contexts.
9923 */
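/*
 * A worked example with illustrative values: for first_ctxt == 1 and
 * last_ctxt == 3, the 256 one-byte entries cycle 1, 2, 3, 1, 2, 3, ...
 * (skipping VL15CTXT) and are packed eight per 64-bit RcvQPMapTable
 * register, i.e. 32 registers starting at RCV_QP_MAP_TABLE.
 */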
9924static void init_qpmap_table(struct hfi1_devdata *dd,
9925 u32 first_ctxt,
9926 u32 last_ctxt)
9927{
9928 u64 reg = 0;
9929 u64 regno = RCV_QP_MAP_TABLE;
9930 int i;
9931 u64 ctxt = first_ctxt;
9932
9933 for (i = 0; i < 256;) {
9934 if (ctxt == VL15CTXT) {
9935 ctxt++;
9936 if (ctxt > last_ctxt)
9937 ctxt = first_ctxt;
9938 continue;
9939 }
9940 reg |= ctxt << (8 * (i % 8));
9941 i++;
9942 ctxt++;
9943 if (ctxt > last_ctxt)
9944 ctxt = first_ctxt;
9945 if (i % 8 == 0) {
9946 write_csr(dd, regno, reg);
9947 reg = 0;
9948 regno += 8;
9949 }
9950 }
9951 if (i % 8)
9952 write_csr(dd, regno, reg);
9953
9954 add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
9955 | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
9956}
9957
9958/**
9959 * init_qos - init RX qos
9960 * @dd - device data
9961 * @first_ctxt - first context
9962 *
9963 * This routine initializes Rule 0 and the
9964 * RSM map table to implement qos.
9965 *
9966 * If all of the limit tests succeed,
9967 * qos is applied based on the array
9968 * interpretation of krcvqs where
9969 * entry 0 is VL0.
9970 *
9971 * The number of vl bits (n) and the number of qpn
9972 * bits (m) are computed to feed both the RSM map table
9973 * and the single rule.
9974 *
9975 */
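/*
 * Illustrative sizing (hypothetical module parameters): with
 * num_vls == 8 and a largest krcvqs[] entry of 4, qpns_per_vl rounds
 * to 4, n == 3 and m == 2, so the RSM map index (qpn << n) ^ vl spans
 * m + n == 5 bits and dd->qos_shift ends up as n + 1 == 4.
 */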
9976static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
9977{
9978 u8 max_by_vl = 0;
9979 unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
9980 u64 *rsmmap;
9981 u64 reg;
9982 u8 rxcontext = is_a0(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
9983
9984 /* validate */
9985 if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
9986 num_vls == 1 ||
9987 krcvqsset <= 1)
9988 goto bail;
9989 for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
9990 if (krcvqs[i] > max_by_vl)
9991 max_by_vl = krcvqs[i];
9992 if (max_by_vl > 32)
9993 goto bail;
9994 qpns_per_vl = __roundup_pow_of_two(max_by_vl);
9995	/* determine bits for vl */
9996 n = ilog2(num_vls);
9997 /* determine bits for qpn */
9998 m = ilog2(qpns_per_vl);
9999 if ((m + n) > 7)
10000 goto bail;
10001 if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
10002 goto bail;
10003	rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
	if (!rsmmap)
		goto bail;
10004	memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
10005 /* init the local copy of the table */
10006 for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
10007 unsigned tctxt;
10008
10009 for (qpn = 0, tctxt = ctxt;
10010 krcvqs[i] && qpn < qpns_per_vl; qpn++) {
10011 unsigned idx, regoff, regidx;
10012
10013 /* generate index <= 128 */
10014 idx = (qpn << n) ^ i;
10015 regoff = (idx % 8) * 8;
10016 regidx = idx / 8;
10017 reg = rsmmap[regidx];
10018 /* replace 0xff with context number */
10019 reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
10020 << regoff);
10021 reg |= (u64)(tctxt++) << regoff;
10022 rsmmap[regidx] = reg;
10023 if (tctxt == ctxt + krcvqs[i])
10024 tctxt = ctxt;
10025 }
10026 ctxt += krcvqs[i];
10027 }
10028 /* flush cached copies to chip */
10029 for (i = 0; i < NUM_MAP_REGS; i++)
10030 write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
10031 /* add rule0 */
10032 write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
10033 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
10034 << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
10035 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
10036 write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
10037 LRH_BTH_MATCH_OFFSET
10038 << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
10039 LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
10040 LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
10041 ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
10042 QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
10043 ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
10044 write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
10045 LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
10046 LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
10047 LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
10048 LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
10049 /* Enable RSM */
10050 add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
10051 kfree(rsmmap);
10052 /* map everything else (non-VL15) to context 0 */
10053 init_qpmap_table(
10054 dd,
10055 0,
10056 0);
10057 dd->qos_shift = n + 1;
10058 return;
10059bail:
10060 dd->qos_shift = 1;
10061 init_qpmap_table(
10062 dd,
10063 dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0,
10064 dd->n_krcv_queues - 1);
10065}
10066
10067static void init_rxe(struct hfi1_devdata *dd)
10068{
10069 /* enable all receive errors */
10070 write_csr(dd, RCV_ERR_MASK, ~0ull);
10071 /* setup QPN map table - start where VL15 context leaves off */
10072 init_qos(
10073 dd,
10074 dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
10075 /*
10076 * make sure RcvCtrl.RcvWcb <= PCIe Device Control
10077 * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
10078 * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one
10079 * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
10080 * Max_PayLoad_Size set to its minimum of 128.
10081 *
10082 * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
10083 * (64 bytes). Max_Payload_Size is possibly modified upward in
10084 * tune_pcie_caps() which is called after this routine.
10085 */
10086}
10087
10088static void init_other(struct hfi1_devdata *dd)
10089{
10090 /* enable all CCE errors */
10091 write_csr(dd, CCE_ERR_MASK, ~0ull);
10092 /* enable *some* Misc errors */
10093 write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
10094 /* enable all DC errors, except LCB */
10095 write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
10096 write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
10097}
10098
10099/*
10100 * Fill out the given AU table using the given CU. A CU is defined in terms
10101 * of AUs. The table is an encoding: given the index, how many AUs does that
10102 * represent?
10103 *
10104 * NOTE: Assumes that the register layout is the same for the
10105 * local and remote tables.
10106 */
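/*
 * Illustrative decode of the resulting table: indices 0-7 encode
 * 0, 1, 2*CU, 4*CU, 8*CU, 16*CU, 32*CU and 64*CU allocation units,
 * so with CU == 1 an index of 5 stands for 16 AUs.
 */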
10107static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
10108 u32 csr0to3, u32 csr4to7)
10109{
10110 write_csr(dd, csr0to3,
10111 0ull <<
10112 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
10113 | 1ull <<
10114 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
10115 | 2ull * cu <<
10116 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
10117 | 4ull * cu <<
10118 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
10119 write_csr(dd, csr4to7,
10120 8ull * cu <<
10121 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
10122 | 16ull * cu <<
10123 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
10124 | 32ull * cu <<
10125 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
10126 | 64ull * cu <<
10127 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
10128
10129}
10130
10131static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
10132{
10133 assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
10134 SEND_CM_LOCAL_AU_TABLE4_TO7);
10135}
10136
10137void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
10138{
10139 assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
10140 SEND_CM_REMOTE_AU_TABLE4_TO7);
10141}
10142
10143static void init_txe(struct hfi1_devdata *dd)
10144{
10145 int i;
10146
10147 /* enable all PIO, SDMA, general, and Egress errors */
10148 write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
10149 write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
10150 write_csr(dd, SEND_ERR_MASK, ~0ull);
10151 write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
10152
10153 /* enable all per-context and per-SDMA engine errors */
10154 for (i = 0; i < dd->chip_send_contexts; i++)
10155 write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
10156 for (i = 0; i < dd->chip_sdma_engines; i++)
10157 write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
10158
10159 /* set the local CU to AU mapping */
10160 assign_local_cm_au_table(dd, dd->vcu);
10161
10162 /*
10163 * Set reasonable default for Credit Return Timer
10164 * Don't set on Simulator - causes it to choke.
10165 */
10166 if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
10167 write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
10168}
10169
10170int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
10171{
10172 struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
10173 unsigned sctxt;
10174 int ret = 0;
10175 u64 reg;
10176
10177 if (!rcd || !rcd->sc) {
10178 ret = -EINVAL;
10179 goto done;
10180 }
10181 sctxt = rcd->sc->hw_context;
10182 reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
10183 ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
10184 SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
10185 /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
10186 if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
10187 reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
10188 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
10189 /*
10190 * Enable send-side J_KEY integrity check, unless this is A0 h/w
10191 * (due to A0 erratum).
10192 */
10193 if (!is_a0(dd)) {
10194 reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
10195 reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
10196 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
10197 }
10198
10199 /* Enable J_KEY check on receive context. */
10200 reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
10201 ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
10202 RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
10203 write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
10204done:
10205 return ret;
10206}
10207
10208int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
10209{
10210 struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
10211 unsigned sctxt;
10212 int ret = 0;
10213 u64 reg;
10214
10215 if (!rcd || !rcd->sc) {
10216 ret = -EINVAL;
10217 goto done;
10218 }
10219 sctxt = rcd->sc->hw_context;
10220 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
10221 /*
10222 * Disable send-side J_KEY integrity check, unless this is A0 h/w.
10223 * This check would not have been enabled for A0 h/w, see
10224 * set_ctxt_jkey().
10225 */
10226 if (!is_a0(dd)) {
10227 reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
10228 reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
10229 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
10230 }
10231 /* Turn off the J_KEY on the receive side */
10232 write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
10233done:
10234 return ret;
10235}
10236
10237int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
10238{
10239 struct hfi1_ctxtdata *rcd;
10240 unsigned sctxt;
10241 int ret = 0;
10242 u64 reg;
10243
10244 if (ctxt < dd->num_rcv_contexts)
10245 rcd = dd->rcd[ctxt];
10246 else {
10247 ret = -EINVAL;
10248 goto done;
10249 }
10250 if (!rcd || !rcd->sc) {
10251 ret = -EINVAL;
10252 goto done;
10253 }
10254 sctxt = rcd->sc->hw_context;
10255 reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
10256 SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
10257 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
10258 reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
10259 reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
10260 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
10261done:
10262 return ret;
10263}
10264
10265int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
10266{
10267 struct hfi1_ctxtdata *rcd;
10268 unsigned sctxt;
10269 int ret = 0;
10270 u64 reg;
10271
10272 if (ctxt < dd->num_rcv_contexts)
10273 rcd = dd->rcd[ctxt];
10274 else {
10275 ret = -EINVAL;
10276 goto done;
10277 }
10278 if (!rcd || !rcd->sc) {
10279 ret = -EINVAL;
10280 goto done;
10281 }
10282 sctxt = rcd->sc->hw_context;
10283 reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
10284 reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
10285 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
10286 write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
10287done:
10288 return ret;
10289}
10290
10291/*
10292 * Start doing the clean up of the chip. Our clean up happens in multiple
10293 * stages and this is just the first.
10294 */
10295void hfi1_start_cleanup(struct hfi1_devdata *dd)
10296{
10297 free_cntrs(dd);
10298 free_rcverr(dd);
10299 clean_up_interrupts(dd);
10300}
10301
10302#define HFI_BASE_GUID(dev) \
10303 ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
10304
10305/*
10306 * Certain chip functions need to be initialized only once per asic
10307 * instead of per-device. This function finds the peer device and
10308 * checks whether that chip initialization needs to be done by this
10309 * device.
10310 */
10311static void asic_should_init(struct hfi1_devdata *dd)
10312{
10313 unsigned long flags;
10314 struct hfi1_devdata *tmp, *peer = NULL;
10315
10316 spin_lock_irqsave(&hfi1_devs_lock, flags);
10317 /* Find our peer device */
10318 list_for_each_entry(tmp, &hfi1_dev_list, list) {
10319 if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
10320 dd->unit != tmp->unit) {
10321 peer = tmp;
10322 break;
10323 }
10324 }
10325
10326 /*
10327 * "Claim" the ASIC for initialization if it hasn't been
10328	 * "claimed" yet.
10329 */
10330 if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
10331 dd->flags |= HFI1_DO_INIT_ASIC;
10332 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
10333}
10334
10335/**
10336 * Allocate and initialize the device structure for the hfi.
10337 * @pdev: the pci_dev for the hfi1_ib device
10338 * @ent: pci_device_id struct for this dev
10339 *
10340 * Also allocates, initializes, and returns the devdata struct for this
10341 * device instance
10342 *
10343 * This is global, and is called directly at init to set up the
10344 * chip-specific function pointers for later use.
10345 */
10346struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
10347 const struct pci_device_id *ent)
10348{
10349 struct hfi1_devdata *dd;
10350 struct hfi1_pportdata *ppd;
10351 u64 reg;
10352 int i, ret;
10353 static const char * const inames[] = { /* implementation names */
10354 "RTL silicon",
10355 "RTL VCS simulation",
10356 "RTL FPGA emulation",
10357 "Functional simulator"
10358 };
10359
10360 dd = hfi1_alloc_devdata(pdev,
10361 NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
10362 if (IS_ERR(dd))
10363 goto bail;
10364 ppd = dd->pport;
10365 for (i = 0; i < dd->num_pports; i++, ppd++) {
10366 int vl;
10367 /* init common fields */
10368 hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
10369 /* DC supports 4 link widths */
10370 ppd->link_width_supported =
10371 OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
10372 OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
10373 ppd->link_width_downgrade_supported =
10374 ppd->link_width_supported;
10375 /* start out enabling only 4X */
10376 ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
10377 ppd->link_width_downgrade_enabled =
10378 ppd->link_width_downgrade_supported;
10379 /* link width active is 0 when link is down */
10380 /* link width downgrade active is 0 when link is down */
10381
10382 if (num_vls < HFI1_MIN_VLS_SUPPORTED
10383 || num_vls > HFI1_MAX_VLS_SUPPORTED) {
10384 hfi1_early_err(&pdev->dev,
10385 "Invalid num_vls %u, using %u VLs\n",
10386 num_vls, HFI1_MAX_VLS_SUPPORTED);
10387 num_vls = HFI1_MAX_VLS_SUPPORTED;
10388 }
10389 ppd->vls_supported = num_vls;
10390 ppd->vls_operational = ppd->vls_supported;
10391 /* Set the default MTU. */
10392 for (vl = 0; vl < num_vls; vl++)
10393 dd->vld[vl].mtu = hfi1_max_mtu;
10394 dd->vld[15].mtu = MAX_MAD_PACKET;
10395 /*
10396		 * Set the initial values to reasonable defaults; they will be
10397		 * set for real when the link is up.
10398 */
10399 ppd->lstate = IB_PORT_DOWN;
10400 ppd->overrun_threshold = 0x4;
10401 ppd->phy_error_threshold = 0xf;
10402 ppd->port_crc_mode_enabled = link_crc_mask;
10403 /* initialize supported LTP CRC mode */
10404 ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
10405 /* initialize enabled LTP CRC mode */
10406 ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
10407 /* start in offline */
10408 ppd->host_link_state = HLS_DN_OFFLINE;
10409 init_vl_arb_caches(ppd);
10410 }
10411
10412 dd->link_default = HLS_DN_POLL;
10413
10414 /*
10415 * Do remaining PCIe setup and save PCIe values in dd.
10416 * Any error printing is already done by the init code.
10417 * On return, we have the chip mapped.
10418 */
10419 ret = hfi1_pcie_ddinit(dd, pdev, ent);
10420 if (ret < 0)
10421 goto bail_free;
10422
10423 /* verify that reads actually work, save revision for reset check */
10424 dd->revision = read_csr(dd, CCE_REVISION);
10425 if (dd->revision == ~(u64)0) {
10426 dd_dev_err(dd, "cannot read chip CSRs\n");
10427 ret = -EINVAL;
10428 goto bail_cleanup;
10429 }
10430 dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
10431 & CCE_REVISION_CHIP_REV_MAJOR_MASK;
10432 dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
10433 & CCE_REVISION_CHIP_REV_MINOR_MASK;
10434
10435 /* obtain the hardware ID - NOT related to unit, which is a
10436 software enumeration */
10437 reg = read_csr(dd, CCE_REVISION2);
10438 dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
10439 & CCE_REVISION2_HFI_ID_MASK;
10440 /* the variable size will remove unwanted bits */
10441 dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
10442 dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
10443 dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
10444 dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
10445 (int)dd->irev);
10446
10447 /* speeds the hardware can support */
10448 dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
10449 /* speeds allowed to run at */
10450 dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
10451 /* give a reasonable active value, will be set on link up */
10452 dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
10453
10454 dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
10455 dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
10456 dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
10457 dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
10458 dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
10459 /* fix up link widths for emulation _p */
10460 ppd = dd->pport;
10461 if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
10462 ppd->link_width_supported =
10463 ppd->link_width_enabled =
10464 ppd->link_width_downgrade_supported =
10465 ppd->link_width_downgrade_enabled =
10466 OPA_LINK_WIDTH_1X;
10467 }
10468	/* ensure num_vls isn't larger than the number of sdma engines */
10469 if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
10470 dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
10471 num_vls, HFI1_MAX_VLS_SUPPORTED);
10472 ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED;
10473 ppd->vls_operational = ppd->vls_supported;
10474 }
10475
10476 /*
10477 * Convert the ns parameter to the 64 * cclocks used in the CSR.
10478 * Limit the max if larger than the field holds. If timeout is
10479 * non-zero, then the calculated field will be at least 1.
10480 *
10481 * Must be after icode is set up - the cclock rate depends
10482 * on knowing the hardware being used.
10483 */
10484 dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
10485 if (dd->rcv_intr_timeout_csr >
10486 RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
10487 dd->rcv_intr_timeout_csr =
10488 RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
10489 else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
10490 dd->rcv_intr_timeout_csr = 1;
10491
10492 /* obtain chip sizes, reset chip CSRs */
10493 init_chip(dd);
10494
10495 /* read in the PCIe link speed information */
10496 ret = pcie_speeds(dd);
10497 if (ret)
10498 goto bail_cleanup;
10499
10500 /* needs to be done before we look for the peer device */
10501 read_guid(dd);
10502
10503 asic_should_init(dd);
10504
10505 /* read in firmware */
10506 ret = hfi1_firmware_init(dd);
10507 if (ret)
10508 goto bail_cleanup;
10509
10510 /*
10511 * In general, the PCIe Gen3 transition must occur after the
10512 * chip has been idled (so it won't initiate any PCIe transactions
10513 * e.g. an interrupt) and before the driver changes any registers
10514 * (the transition will reset the registers).
10515 *
10516 * In particular, place this call after:
10517 * - init_chip() - the chip will not initiate any PCIe transactions
10518 * - pcie_speeds() - reads the current link speed
10519 * - hfi1_firmware_init() - the needed firmware is ready to be
10520 * downloaded
10521 */
10522 ret = do_pcie_gen3_transition(dd);
10523 if (ret)
10524 goto bail_cleanup;
10525
10526 /* start setting dd values and adjusting CSRs */
10527 init_early_variables(dd);
10528
10529 parse_platform_config(dd);
10530
10531 /* add board names as they are defined */
10532 dd->boardname = kmalloc(64, GFP_KERNEL);
10533	if (!dd->boardname) {
10534		ret = -ENOMEM;
		goto bail_cleanup;
	}
10535 snprintf(dd->boardname, 64, "Board ID 0x%llx",
10536 dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT
10537 & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK);
10538
10539 snprintf(dd->boardversion, BOARD_VERS_MAX,
10540 "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n",
10541 HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
10542 dd->boardname,
10543 (u32)dd->majrev,
10544 (u32)dd->minrev,
10545 (dd->revision >> CCE_REVISION_SW_SHIFT)
10546 & CCE_REVISION_SW_MASK);
10547
10548 ret = set_up_context_variables(dd);
10549 if (ret)
10550 goto bail_cleanup;
10551
10552 /* set initial RXE CSRs */
10553 init_rxe(dd);
10554 /* set initial TXE CSRs */
10555 init_txe(dd);
10556 /* set initial non-RXE, non-TXE CSRs */
10557 init_other(dd);
10558 /* set up KDETH QP prefix in both RX and TX CSRs */
10559 init_kdeth_qp(dd);
10560
10561 /* send contexts must be set up before receive contexts */
10562 ret = init_send_contexts(dd);
10563 if (ret)
10564 goto bail_cleanup;
10565
10566 ret = hfi1_create_ctxts(dd);
10567 if (ret)
10568 goto bail_cleanup;
10569
10570 dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
10571 /*
10572 * rcd[0] is guaranteed to be valid by this point. Also, all
10573	 * contexts are using the same value, as per the module parameter.
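	 * Note that operator precedence makes the expression below equal to
	 * rcvhdrqentsize - 2, i.e. the entry size minus what is presumably
	 * the 8-byte (two dword) RHF.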
10574 */
10575 dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
10576
10577 ret = init_pervl_scs(dd);
10578 if (ret)
10579 goto bail_cleanup;
10580
10581 /* sdma init */
10582 for (i = 0; i < dd->num_pports; ++i) {
10583 ret = sdma_init(dd, i);
10584 if (ret)
10585 goto bail_cleanup;
10586 }
10587
10588 /* use contexts created by hfi1_create_ctxts */
10589 ret = set_up_interrupts(dd);
10590 if (ret)
10591 goto bail_cleanup;
10592
10593 /* set up LCB access - must be after set_up_interrupts() */
10594 init_lcb_access(dd);
10595
10596 snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
10597 dd->base_guid & 0xFFFFFF);
10598
10599 dd->oui1 = dd->base_guid >> 56 & 0xFF;
10600 dd->oui2 = dd->base_guid >> 48 & 0xFF;
10601 dd->oui3 = dd->base_guid >> 40 & 0xFF;
10602
10603 ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
10604 if (ret)
10605 goto bail_clear_intr;
10606 check_fabric_firmware_versions(dd);
10607
10608 thermal_init(dd);
10609
10610 ret = init_cntrs(dd);
10611 if (ret)
10612 goto bail_clear_intr;
10613
10614 ret = init_rcverr(dd);
10615 if (ret)
10616 goto bail_free_cntrs;
10617
10618 ret = eprom_init(dd);
10619 if (ret)
10620 goto bail_free_rcverr;
10621
10622 goto bail;
10623
10624bail_free_rcverr:
10625 free_rcverr(dd);
10626bail_free_cntrs:
10627 free_cntrs(dd);
10628bail_clear_intr:
10629 clean_up_interrupts(dd);
10630bail_cleanup:
10631 hfi1_pcie_ddcleanup(dd);
10632bail_free:
10633 hfi1_free_devdata(dd);
10634 dd = ERR_PTR(ret);
10635bail:
10636 return dd;
10637}
10638
10639static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
10640 u32 dw_len)
10641{
10642 u32 delta_cycles;
10643 u32 current_egress_rate = ppd->current_egress_rate;
10644 /* rates here are in units of 10^6 bits/sec */
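	/*
	 * The value returned below is the extra cycle count needed to pace
	 * the packet at the slower desired rate; egress_cycles() presumably
	 * converts a byte length and a rate into fabric cycles, so the
	 * difference grows as desired_egress_rate drops.
	 */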
10645
10646 if (desired_egress_rate == -1)
10647 return 0; /* shouldn't happen */
10648
10649 if (desired_egress_rate >= current_egress_rate)
10650		return 0; /* we can't help it go faster, only slower */
10651
10652 delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
10653 egress_cycles(dw_len * 4, current_egress_rate);
10654
10655 return (u16)delta_cycles;
10656}
10657
10658
10659/**
10660 * create_pbc - build a pbc for transmission
10661 * @flags: special case flags or-ed in built pbc
10662 * @srate_mbs: static rate, in units of 10^6 bits/sec
10663 * @vl: vl
10664 * @dw_len: dword length (header words + data words + pbc words)
10665 *
10666 * Create a PBC with the given flags, rate, VL, and length.
10667 *
10668 * NOTE: The PBC created will not insert any HCRC - all callers but one are
10669 * for verbs, which does not use this PSM feature. The lone other caller
10670 * is for the diagnostic interface which calls this if the user does not
10671 * supply their own PBC.
10672 */
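/*
 * Illustrative (hypothetical) call from a send path:
 *
 *	pbc = create_pbc(ppd, 0, 0, vl, hdr_dwords + data_dwords + 2);
 *
 * where the trailing "+ 2" accounts for the two PBC dwords and a zero
 * srate_mbs skips the static rate delay entirely.
 */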
10673u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
10674 u32 dw_len)
10675{
10676 u64 pbc, delay = 0;
10677
10678 if (unlikely(srate_mbs))
10679 delay = delay_cycles(ppd, srate_mbs, dw_len);
10680
10681 pbc = flags
10682 | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
10683 | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
10684 | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
10685 | (dw_len & PBC_LENGTH_DWS_MASK)
10686 << PBC_LENGTH_DWS_SHIFT;
10687
10688 return pbc;
10689}
10690
10691#define SBUS_THERMAL 0x4f
10692#define SBUS_THERM_MONITOR_MODE 0x1
10693
10694#define THERM_FAILURE(dev, ret, reason) \
10695 dd_dev_err((dd), \
10696 "Thermal sensor initialization failed: %s (%d)\n", \
10697 (reason), (ret))
10698
10699/*
10700 * Initialize the Avago Thermal sensor.
10701 *
10702 * After initialization, enable polling of the thermal sensor through
10703 * the SBus interface. For this to work, the SBus Master
10704 * firmware has to be loaded, because the HW polling
10705 * logic uses SBus interrupts, which are not supported by the
10706 * default firmware. Otherwise, no data will be returned through
10707 * the ASIC_STS_THERM CSR.
10708 */
10709static int thermal_init(struct hfi1_devdata *dd)
10710{
10711 int ret = 0;
10712
10713 if (dd->icode != ICODE_RTL_SILICON ||
10714 !(dd->flags & HFI1_DO_INIT_ASIC))
10715 return ret;
10716
10717 acquire_hw_mutex(dd);
10718 dd_dev_info(dd, "Initializing thermal sensor\n");
10719 /* Thermal Sensor Initialization */
10720 /* Step 1: Reset the Thermal SBus Receiver */
10721 ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
10722 RESET_SBUS_RECEIVER, 0);
10723 if (ret) {
10724 THERM_FAILURE(dd, ret, "Bus Reset");
10725 goto done;
10726 }
10727 /* Step 2: Set Reset bit in Thermal block */
10728 ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
10729 WRITE_SBUS_RECEIVER, 0x1);
10730 if (ret) {
10731 THERM_FAILURE(dd, ret, "Therm Block Reset");
10732 goto done;
10733 }
10734 /* Step 3: Write clock divider value (100MHz -> 2MHz) */
10735 ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
10736 WRITE_SBUS_RECEIVER, 0x32);
10737 if (ret) {
10738 THERM_FAILURE(dd, ret, "Write Clock Div");
10739 goto done;
10740 }
10741 /* Step 4: Select temperature mode */
10742 ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
10743 WRITE_SBUS_RECEIVER,
10744 SBUS_THERM_MONITOR_MODE);
10745 if (ret) {
10746 THERM_FAILURE(dd, ret, "Write Mode Sel");
10747 goto done;
10748 }
10749 /* Step 5: De-assert block reset and start conversion */
10750 ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
10751 WRITE_SBUS_RECEIVER, 0x2);
10752 if (ret) {
10753 THERM_FAILURE(dd, ret, "Write Reset Deassert");
10754 goto done;
10755 }
10756 /* Step 5.1: Wait for first conversion (21.5ms per spec) */
10757 msleep(22);
10758
10759 /* Enable polling of thermal readings */
10760 write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
10761done:
10762 release_hw_mutex(dd);
10763 return ret;
10764}
10765
10766static void handle_temp_err(struct hfi1_devdata *dd)
10767{
10768 struct hfi1_pportdata *ppd = &dd->pport[0];
10769 /*
10770 * Thermal Critical Interrupt
10771 * Put the device into forced freeze mode, take link down to
10772 * offline, and put DC into reset.
10773 */
10774 dd_dev_emerg(dd,
10775 "Critical temperature reached! Forcing device into freeze mode!\n");
10776 dd->flags |= HFI1_FORCED_FREEZE;
10777 start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
10778 /*
10779 * Shut DC down as much and as quickly as possible.
10780 *
10781 * Step 1: Take the link down to OFFLINE. This will cause the
10782 * 8051 to put the Serdes in reset. However, we don't want to
10783 * go through the entire link state machine since we want to
10784 * shutdown ASAP. Furthermore, this is not a graceful shutdown
10785 * but rather an attempt to save the chip.
10786 * Code below is almost the same as quiet_serdes() but avoids
10787 * all the extra work and the sleeps.
10788 */
10789 ppd->driver_link_ready = 0;
10790 ppd->link_enabled = 0;
10791 set_physical_link_state(dd, PLS_OFFLINE |
10792 (OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
10793 /*
10794 * Step 2: Shutdown LCB and 8051
10795 * After shutdown, do not restore DC_CFG_RESET value.
10796 */
10797 dc_shutdown(dd);
10798}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
new file mode 100644
index 000000000000..f89a432c7334
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.h
@@ -0,0 +1,1035 @@
1#ifndef _CHIP_H
2#define _CHIP_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53/*
54 * This file contains all of the defines that are specific to the HFI chip
55 */
56
57/* sizes */
58#define CCE_NUM_MSIX_VECTORS 256
59#define CCE_NUM_INT_CSRS 12
60#define CCE_NUM_INT_MAP_CSRS 96
61#define NUM_INTERRUPT_SOURCES 768
62#define RXE_NUM_CONTEXTS 160
63#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */
64#define RXE_NUM_TID_FLOWS 32
65#define RXE_NUM_DATA_VL 8
66#define TXE_NUM_CONTEXTS 160
67#define TXE_NUM_SDMA_ENGINES 16
68#define NUM_CONTEXTS_PER_SET 8
69#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
70#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
71#define VL_ARB_TABLE_SIZE 16
72#define TXE_NUM_32_BIT_COUNTER 7
73#define TXE_NUM_64_BIT_COUNTER 30
74#define TXE_NUM_DATA_VL 8
75#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */
76#define PIO_BLOCK_SIZE 64 /* bytes */
77#define SDMA_BLOCK_SIZE 64 /* bytes */
78#define RCV_BUF_BLOCK_SIZE 64 /* bytes */
79#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */
80#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */
81#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */
82/* Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
83 at 64 bytes for all generation one devices */
84#define CM_VAU 3
85/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
86#define CM_GLOBAL_CREDITS 0x940
87/* Number of PKey entries in the HW */
88#define MAX_PKEY_VALUES 16
89
90#include "chip_registers.h"
91
92#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET)
93#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
94
95/* PBC flags */
96#define PBC_INTR (1ull << 31)
97#define PBC_DC_INFO_SHIFT (30)
98#define PBC_DC_INFO (1ull << PBC_DC_INFO_SHIFT)
99#define PBC_TEST_EBP (1ull << 29)
100#define PBC_PACKET_BYPASS (1ull << 28)
101#define PBC_CREDIT_RETURN (1ull << 25)
102#define PBC_INSERT_BYPASS_ICRC (1ull << 24)
103#define PBC_TEST_BAD_ICRC (1ull << 23)
104#define PBC_FECN (1ull << 22)
105
106/* PbcInsertHcrc field settings */
107#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */
108#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */
109#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */
110
111/* PBC fields */
112#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
113#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
114#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
115 (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
116 PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
117
118#define PBC_INSERT_HCRC_SHIFT 26
119#define PBC_INSERT_HCRC_MASK 0x3ull
120#define PBC_INSERT_HCRC_SMASK \
121 (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
122
123#define PBC_VL_SHIFT 12
124#define PBC_VL_MASK 0xfull
125#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
126
127#define PBC_LENGTH_DWS_SHIFT 0
128#define PBC_LENGTH_DWS_MASK 0xfffull
129#define PBC_LENGTH_DWS_SMASK \
130 (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
131
132/* Credit Return Fields */
133#define CR_COUNTER_SHIFT 0
134#define CR_COUNTER_MASK 0x7ffull
135#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
136
137#define CR_STATUS_SHIFT 11
138#define CR_STATUS_MASK 0x1ull
139#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
140
141#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
142#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
143#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
144 (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
145 CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
146
147#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
148#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
149#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
150 (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
151 CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
152
153#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
154#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
155#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
156 (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
157 CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
158
159#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
160#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
161#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
162 (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
163 CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
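/*
 * Editorial example, not part of the original patch: reading a credit
 * return field is the usual mask-then-shift pattern using the SMASK and
 * SHIFT values above.  Illustrative sketch only.
 */
static inline u16 example_cr_counter(u64 credit_return)
{
	/* isolate the counter bits, then move them down to bit 0 */
	return (credit_return & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT;
}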
164
165/* interrupt source numbers */
166#define IS_GENERAL_ERR_START 0
167#define IS_SDMAENG_ERR_START 16
168#define IS_SENDCTXT_ERR_START 32
169#define IS_SDMA_START 192 /* includes SDmaProgress, SDmaIdle */
170#define IS_VARIOUS_START 240
171#define IS_DC_START 248
172#define IS_RCVAVAIL_START 256
173#define IS_RCVURGENT_START 416
174#define IS_SENDCREDIT_START 576
175#define IS_RESERVED_START 736
176#define IS_MAX_SOURCES 768
177
178/* derived interrupt source values */
179#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START
180#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START
181#define IS_SENDCTXT_ERR_END IS_SDMA_START
182#define IS_SDMA_END IS_VARIOUS_START
183#define IS_VARIOUS_END IS_DC_START
184#define IS_DC_END IS_RCVAVAIL_START
185#define IS_RCVAVAIL_END IS_RCVURGENT_START
186#define IS_RCVURGENT_END IS_SENDCREDIT_START
187#define IS_SENDCREDIT_END IS_RESERVED_START
188#define IS_RESERVED_END IS_MAX_SOURCES
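/*
 * Editorial example, not part of the original patch: the START/END values
 * above partition the NUM_INTERRUPT_SOURCES sources into half-open ranges,
 * so classifying a source is a simple range test.  Illustrative sketch
 * only, not a helper used by the driver.
 */
static inline int example_is_rcvavail_source(unsigned int source)
{
	return source >= IS_RCVAVAIL_START && source < IS_RCVAVAIL_END;
}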
189
190/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
191#define QSFP1_INT 242
192#define QSFP2_INT 243
193
194/* DCC_CFG_PORT_CONFIG logical link states */
195#define LSTATE_DOWN 0x1
196#define LSTATE_INIT 0x2
197#define LSTATE_ARMED 0x3
198#define LSTATE_ACTIVE 0x4
199
200/* DC8051_STS_CUR_STATE port values (physical link states) */
201#define PLS_DISABLED 0x30
202#define PLS_OFFLINE 0x90
203#define PLS_OFFLINE_QUIET 0x90
204#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91
205#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92
206#define PLS_OFFLINE_REPORT_FAILURE 0x93
207#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94
208#define PLS_POLLING 0x20
209#define PLS_POLLING_QUIET 0x20
210#define PLS_POLLING_ACTIVE 0x21
211#define PLS_CONFIGPHY 0x40
212#define PLS_CONFIGPHY_DEBOUCE 0x40
213#define PLS_CONFIGPHY_ESTCOMM 0x41
214#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42
215#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE 0x43
216#define PLS_CONFIGPHY_OPTEQ 0x44
217#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44
218#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45
219#define PLS_CONFIGPHY_VERIFYCAP 0x46
220#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46
221#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
222#define PLS_CONFIGLT 0x48
223#define PLS_CONFIGLT_CONFIGURE 0x48
224#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49
225#define PLS_LINKUP 0x50
226#define PLS_PHYTEST 0xB0
227#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1
228#define PLS_QUICK_LINKUP 0xe2
229
230/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
231#define HCMD_LOAD_CONFIG_DATA 0x01
232#define HCMD_READ_CONFIG_DATA 0x02
233#define HCMD_CHANGE_PHY_STATE 0x03
234#define HCMD_SEND_LCB_IDLE_MSG 0x04
235#define HCMD_MISC 0x05
236#define HCMD_READ_LCB_IDLE_MSG 0x06
237#define HCMD_READ_LCB_CSR 0x07
238#define HCMD_INTERFACE_TEST 0xff
239
240/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
241#define HCMD_SUCCESS 2
242
243/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
244#define SPICO_ROM_FAILED (1 << 0)
245#define UNKNOWN_FRAME (1 << 1)
246#define TARGET_BER_NOT_MET (1 << 2)
247#define FAILED_SERDES_INTERNAL_LOOPBACK (1 << 3)
248#define FAILED_SERDES_INIT (1 << 4)
249#define FAILED_LNI_POLLING (1 << 5)
250#define FAILED_LNI_DEBOUNCE (1 << 6)
251#define FAILED_LNI_ESTBCOMM (1 << 7)
252#define FAILED_LNI_OPTEQ (1 << 8)
253#define FAILED_LNI_VERIFY_CAP1 (1 << 9)
254#define FAILED_LNI_VERIFY_CAP2 (1 << 10)
255#define FAILED_LNI_CONFIGLT (1 << 11)
256
257#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
258 | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
259 | FAILED_LNI_VERIFY_CAP1 \
260 | FAILED_LNI_VERIFY_CAP2 \
261 | FAILED_LNI_CONFIGLT)
262
263/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
264#define HOST_REQ_DONE (1 << 0)
265#define BC_PWR_MGM_MSG (1 << 1)
266#define BC_SMA_MSG (1 << 2)
267#define BC_BCC_UNKOWN_MSG (1 << 3)
268#define BC_IDLE_UNKNOWN_MSG (1 << 4)
269#define EXT_DEVICE_CFG_REQ (1 << 5)
270#define VERIFY_CAP_FRAME (1 << 6)
271#define LINKUP_ACHIEVED (1 << 7)
272#define LINK_GOING_DOWN (1 << 8)
273#define LINK_WIDTH_DOWNGRADED (1 << 9)
274
275/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
276#define HREQ_LOAD_CONFIG 0x01
277#define HREQ_SAVE_CONFIG 0x02
278#define HREQ_READ_CONFIG 0x03
279#define HREQ_SET_TX_EQ_ABS 0x04
280#define HREQ_SET_TX_EQ_REL 0x05
281#define HREQ_ENABLE 0x06
282#define HREQ_CONFIG_DONE 0xfe
283#define HREQ_INTERFACE_TEST 0xff
284
285/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
286#define HREQ_INVALID 0x01
287#define HREQ_SUCCESS 0x02
288#define HREQ_NOT_SUPPORTED 0x03
289#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */
290#define HREQ_REQUEST_REJECTED 0xfe
291#define HREQ_EXECUTION_ONGOING 0xff
292
293/* MISC host command functions */
294#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
295#define HCMD_MISC_GRANT_LCB_ACCESS 0x2
296
297/* idle flit message types */
298#define IDLE_PHYSICAL_LINK_MGMT 0x1
299#define IDLE_CRU 0x2
300#define IDLE_SMA 0x3
301#define IDLE_POWER_MGMT 0x4
302
303/* idle flit message send fields (both send and read) */
304#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
305#define IDLE_PAYLOAD_SHIFT 8
306#define IDLE_MSG_TYPE_MASK 0xf
307#define IDLE_MSG_TYPE_SHIFT 0
308
309/* idle flit message read fields */
310#define READ_IDLE_MSG_TYPE_MASK 0xf
311#define READ_IDLE_MSG_TYPE_SHIFT 0
312
313/* SMA idle flit payload commands */
314#define SMA_IDLE_ARM 1
315#define SMA_IDLE_ACTIVE 2
316
317/* DC_DC8051_CFG_MODE.GENERAL bits */
318#define DISABLE_SELF_GUID_CHECK 0x2
319
320/*
321 * Eager buffer minimum and maximum sizes supported by the hardware.
322 * All power-of-two sizes in between are supported as well.
323 * MAX_EAGER_BUFFER_TOTAL is the maximum amount of memory that can be
324 * allocated for eager buffers in a single context. All of the other
325 * values are limits on individual RcvArray entries.
326 */
327#define MIN_EAGER_BUFFER (4 * 1024)
328#define MAX_EAGER_BUFFER (256 * 1024)
329#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
330#define MAX_EXPECTED_BUFFER (2048 * 1024)
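/*
 * Editorial example, not part of the original patch: a sketch of the eager
 * buffer size check implied by the comment above, assuming is_power_of_2()
 * from <linux/log2.h> is available in the including translation unit.
 */
static inline int example_valid_eager_size(u32 size)
{
	return size >= MIN_EAGER_BUFFER && size <= MAX_EAGER_BUFFER &&
	       is_power_of_2(size);
}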
331
332/*
333 * Increment for the receive expected/eager base and count CSR fields -
334 * those fields hold multiples of this value.
335 */
336#define RCV_SHIFT 3
337#define RCV_INCREMENT (1 << RCV_SHIFT)
338
339/*
340 * Receive header queue entry increment - the CSR holds multiples of
341 * this value.
342 */
343#define HDRQ_SIZE_SHIFT 5
344#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT)
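/*
 * Editorial example, not part of the original patch: because the receive
 * base/count and header queue size CSR fields hold multiples of
 * RCV_INCREMENT and HDRQ_INCREMENT, programming them is a right shift of
 * the raw value.  Illustrative sketch only; the value is assumed to
 * already be a multiple of the increment.
 */
static inline u32 example_rcv_count_to_csr(u32 entries)
{
	return entries >> RCV_SHIFT;
}

static inline u32 example_hdrq_size_to_csr(u32 entries)
{
	return entries >> HDRQ_SIZE_SHIFT;
}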
345
346/*
347 * Freeze handling flags
348 */
349#define FREEZE_ABORT 0x01 /* do not do recovery */
350#define FREEZE_SELF 0x02 /* initiate the freeze */
351#define FREEZE_LINK_DOWN 0x04 /* link is down */
352
353/*
354 * Chip implementation codes.
355 */
356#define ICODE_RTL_SILICON 0x00
357#define ICODE_RTL_VCS_SIMULATION 0x01
358#define ICODE_FPGA_EMULATION 0x02
359#define ICODE_FUNCTIONAL_SIMULATOR 0x03
360
361/*
362 * 8051 data memory size.
363 */
364#define DC8051_DATA_MEM_SIZE 0x1000
365
366/*
367 * 8051 firmware registers
368 */
369#define NUM_GENERAL_FIELDS 0x17
370#define NUM_LANE_FIELDS 0x8
371
372/* 8051 general register Field IDs */
373#define TX_SETTINGS 0x06
374#define VERIFY_CAP_LOCAL_PHY 0x07
375#define VERIFY_CAP_LOCAL_FABRIC 0x08
376#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09
377#define LOCAL_DEVICE_ID 0x0a
378#define LOCAL_LNI_INFO 0x0c
379#define REMOTE_LNI_INFO 0x0d
380#define MISC_STATUS 0x0e
381#define VERIFY_CAP_REMOTE_PHY 0x0f
382#define VERIFY_CAP_REMOTE_FABRIC 0x10
383#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
384#define LAST_LOCAL_STATE_COMPLETE 0x12
385#define LAST_REMOTE_STATE_COMPLETE 0x13
386#define LINK_QUALITY_INFO 0x14
387#define REMOTE_DEVICE_ID 0x15
388
389/* Lane ID for general configuration registers */
390#define GENERAL_CONFIG 4
391
392/* LOAD_DATA 8051 command shifts and fields */
393#define LOAD_DATA_FIELD_ID_SHIFT 40
394#define LOAD_DATA_FIELD_ID_MASK 0xfull
395#define LOAD_DATA_LANE_ID_SHIFT 32
396#define LOAD_DATA_LANE_ID_MASK 0xfull
397#define LOAD_DATA_DATA_SHIFT 0x0
398#define LOAD_DATA_DATA_MASK 0xffffffffull
399
400/* READ_DATA 8051 command shifts and fields */
401#define READ_DATA_FIELD_ID_SHIFT 40
402#define READ_DATA_FIELD_ID_MASK 0xffull
403#define READ_DATA_LANE_ID_SHIFT 32
404#define READ_DATA_LANE_ID_MASK 0xffull
405#define READ_DATA_DATA_SHIFT 0x0
406#define READ_DATA_DATA_MASK 0xffffffffull
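/*
 * Editorial example, not part of the original patch: a sketch of how the
 * LOAD_DATA shifts and masks above compose the 64-bit payload of a
 * LOAD_DATA 8051 command from a field ID, lane ID and 32-bit data value.
 * The helper name is an illustrative assumption.
 */
static inline u64 example_build_load_data(u8 field_id, u8 lane_id, u32 data)
{
	return ((field_id & LOAD_DATA_FIELD_ID_MASK) << LOAD_DATA_FIELD_ID_SHIFT)
		| ((lane_id & LOAD_DATA_LANE_ID_MASK) << LOAD_DATA_LANE_ID_SHIFT)
		| ((data & LOAD_DATA_DATA_MASK) << LOAD_DATA_DATA_SHIFT);
}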
407
408/* TX settings fields */
409#define ENABLE_LANE_TX_SHIFT 0
410#define ENABLE_LANE_TX_MASK 0xff
411#define TX_POLARITY_INVERSION_SHIFT 8
412#define TX_POLARITY_INVERSION_MASK 0xff
413#define RX_POLARITY_INVERSION_SHIFT 16
414#define RX_POLARITY_INVERSION_MASK 0xff
415#define MAX_RATE_SHIFT 24
416#define MAX_RATE_MASK 0xff
417
418/* verify capability PHY fields */
419#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
420#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1
421#define POWER_MANAGEMENT_SHIFT 0x0
422#define POWER_MANAGEMENT_MASK 0xf
423
424/* 8051 lane register Field IDs */
425#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */
426
427/* SPICO firmware version fields */
428#define SPICO_ROM_VERSION_SHIFT 0
429#define SPICO_ROM_VERSION_MASK 0xffff
430#define SPICO_ROM_PROD_ID_SHIFT 16
431#define SPICO_ROM_PROD_ID_MASK 0xffff
432
433/* verify capability fabric fields */
434#define VAU_SHIFT 0
435#define VAU_MASK 0x0007
436#define Z_SHIFT 3
437#define Z_MASK 0x0001
438#define VCU_SHIFT 4
439#define VCU_MASK 0x0007
440#define VL15BUF_SHIFT 8
441#define VL15BUF_MASK 0x0fff
442#define CRC_SIZES_SHIFT 20
443#define CRC_SIZES_MASK 0x7
444
445/* verify capability local link width fields */
446#define LINK_WIDTH_SHIFT 0 /* also for remote link width */
447#define LINK_WIDTH_MASK 0xffff /* also for remote link width */
448#define LOCAL_FLAG_BITS_SHIFT 16
449#define LOCAL_FLAG_BITS_MASK 0xff
450#define MISC_CONFIG_BITS_SHIFT 24
451#define MISC_CONFIG_BITS_MASK 0xff
452
453/* verify capability remote link width fields */
454#define REMOTE_TX_RATE_SHIFT 16
455#define REMOTE_TX_RATE_MASK 0xff
456
457/* LOCAL_DEVICE_ID fields */
458#define LOCAL_DEVICE_REV_SHIFT 0
459#define LOCAL_DEVICE_REV_MASK 0xff
460#define LOCAL_DEVICE_ID_SHIFT 8
461#define LOCAL_DEVICE_ID_MASK 0xffff
462
463/* REMOTE_DEVICE_ID fields */
464#define REMOTE_DEVICE_REV_SHIFT 0
465#define REMOTE_DEVICE_REV_MASK 0xff
466#define REMOTE_DEVICE_ID_SHIFT 8
467#define REMOTE_DEVICE_ID_MASK 0xffff
468
469/* local LNI link width fields */
470#define ENABLE_LANE_RX_SHIFT 16
471#define ENABLE_LANE_RX_MASK 0xff
472
473/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
474#define MGMT_ALLOWED_SHIFT 23
475#define MGMT_ALLOWED_MASK 0x1
476
477/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
478#define LINK_QUALITY_SHIFT 24
479#define LINK_QUALITY_MASK 0x7
480
481/*
482 * mask, shift for reading 'planned_down_remote_reason_code'
483 * from LINK_QUALITY_INFO field
484 */
485#define DOWN_REMOTE_REASON_SHIFT 16
486#define DOWN_REMOTE_REASON_MASK 0xff
487
488/* verify capability PHY power management bits */
489#define PWRM_BER_CONTROL 0x1
490#define PWRM_BANDWIDTH_CONTROL 0x2
491
492/* verify capability fabric CRC size bits */
493enum {
494 CAP_CRC_14B = (1 << 0), /* 14b CRC */
495 CAP_CRC_48B = (1 << 1), /* 48b CRC */
496 CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
497};
498
499#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
500
501/* misc status version fields */
502#define STS_FM_VERSION_A_SHIFT 16
503#define STS_FM_VERSION_A_MASK 0xff
504#define STS_FM_VERSION_B_SHIFT 24
505#define STS_FM_VERSION_B_MASK 0xff
506
507/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
508#define LCB_CRC_16B 0x0 /* 16b CRC */
509#define LCB_CRC_14B 0x1 /* 14b CRC */
510#define LCB_CRC_48B 0x2 /* 48b CRC */
511#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */
512
513/* the following enum is (almost) a copy/paste of the definition
514 * in the OPA spec, section 20.2.2.6.8 (PortInfo) */
515enum {
516 PORT_LTP_CRC_MODE_NONE = 0,
517 PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
518 PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
519 PORT_LTP_CRC_MODE_48 = 4,
520 /* 48-bit overlapping LTP CRC mode (optional) */
521 PORT_LTP_CRC_MODE_PER_LANE = 8
522 /* 12 to 16 bit per lane LTP CRC mode (optional) */
523};
524
525/* timeouts */
526#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */
527#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */
528#define DC8051_COMMAND_TIMEOUT 20000 /* DC8051 command timeout, in ms */
529#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */
530#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */
531#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */
532
533/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */
534#define ASIC_CCLOCK_PS 1242 /* 805 MHz */
535#define FPGA_CCLOCK_PS 30300 /* 33 MHz */
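/*
 * Editorial example, not part of the original patch: with the tick period
 * expressed in picoseconds, a nanosecond count converts to cclocks as
 * ns * 1000 / tick_ps.  Illustrative 32-bit sketch only (it ignores
 * overflow for large ns values); the driver's ns_to_cclock() and
 * cclock_to_ns() are the real conversion routines.
 */
static inline u32 example_ns_to_asic_cclock(u32 ns)
{
	/* round up to the next whole cclock tick */
	return (ns * 1000 + ASIC_CCLOCK_PS - 1) / ASIC_CCLOCK_PS;
}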
536
537/*
538 * Mask of enabled MISC errors. Do not enable the two RSA engine errors -
539 * see firmware.c:run_rsa() for details.
540 */
541#define DRIVER_MISC_MASK \
542 (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
543 | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
544
545/* valid values for the loopback module parameter */
546#define LOOPBACK_NONE 0 /* no loopback - default */
547#define LOOPBACK_SERDES 1
548#define LOOPBACK_LCB 2
549#define LOOPBACK_CABLE 3 /* external cable */
550
551/* read and write hardware registers */
552u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
553void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
554
555/*
556 * The *_kctxt_* flavor of the CSR read/write functions is for
557 * per-context or per-SDMA CSRs that are not mappable to user space.
558 * Their spacing is not a PAGE_SIZE multiple.
559 */
560static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
561 u32 offset0)
562{
563 /* kernel per-context CSRs are separated by 0x100 */
564 return read_csr(dd, offset0 + (0x100 * ctxt));
565}
566
567static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
568 u32 offset0, u64 value)
569{
570 /* kernel per-context CSRs are separated by 0x100 */
571 write_csr(dd, offset0 + (0x100 * ctxt), value);
572}
573
574int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
575int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
576
577void __iomem *get_csr_addr(
578 struct hfi1_devdata *dd,
579 u32 offset);
580
581static inline void __iomem *get_kctxt_csr_addr(
582 struct hfi1_devdata *dd,
583 int ctxt,
584 u32 offset0)
585{
586 return get_csr_addr(dd, offset0 + (0x100 * ctxt));
587}
588
589/*
590 * The *_uctxt_* flavor of the CSR read/write functions is for
591 * per-context CSRs that are mappable to user space. All of these CSRs
592 * are spaced by a PAGE_SIZE multiple so that a context can be mapped
593 * into a process without exposing other contexts' CSRs.
594 */
595static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
596 u32 offset0)
597{
598 /* user per-context CSRs are separated by 0x1000 */
599 return read_csr(dd, offset0 + (0x1000 * ctxt));
600}
601
602static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
603 u32 offset0, u64 value)
604{
605 /* user per-context CSRs are separated by 0x1000 */
606 write_csr(dd, offset0 + (0x1000 * ctxt), value);
607}
608
609u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, u32 dw_len);
610
611/* firmware.c */
612#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */
613extern const u8 pcie_serdes_broadcast[];
614extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
615/* SBus commands */
616#define RESET_SBUS_RECEIVER 0x20
617#define WRITE_SBUS_RECEIVER 0x21
618void sbus_request(struct hfi1_devdata *dd,
619 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
620int sbus_request_slow(struct hfi1_devdata *dd,
621 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
622void set_sbus_fast_mode(struct hfi1_devdata *dd);
623void clear_sbus_fast_mode(struct hfi1_devdata *dd);
624int hfi1_firmware_init(struct hfi1_devdata *dd);
625int load_pcie_firmware(struct hfi1_devdata *dd);
626int load_firmware(struct hfi1_devdata *dd);
627void dispose_firmware(void);
628int acquire_hw_mutex(struct hfi1_devdata *dd);
629void release_hw_mutex(struct hfi1_devdata *dd);
630void fabric_serdes_reset(struct hfi1_devdata *dd);
631int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
632
633/* chip.c */
634void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
635void read_guid(struct hfi1_devdata *dd);
636int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
637void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
638 u8 neigh_reason, u8 rem_reason);
639int set_link_state(struct hfi1_pportdata *, u32 state);
640int port_ltp_to_cap(int port_ltp);
641void handle_verify_cap(struct work_struct *work);
642void handle_freeze(struct work_struct *work);
643void handle_link_up(struct work_struct *work);
644void handle_link_down(struct work_struct *work);
645void handle_link_downgrade(struct work_struct *work);
646void handle_link_bounce(struct work_struct *work);
647void handle_sma_message(struct work_struct *work);
648void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
649int send_idle_sma(struct hfi1_devdata *dd, u64 message);
650int start_link(struct hfi1_pportdata *ppd);
651void init_qsfp(struct hfi1_pportdata *ppd);
652int bringup_serdes(struct hfi1_pportdata *ppd);
653void set_intr_state(struct hfi1_devdata *dd, u32 enable);
654void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
655 int refresh_widths);
656void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
657int stop_drain_data_vls(struct hfi1_devdata *dd);
658int open_fill_data_vls(struct hfi1_devdata *dd);
659u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
660u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
661void get_linkup_link_widths(struct hfi1_pportdata *ppd);
662void read_ltp_rtt(struct hfi1_devdata *dd);
663void clear_linkup_counters(struct hfi1_devdata *dd);
664u32 hdrqempty(struct hfi1_ctxtdata *rcd);
665int is_a0(struct hfi1_devdata *dd);
666int is_ax(struct hfi1_devdata *dd);
667int is_bx(struct hfi1_devdata *dd);
668u32 read_physical_state(struct hfi1_devdata *dd);
669u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
670u32 get_logical_state(struct hfi1_pportdata *ppd);
671const char *opa_lstate_name(u32 lstate);
672const char *opa_pstate_name(u32 pstate);
673u32 driver_physical_state(struct hfi1_pportdata *ppd);
674u32 driver_logical_state(struct hfi1_pportdata *ppd);
675
676int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
677int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
678#define LCB_START DC_LCB_CSRS
679#define LCB_END DC_8051_CSRS /* next block is 8051 */
680static inline int is_lcb_offset(u32 offset)
681{
682 return (offset >= LCB_START && offset < LCB_END);
683}
684
685extern uint num_vls;
686
687extern uint disable_integrity;
688u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
689u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
690u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
691u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
692
693/* Per VL indexes */
694enum {
695 C_VL_0 = 0,
696 C_VL_1,
697 C_VL_2,
698 C_VL_3,
699 C_VL_4,
700 C_VL_5,
701 C_VL_6,
702 C_VL_7,
703 C_VL_15,
704 C_VL_COUNT
705};
706
707static inline int vl_from_idx(int idx)
708{
709 return (idx == C_VL_15 ? 15 : idx);
710}
711
712static inline int idx_from_vl(int vl)
713{
714 return (vl == 15 ? C_VL_15 : vl);
715}
716
717/* Per device counter indexes */
718enum {
719 C_RCV_OVF = 0,
720 C_RX_TID_FULL,
721 C_RX_TID_INVALID,
722 C_RX_TID_FLGMS,
723 C_RX_CTX_RHQS,
724 C_RX_CTX_EGRS,
725 C_RCV_TID_FLSMS,
726 C_CCE_PCI_CR_ST,
727 C_CCE_PCI_TR_ST,
728 C_CCE_PIO_WR_ST,
729 C_CCE_ERR_INT,
730 C_CCE_SDMA_INT,
731 C_CCE_MISC_INT,
732 C_CCE_RCV_AV_INT,
733 C_CCE_RCV_URG_INT,
734 C_CCE_SEND_CR_INT,
735 C_DC_UNC_ERR,
736 C_DC_RCV_ERR,
737 C_DC_FM_CFG_ERR,
738 C_DC_RMT_PHY_ERR,
739 C_DC_DROPPED_PKT,
740 C_DC_MC_XMIT_PKTS,
741 C_DC_MC_RCV_PKTS,
742 C_DC_XMIT_CERR,
743 C_DC_RCV_CERR,
744 C_DC_RCV_FCC,
745 C_DC_XMIT_FCC,
746 C_DC_XMIT_FLITS,
747 C_DC_RCV_FLITS,
748 C_DC_XMIT_PKTS,
749 C_DC_RCV_PKTS,
750 C_DC_RX_FLIT_VL,
751 C_DC_RX_PKT_VL,
752 C_DC_RCV_FCN,
753 C_DC_RCV_FCN_VL,
754 C_DC_RCV_BCN,
755 C_DC_RCV_BCN_VL,
756 C_DC_RCV_BBL,
757 C_DC_RCV_BBL_VL,
758 C_DC_MARK_FECN,
759 C_DC_MARK_FECN_VL,
760 C_DC_TOTAL_CRC,
761 C_DC_CRC_LN0,
762 C_DC_CRC_LN1,
763 C_DC_CRC_LN2,
764 C_DC_CRC_LN3,
765 C_DC_CRC_MULT_LN,
766 C_DC_TX_REPLAY,
767 C_DC_RX_REPLAY,
768 C_DC_SEQ_CRC_CNT,
769 C_DC_ESC0_ONLY_CNT,
770 C_DC_ESC0_PLUS1_CNT,
771 C_DC_ESC0_PLUS2_CNT,
772 C_DC_REINIT_FROM_PEER_CNT,
773 C_DC_SBE_CNT,
774 C_DC_MISC_FLG_CNT,
775 C_DC_PRF_GOOD_LTP_CNT,
776 C_DC_PRF_ACCEPTED_LTP_CNT,
777 C_DC_PRF_RX_FLIT_CNT,
778 C_DC_PRF_TX_FLIT_CNT,
779 C_DC_PRF_CLK_CNTR,
780 C_DC_PG_DBG_FLIT_CRDTS_CNT,
781 C_DC_PG_STS_PAUSE_COMPLETE_CNT,
782 C_DC_PG_STS_TX_SBE_CNT,
783 C_DC_PG_STS_TX_MBE_CNT,
784 C_SW_CPU_INTR,
785 C_SW_CPU_RCV_LIM,
786 C_SW_VTX_WAIT,
787 C_SW_PIO_WAIT,
788 C_SW_KMEM_WAIT,
789 DEV_CNTR_LAST /* Must be kept last */
790};
791
792/* Per port counter indexes */
793enum {
794 C_TX_UNSUP_VL = 0,
795 C_TX_INVAL_LEN,
796 C_TX_MM_LEN_ERR,
797 C_TX_UNDERRUN,
798 C_TX_FLOW_STALL,
799 C_TX_DROPPED,
800 C_TX_HDR_ERR,
801 C_TX_PKT,
802 C_TX_WORDS,
803 C_TX_WAIT,
804 C_TX_FLIT_VL,
805 C_TX_PKT_VL,
806 C_TX_WAIT_VL,
807 C_RX_PKT,
808 C_RX_WORDS,
809 C_SW_LINK_DOWN,
810 C_SW_LINK_UP,
811 C_SW_XMIT_DSCD,
812 C_SW_XMIT_DSCD_VL,
813 C_SW_XMIT_CSTR_ERR,
814 C_SW_RCV_CSTR_ERR,
815 C_SW_IBP_LOOP_PKTS,
816 C_SW_IBP_RC_RESENDS,
817 C_SW_IBP_RNR_NAKS,
818 C_SW_IBP_OTHER_NAKS,
819 C_SW_IBP_RC_TIMEOUTS,
820 C_SW_IBP_PKT_DROPS,
821 C_SW_IBP_DMA_WAIT,
822 C_SW_IBP_RC_SEQNAK,
823 C_SW_IBP_RC_DUPREQ,
824 C_SW_IBP_RDMA_SEQ,
825 C_SW_IBP_UNALIGNED,
826 C_SW_IBP_SEQ_NAK,
827 C_SW_CPU_RC_ACKS,
828 C_SW_CPU_RC_QACKS,
829 C_SW_CPU_RC_DELAYED_COMP,
830 C_RCV_HDR_OVF_0,
831 C_RCV_HDR_OVF_1,
832 C_RCV_HDR_OVF_2,
833 C_RCV_HDR_OVF_3,
834 C_RCV_HDR_OVF_4,
835 C_RCV_HDR_OVF_5,
836 C_RCV_HDR_OVF_6,
837 C_RCV_HDR_OVF_7,
838 C_RCV_HDR_OVF_8,
839 C_RCV_HDR_OVF_9,
840 C_RCV_HDR_OVF_10,
841 C_RCV_HDR_OVF_11,
842 C_RCV_HDR_OVF_12,
843 C_RCV_HDR_OVF_13,
844 C_RCV_HDR_OVF_14,
845 C_RCV_HDR_OVF_15,
846 C_RCV_HDR_OVF_16,
847 C_RCV_HDR_OVF_17,
848 C_RCV_HDR_OVF_18,
849 C_RCV_HDR_OVF_19,
850 C_RCV_HDR_OVF_20,
851 C_RCV_HDR_OVF_21,
852 C_RCV_HDR_OVF_22,
853 C_RCV_HDR_OVF_23,
854 C_RCV_HDR_OVF_24,
855 C_RCV_HDR_OVF_25,
856 C_RCV_HDR_OVF_26,
857 C_RCV_HDR_OVF_27,
858 C_RCV_HDR_OVF_28,
859 C_RCV_HDR_OVF_29,
860 C_RCV_HDR_OVF_30,
861 C_RCV_HDR_OVF_31,
862 C_RCV_HDR_OVF_32,
863 C_RCV_HDR_OVF_33,
864 C_RCV_HDR_OVF_34,
865 C_RCV_HDR_OVF_35,
866 C_RCV_HDR_OVF_36,
867 C_RCV_HDR_OVF_37,
868 C_RCV_HDR_OVF_38,
869 C_RCV_HDR_OVF_39,
870 C_RCV_HDR_OVF_40,
871 C_RCV_HDR_OVF_41,
872 C_RCV_HDR_OVF_42,
873 C_RCV_HDR_OVF_43,
874 C_RCV_HDR_OVF_44,
875 C_RCV_HDR_OVF_45,
876 C_RCV_HDR_OVF_46,
877 C_RCV_HDR_OVF_47,
878 C_RCV_HDR_OVF_48,
879 C_RCV_HDR_OVF_49,
880 C_RCV_HDR_OVF_50,
881 C_RCV_HDR_OVF_51,
882 C_RCV_HDR_OVF_52,
883 C_RCV_HDR_OVF_53,
884 C_RCV_HDR_OVF_54,
885 C_RCV_HDR_OVF_55,
886 C_RCV_HDR_OVF_56,
887 C_RCV_HDR_OVF_57,
888 C_RCV_HDR_OVF_58,
889 C_RCV_HDR_OVF_59,
890 C_RCV_HDR_OVF_60,
891 C_RCV_HDR_OVF_61,
892 C_RCV_HDR_OVF_62,
893 C_RCV_HDR_OVF_63,
894 C_RCV_HDR_OVF_64,
895 C_RCV_HDR_OVF_65,
896 C_RCV_HDR_OVF_66,
897 C_RCV_HDR_OVF_67,
898 C_RCV_HDR_OVF_68,
899 C_RCV_HDR_OVF_69,
900 C_RCV_HDR_OVF_70,
901 C_RCV_HDR_OVF_71,
902 C_RCV_HDR_OVF_72,
903 C_RCV_HDR_OVF_73,
904 C_RCV_HDR_OVF_74,
905 C_RCV_HDR_OVF_75,
906 C_RCV_HDR_OVF_76,
907 C_RCV_HDR_OVF_77,
908 C_RCV_HDR_OVF_78,
909 C_RCV_HDR_OVF_79,
910 C_RCV_HDR_OVF_80,
911 C_RCV_HDR_OVF_81,
912 C_RCV_HDR_OVF_82,
913 C_RCV_HDR_OVF_83,
914 C_RCV_HDR_OVF_84,
915 C_RCV_HDR_OVF_85,
916 C_RCV_HDR_OVF_86,
917 C_RCV_HDR_OVF_87,
918 C_RCV_HDR_OVF_88,
919 C_RCV_HDR_OVF_89,
920 C_RCV_HDR_OVF_90,
921 C_RCV_HDR_OVF_91,
922 C_RCV_HDR_OVF_92,
923 C_RCV_HDR_OVF_93,
924 C_RCV_HDR_OVF_94,
925 C_RCV_HDR_OVF_95,
926 C_RCV_HDR_OVF_96,
927 C_RCV_HDR_OVF_97,
928 C_RCV_HDR_OVF_98,
929 C_RCV_HDR_OVF_99,
930 C_RCV_HDR_OVF_100,
931 C_RCV_HDR_OVF_101,
932 C_RCV_HDR_OVF_102,
933 C_RCV_HDR_OVF_103,
934 C_RCV_HDR_OVF_104,
935 C_RCV_HDR_OVF_105,
936 C_RCV_HDR_OVF_106,
937 C_RCV_HDR_OVF_107,
938 C_RCV_HDR_OVF_108,
939 C_RCV_HDR_OVF_109,
940 C_RCV_HDR_OVF_110,
941 C_RCV_HDR_OVF_111,
942 C_RCV_HDR_OVF_112,
943 C_RCV_HDR_OVF_113,
944 C_RCV_HDR_OVF_114,
945 C_RCV_HDR_OVF_115,
946 C_RCV_HDR_OVF_116,
947 C_RCV_HDR_OVF_117,
948 C_RCV_HDR_OVF_118,
949 C_RCV_HDR_OVF_119,
950 C_RCV_HDR_OVF_120,
951 C_RCV_HDR_OVF_121,
952 C_RCV_HDR_OVF_122,
953 C_RCV_HDR_OVF_123,
954 C_RCV_HDR_OVF_124,
955 C_RCV_HDR_OVF_125,
956 C_RCV_HDR_OVF_126,
957 C_RCV_HDR_OVF_127,
958 C_RCV_HDR_OVF_128,
959 C_RCV_HDR_OVF_129,
960 C_RCV_HDR_OVF_130,
961 C_RCV_HDR_OVF_131,
962 C_RCV_HDR_OVF_132,
963 C_RCV_HDR_OVF_133,
964 C_RCV_HDR_OVF_134,
965 C_RCV_HDR_OVF_135,
966 C_RCV_HDR_OVF_136,
967 C_RCV_HDR_OVF_137,
968 C_RCV_HDR_OVF_138,
969 C_RCV_HDR_OVF_139,
970 C_RCV_HDR_OVF_140,
971 C_RCV_HDR_OVF_141,
972 C_RCV_HDR_OVF_142,
973 C_RCV_HDR_OVF_143,
974 C_RCV_HDR_OVF_144,
975 C_RCV_HDR_OVF_145,
976 C_RCV_HDR_OVF_146,
977 C_RCV_HDR_OVF_147,
978 C_RCV_HDR_OVF_148,
979 C_RCV_HDR_OVF_149,
980 C_RCV_HDR_OVF_150,
981 C_RCV_HDR_OVF_151,
982 C_RCV_HDR_OVF_152,
983 C_RCV_HDR_OVF_153,
984 C_RCV_HDR_OVF_154,
985 C_RCV_HDR_OVF_155,
986 C_RCV_HDR_OVF_156,
987 C_RCV_HDR_OVF_157,
988 C_RCV_HDR_OVF_158,
989 C_RCV_HDR_OVF_159,
990 PORT_CNTR_LAST /* Must be kept last */
991};
992
993u64 get_all_cpu_total(u64 __percpu *cntr);
994void hfi1_start_cleanup(struct hfi1_devdata *dd);
995void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
996struct hfi1_message_header *hfi1_get_msgheader(
997 struct hfi1_devdata *dd, __le32 *rhf_addr);
998int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
999 struct hfi1_ctxt_info *kinfo);
1000u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
1001 u32 mask);
1002int hfi1_init_ctxt(struct send_context *sc);
1003void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
1004 u32 type, unsigned long pa, u16 order);
1005void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
1006void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
1007u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
1008 u64 **cntrp);
1009u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
1010 char **namep, u64 **cntrp);
1011u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
1012int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
1013int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
1014int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
1015int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
1016int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
1017int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
1018void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
1019
1020/*
1021 * Interrupt source table.
1022 *
1023 * Each entry is an interrupt source "type". The table is ordered by
1024 * increasing source number.
1025 */
1026struct is_table {
1027 int start; /* interrupt source type start */
1028 int end; /* interrupt source type end */
1029 /* routine that returns the name of the interrupt source */
1030 char *(*is_name)(char *name, size_t size, unsigned int source);
1031 /* routine to call when receiving an interrupt */
1032 void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
1033};
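/*
 * Editorial example, not part of the original patch: a sketch of how an
 * is_table entry is intended to be used - check that the source falls in
 * the entry's [start, end) range, then hand it to the entry's handler.
 */
static inline void example_dispatch_is(const struct is_table *entry,
				       struct hfi1_devdata *dd,
				       unsigned int source)
{
	if (source >= (unsigned int)entry->start &&
	    source < (unsigned int)entry->end)
		entry->is_int(dd, source);
}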
1034
1035#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
new file mode 100644
index 000000000000..bf45de29d8bd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -0,0 +1,1292 @@
1#ifndef DEF_CHIP_REG
2#define DEF_CHIP_REG
3
4/*
5 *
6 * This file is provided under a dual BSD/GPLv2 license. When using or
7 * redistributing this file, you may do so under either license.
8 *
9 * GPL LICENSE SUMMARY
10 *
11 * Copyright(c) 2015 Intel Corporation.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of version 2 of the GNU General Public License as
15 * published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * BSD LICENSE
23 *
24 * Copyright(c) 2015 Intel Corporation.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 *
30 * - Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * - Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in
34 * the documentation and/or other materials provided with the
35 * distribution.
36 * - Neither the name of Intel Corporation nor the names of its
37 * contributors may be used to endorse or promote products derived
38 * from this software without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 *
52 */
53
54#define CORE 0x000000000000
55#define CCE (CORE + 0x000000000000)
56#define ASIC (CORE + 0x000000400000)
57#define MISC (CORE + 0x000000500000)
58#define DC_TOP_CSRS (CORE + 0x000000600000)
59#define CHIP_DEBUG (CORE + 0x000000700000)
60#define RXE (CORE + 0x000001000000)
61#define TXE (CORE + 0x000001800000)
62#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000)
63#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000)
64#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000)
65#define PCIE 0
66
67#define ASIC_NUM_SCRATCH 4
68#define CCE_ERR_INT_CNT 0
69#define CCE_MISC_INT_CNT 2
70#define CCE_NUM_32_BIT_COUNTERS 3
71#define CCE_NUM_32_BIT_INT_COUNTERS 6
72#define CCE_NUM_INT_CSRS 12
73#define CCE_NUM_INT_MAP_CSRS 96
74#define CCE_NUM_MSIX_PBAS 4
75#define CCE_NUM_MSIX_VECTORS 256
76#define CCE_NUM_SCRATCH 4
77#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
78#define CCE_PCIE_TRGT_STALL_CNT 0
79#define CCE_PIO_WR_STALL_CNT 1
80#define CCE_RCV_AVAIL_INT_CNT 3
81#define CCE_RCV_URGENT_INT_CNT 4
82#define CCE_SDMA_INT_CNT 1
83#define CCE_SEND_CREDIT_INT_CNT 5
84#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
85#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
86#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
87#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
88#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
89#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
90#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
91#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
92#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
93#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
94#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
95#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
96#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
97#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
98#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
99#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
100#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
101#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
102#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
103#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
104#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
105#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
106#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
107#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
108#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
109#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
110#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
111#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
112#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
113#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
114#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
115#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
116#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
117#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
118#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
119#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
120#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
121#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
122#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
123#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
124#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
125#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
126#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
127#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
128#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
129#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
130#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
131#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
132#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
133#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
134#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
135#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
136#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
137#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
138#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
139#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
140#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
141#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
142#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
143#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
144#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
145#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
146#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
147#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
148#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
149#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
150#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
151#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
152#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
153#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
154#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
155#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
156#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
157#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
158#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
159#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
160#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
161#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
162#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
163#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
164#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
165#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
166#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
167#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
168#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
169#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
170#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
171#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
172#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
173#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
174#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
175#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
176#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
177#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
178#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
179#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
180#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
181#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
182#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
183#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
184#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
185#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
186#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
187#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
188#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
189#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
190#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
191#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
192#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
193#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
194#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
195#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
196#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
197#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
198#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
199#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
200#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
201#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
202#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
203#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
204#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
205#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
206#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
207#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
208#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
209#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
210#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
211#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
212#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
213#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
214#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
215#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
216#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
217#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
218#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
219#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
220#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
221#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
222#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
223#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
224#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
225#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
226#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
227#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
228#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
229#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
230#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
231#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
232#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
233#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
234#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
235#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
236#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
237#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
238#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
239#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
240#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
241#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
242#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
243#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
244#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
245#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
246#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
247#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
248#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
249#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
250#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
251#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
252#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
253#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
254#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
255#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
256#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
257#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
258#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
259#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
260#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
261#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
262#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
263#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
264#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
265#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
266#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
267#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
268#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
269#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
270#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
271#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
272#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
273#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
274#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
275#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
276#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
277#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
278#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
279#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
280#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
281#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
282#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
283#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
284#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
285#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
286#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
287#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
288#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
289#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
290#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
291#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
292#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
293#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
294#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
295#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
296#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
297#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
298#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
299#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
300#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
301#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
302#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
303#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
304#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
305#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
306#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
307#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
308#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
309#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
310#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
311#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
312#define DC_LCB_CFG_RUN_EN_SHIFT 0
313#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
314#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
315#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
316#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
317#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
318#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
319#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
320#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
321#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
322#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
323#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
324#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
325#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
326#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
327#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
328#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
329#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
330#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
331#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
332#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
333#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
334#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
335#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
336#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
337#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
338#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
339#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
340#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
341#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
342#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
343#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
344#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
345#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
346#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
347#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
348#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
349#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
350#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
351#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
352#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
353#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
354#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
355#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
356#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
357#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
358#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
359#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
360#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
361#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
362#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
363#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
364#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
365#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
366#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
367#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
368#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
369#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
370#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
371#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
372#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
373#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
374#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
375#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
376#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
377#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
378#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
379#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
380#define RCV_BUF_OVFL_CNT 10
381#define RCV_CONTEXT_EGR_STALL 22
382#define RCV_CONTEXT_RHQ_STALL 21
383#define RCV_DATA_PKT_CNT 0
384#define RCV_DWORD_CNT 1
385#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
386#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
387#define RCV_TID_FULL_ERR_CNT 18
388#define RCV_TID_VALID_ERR_CNT 19
389#define RXE_NUM_32_BIT_COUNTERS 24
390#define RXE_NUM_64_BIT_COUNTERS 2
391#define RXE_NUM_RSM_INSTANCES 4
392#define RXE_NUM_TID_FLOWS 32
393#define RXE_PER_CONTEXT_OFFSET 0x0300000
394#define SEND_DATA_PKT_CNT 0
395#define SEND_DATA_PKT_VL0_CNT 12
396#define SEND_DATA_VL0_CNT 3
397#define SEND_DROPPED_PKT_CNT 5
398#define SEND_DWORD_CNT 1
399#define SEND_FLOW_STALL_CNT 4
400#define SEND_HEADERS_ERR_CNT 6
401#define SEND_LEN_ERR_CNT 1
402#define SEND_MAX_MIN_LEN_ERR_CNT 2
403#define SEND_UNDERRUN_CNT 3
404#define SEND_UNSUP_VL_ERR_CNT 0
405#define SEND_WAIT_CNT 2
406#define SEND_WAIT_VL0_CNT 21
407#define TXE_PIO_SEND_OFFSET 0x0800000
408#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
409#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
410#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
411#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
412#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
413#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
414#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
415#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
416#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
417#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
418#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
419#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
420#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
421#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
422#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
423#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
424#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
425#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
426#define ASIC_EEP_DATA (ASIC + 0x000000000310)
427#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
428#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
429#define ASIC_GPIO_IN (ASIC + 0x000000000200)
430#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
431#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
432#define ASIC_GPIO_OE (ASIC + 0x000000000208)
433#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
434#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
435#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
436#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
437#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
438#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
439#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
440#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
441#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
442#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
443#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
444#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
445#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
446#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
447#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
448#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
449#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
450#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
451#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
452#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
453#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
454#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
455#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
456#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
457#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
458#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
459#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
460#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
461#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
462#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
463#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
464#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
465#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
466#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
467#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
468#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
469#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
470#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
471#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
472#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
473#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
474#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
475#define ASIC_STS_THERM (ASIC + 0x000000000058)
476#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
477#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
478#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
479#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
480#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
481#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
482#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
483#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
484#define ASIC_STS_THERM_LOW_SHIFT 13
485#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
486#define CCE_CTRL (CCE + 0x000000000010)
487#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
488#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
489#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
490#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
491#define CCE_DC_CTRL (CCE + 0x0000000000B8)
492#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
493#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
494#define CCE_ERR_CLEAR (CCE + 0x000000000050)
495#define CCE_ERR_MASK (CCE + 0x000000000048)
496#define CCE_ERR_STATUS (CCE + 0x000000000040)
497#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
498#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
499#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
500 0x200ull
501#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
502 0x800ull
503#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
504 0x400ull
505#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
506#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
507#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
508#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
509#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
510#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
511#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
512#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
513#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
514#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
515#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
516#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
517#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
518#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
519#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
520#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
521#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
522#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
523#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
524#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
525#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
526#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
527#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
528#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
529#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
530#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
531#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
532#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
533#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
534#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
535#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
536#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
537#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
538#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
539#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
540#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
541#define CCE_INT_CLEAR (CCE + 0x000000110A00)
542#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
543#define CCE_INT_FORCE (CCE + 0x000000110B00)
544#define CCE_INT_MAP (CCE + 0x000000110500)
545#define CCE_INT_MASK (CCE + 0x000000110900)
546#define CCE_INT_STATUS (CCE + 0x000000110800)
547#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
548#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
549#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
550#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
551#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
552#define CCE_REVISION (CCE + 0x000000000000)
553#define CCE_REVISION2 (CCE + 0x000000000008)
554#define CCE_REVISION2_HFI_ID_MASK 0x1ull
555#define CCE_REVISION2_HFI_ID_SHIFT 0
556#define CCE_REVISION2_IMPL_CODE_SHIFT 8
557#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
558#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
559#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
560#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
561#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
562#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
563#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
564#define CCE_REVISION_SW_MASK 0xFFull
565#define CCE_REVISION_SW_SHIFT 24
566#define CCE_SCRATCH (CCE + 0x000000000020)
567#define CCE_STATUS (CCE + 0x000000000018)
568#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
569#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
570#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
571#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
572#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
573#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
574#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
575#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
576#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
577#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
578#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
579#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
580#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
581#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
582#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
583#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
584#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
585#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
586#define MISC_ERR_CLEAR (MISC + 0x000000002010)
587#define MISC_ERR_MASK (MISC + 0x000000002008)
588#define MISC_ERR_STATUS (MISC + 0x000000002000)
589#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
590#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
591#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
592#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
593#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
594#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
595#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
596#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
597#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
598#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
599#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
600#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
601#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
602#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
603#define PCI_CFG_REG1 (PCIE + 0x000000000004)
604#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
605#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
606#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
607#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
608#define RCV_ARRAY (RXE + 0x000000200000)
609#define RCV_ARRAY_CNT (RXE + 0x000000000018)
610#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
611#define RCV_ARRAY_RT_ADDR_SHIFT 0
612#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
613#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
614#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
615#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
616#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
617#define RCV_BTH_QP (RXE + 0x000000000028)
618#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
619#define RCV_BTH_QP_KDETH_QP_SHIFT 16
620#define RCV_BYPASS (RXE + 0x000000000038)
621#define RCV_CONTEXTS (RXE + 0x000000000010)
622#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
623#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
624#define RCV_CTRL (RXE + 0x000000000000)
625#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
626#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
627#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
628#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
629#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
630#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
631#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
632#define RCV_CTXT_CTRL (RXE + 0x000000100000)
633#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
634#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
635#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
636#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
637#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
638#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
639#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
640#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
641#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
642#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
643#define RCV_CTXT_STATUS (RXE + 0x000000100008)
644#define RCV_EGR_CTRL (RXE + 0x000000100010)
645#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
646#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
647#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
648#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
649#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
650#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
651#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
652#define RCV_ERR_CLEAR (RXE + 0x000000000070)
653#define RCV_ERR_INFO (RXE + 0x000000000050)
654#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
655#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
656#define RCV_ERR_MASK (RXE + 0x000000000068)
657#define RCV_ERR_STATUS (RXE + 0x000000000060)
658#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
659#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
660#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
661 0x4000000000000000ull
662#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
663#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
664#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
665#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
666#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
667#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
668 0x40000000000000ull
669#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
670 0x20000000000000ull
671#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
672 0x800000000000000ull
673#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
674 0x400000000000000ull
675#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
676#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
677#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
678#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
679#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
680#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
681#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
682#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
683 0x10000000000ull
684#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
685#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
686 0x20000000000ull
687#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
688#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
689#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
690#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
691#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
692#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
693#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
694#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
695#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
696 0x200000ull
697#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
698#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
699 0x8000000ull
700#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
701#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
702#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
703#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
704#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
705#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
706#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
707#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
708#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
709#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
710#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
711#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
712 0x1000000000ull
713#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
714#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
715#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
716#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
717#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
718#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
719#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
720#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
721#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
722#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
723#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
724#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
725#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
726#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
727#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
728#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
729#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
730#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
731#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
732#define RCV_HDR_ADDR (RXE + 0x000000100028)
733#define RCV_HDR_CNT (RXE + 0x000000100030)
734#define RCV_HDR_CNT_CNT_MASK 0x1FFull
735#define RCV_HDR_CNT_CNT_SHIFT 0
736#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
737#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
738#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
739#define RCV_HDR_HEAD (RXE + 0x000000300008)
740#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
741#define RCV_HDR_HEAD_COUNTER_SHIFT 32
742#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
743#define RCV_HDR_HEAD_HEAD_SHIFT 0
744#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
745#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
746#define RCV_HDR_SIZE (RXE + 0x000000100040)
747#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
748#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
749#define RCV_HDR_TAIL (RXE + 0x000000300000)
750#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
751#define RCV_KEY_CTRL (RXE + 0x000000100020)
752#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
753#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
754#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
755#define RCV_MULTICAST (RXE + 0x000000000030)
756#define RCV_PARTITION_KEY (RXE + 0x000000000200)
757#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
758#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
759#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
760#define RCV_RSM_CFG (RXE + 0x000000000600)
761#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
762#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
763#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
764#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
765#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
766#define RCV_RSM_MATCH (RXE + 0x000000000800)
767#define RCV_RSM_MATCH_MASK1_SHIFT 0
768#define RCV_RSM_MATCH_MASK2_SHIFT 16
769#define RCV_RSM_MATCH_VALUE1_SHIFT 8
770#define RCV_RSM_MATCH_VALUE2_SHIFT 24
771#define RCV_RSM_SELECT (RXE + 0x000000000700)
772#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
773#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
774#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
775#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
776#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
777#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
778#define RCV_STATUS (RXE + 0x000000000008)
779#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
780#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
781#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
782#define RCV_TID_CTRL (RXE + 0x000000100018)
783#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
784#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
785#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
786#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
787#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
788#define RCV_VL15 (RXE + 0x000000000048)
789#define SEND_BTH_QP (TXE + 0x0000000000A0)
790#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
791#define SEND_BTH_QP_KDETH_QP_SHIFT 16
792#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
793#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
794 0x1000000000000ull
795#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
796 0x8000000000000000ull
797#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
798 0x2000000000000ull
799#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
800 0x4000000000000ull
801#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
802 0x8000000000000ull
803#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
804 0x10000000000000ull
805#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
806 0x20000000000000ull
807#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
808 0x40000000000000ull
809#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
810 0x80000000000000ull
811#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
812#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
813#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
814#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
815#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
816#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
817#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
818#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
819#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
820#define SEND_CM_CTRL (TXE + 0x000000000500)
821#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
822#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
823#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
824#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
825#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
826#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
827#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
828#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
829#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
830#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
831#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
832#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
833#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
834#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
835#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
836#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
837#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
838#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
839#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
840#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
841#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
842#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
843#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
844#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
845#define SEND_CONTEXTS (TXE + 0x000000000010)
846#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
847#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
848#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
849#define SEND_CTRL (TXE + 0x000000000000)
850#define SEND_CTRL_CM_RESET_SMASK 0x4ull
851#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
852#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
853#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
854#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
855#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
856#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
857#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
858#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
859#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
860#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
861#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
862#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
863#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
864 0x200000ull
865#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
866#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
867#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
868#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
869#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
870 0x100000ull
871#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
872#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
873#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
874#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
875 0x80000ull
876#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
877 0x40000ull
878#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
879 0x8000ull
880#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
881 0x4000ull
882#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
883#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
884#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
885#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
886#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
887#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
888#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
889#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
890#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
891#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
892#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
893#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
894#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
895#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
896#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
897#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
898#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
899#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
900#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
901#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
902#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
903#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
904#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
905#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
906#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
907#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
908#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
909#define SEND_CTXT_CTRL (TXE + 0x000000100000)
910#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
911#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
912#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
913#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
914#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
915#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
916#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
917#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
918#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
919#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
920#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
921#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
922#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
923#define SEND_CTXT_STATUS (TXE + 0x000000100008)
924#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
925#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
926#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
927#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
928#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
929#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
930#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
931#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
932#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
933#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
934#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
935#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
936#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
937#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
938 0x100000ull
939#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
940#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
941#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
942 0x80000ull
943#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
944#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
945 0x8000ull
946#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
947#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
948#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
949#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
950#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
951#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
952#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
953#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
954#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
955#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
956#define SEND_DMA_CTRL (TXE + 0x000000200000)
957#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
958#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
959#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
960#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
961#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
962#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
963#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
964#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
965#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
966#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
967#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
968#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
969#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
970#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
971#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
972#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
973#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
974#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
975#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
976#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
977 0x40000ull
978#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
979#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
980 0x20000ull
981#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
982#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
983#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
984 0x100ull
985#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
986 0x10000ull
987#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
988#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
989#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
990#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
991#define SEND_DMA_ENGINES (TXE + 0x000000000018)
992#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
993#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
994#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
995#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
996#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
997#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
998#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
999#define SEND_DMA_HEAD (TXE + 0x000000200028)
1000#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
1001#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
1002#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
1003#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
1004#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
1005#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
1006#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
1007#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
1008#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
1009#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
1010#define SEND_DMA_STATUS (TXE + 0x000000200008)
1011#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
1012#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
1013#define SEND_DMA_TAIL (TXE + 0x000000200020)
1014#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
1015#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
1016#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
1017#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
1018 0x3FFFull
1019#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
1020#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
1021#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
1022#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
1023#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
1024#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
1025#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
1026#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
1027#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
1028#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
1029#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
1030#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
1031#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
1032#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
1033#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
1034#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
1035#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
1036#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
1037#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
1038#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
1039#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
1040#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
1041#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
1042#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
1043#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
1044#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
1045 0x200000000000000ull
1046#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
1047 0x20000000000ull
1048#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
1049 0x800000000000ull
1050#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
1051 0x2000000000000000ull
1052#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
1053 0x200000000000ull
1054#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
1055 0x8ull
1056#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
1057 0x400000000000ull
1058#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
1059#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
1060#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
1061#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
1062 0x1000000000000ull
1063#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
1064 0x100000000ull
1065#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
1066 0x2000000000000ull
1067#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
1068 0x200000000ull
1069#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
1070 0x4000000000000ull
1071#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
1072 0x400000000ull
1073#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
1074 0x8000000000000ull
1075#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
1076 0x800000000ull
1077#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
1078 0x10000000000000ull
1079#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
1080 0x1000000000ull
1081#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
1082 0x20000000000000ull
1083#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
1084 0x2000000000ull
1085#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
1086 0x40000000000000ull
1087#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
1088 0x4000000000ull
1089#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
1090 0x80000000000000ull
1091#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
1092 0x8000000000ull
1093#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
1094 0x100000000000000ull
1095#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
1096 0x10000000000ull
1097#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
1098#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
1099#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
1100#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
1101#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
1102 0x1000000000000000ull
1103#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
1104 0x8000000000000000ull
1105#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
1106 0x100000000000ull
1107#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
1108 0x800000000000000ull
1109#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
1110 0x4000000000000000ull
1111#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
1112 0x80000000000ull
1113#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
1114#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
1115#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
1116#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
1117 0x800ull
1118#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
1119 0x10000ull
1120#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
1121 0x4000000ull
1122#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
1123 0x8000000ull
1124#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
1125 0x10000000ull
1126#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
1127 0x20000000ull
1128#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
1129 0x40000000ull
1130#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
1131 0x80000000ull
1132#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
1133 0x20000ull
1134#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
1135 0x40000ull
1136#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
1137 0x80000ull
1138#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
1139 0x100000ull
1140#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
1141 0x200000ull
1142#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
1143 0x400000ull
1144#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
1145 0x800000ull
1146#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
1147 0x1000000ull
1148#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
1149 0x2000000ull
1150#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
1151 0x100ull
1152#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
1153#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
1154#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
1155 0x3FFFull
1156#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
1157#define SEND_ERR_MASK (TXE + 0x0000000000E8)
1158#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
1159#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
1160#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
1161#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
1162#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
1163#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
1164#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
1165#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
1166#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
1167#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
1168#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
1169#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
1170#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
1171#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
1172#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
1173#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
1174#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
1175#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
1176#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
1177#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
1178#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
1179#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
1180#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
1181#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
1182#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
1183#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
1184 0x1000000ull
1185#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
1186#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
1187#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
1188 0x100000000ull
1189#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
1190#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
1191#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
1192#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
1193 0x200000000ull
1194#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
1195#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
1196 0x400000000ull
1197#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
1198#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
1199 0x800000000ull
1200#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
1201#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
1202#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
1203#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
1204#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
1205#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
1206#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
1207#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
1208#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
1209 0x100ull
1210#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
1211#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
1212#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
1213#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
1214#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
1215#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
1216#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
1217#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
1218#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
1219#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
1220#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
1221#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
1222#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
1223#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
1224#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
1225#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
1226#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
1227#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
1228#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
1229#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
1230#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
1231#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
1232#define SEND_SC2VLT0_SC0_SHIFT 0
1233#define SEND_SC2VLT0_SC1_SHIFT 8
1234#define SEND_SC2VLT0_SC2_SHIFT 16
1235#define SEND_SC2VLT0_SC3_SHIFT 24
1236#define SEND_SC2VLT0_SC4_SHIFT 32
1237#define SEND_SC2VLT0_SC5_SHIFT 40
1238#define SEND_SC2VLT0_SC6_SHIFT 48
1239#define SEND_SC2VLT0_SC7_SHIFT 56
1240#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
1241#define SEND_SC2VLT1_SC10_SHIFT 16
1242#define SEND_SC2VLT1_SC11_SHIFT 24
1243#define SEND_SC2VLT1_SC12_SHIFT 32
1244#define SEND_SC2VLT1_SC13_SHIFT 40
1245#define SEND_SC2VLT1_SC14_SHIFT 48
1246#define SEND_SC2VLT1_SC15_SHIFT 56
1247#define SEND_SC2VLT1_SC8_SHIFT 0
1248#define SEND_SC2VLT1_SC9_SHIFT 8
1249#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
1250#define SEND_SC2VLT2_SC16_SHIFT 0
1251#define SEND_SC2VLT2_SC17_SHIFT 8
1252#define SEND_SC2VLT2_SC18_SHIFT 16
1253#define SEND_SC2VLT2_SC19_SHIFT 24
1254#define SEND_SC2VLT2_SC20_SHIFT 32
1255#define SEND_SC2VLT2_SC21_SHIFT 40
1256#define SEND_SC2VLT2_SC22_SHIFT 48
1257#define SEND_SC2VLT2_SC23_SHIFT 56
1258#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
1259#define SEND_SC2VLT3_SC24_SHIFT 0
1260#define SEND_SC2VLT3_SC25_SHIFT 8
1261#define SEND_SC2VLT3_SC26_SHIFT 16
1262#define SEND_SC2VLT3_SC27_SHIFT 24
1263#define SEND_SC2VLT3_SC28_SHIFT 32
1264#define SEND_SC2VLT3_SC29_SHIFT 40
1265#define SEND_SC2VLT3_SC30_SHIFT 48
1266#define SEND_SC2VLT3_SC31_SHIFT 56
1267#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
1268#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
1269#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
1270#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
1271#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
1272#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
1273#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
1274#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
1275#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
1276#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
1277#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
1278#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
1279#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
1280#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
1281#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
1282#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
1283#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
1284#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
1285#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
1286#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
1287#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
1288#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
1289#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
1290#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
1291
1292#endif /* DEF_CHIP_REG */
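
Every CSR field in the block above follows one pattern: *_SHIFT is the bit position of the field, *_MASK is the unshifted field mask, and *_SMASK is the mask already shifted into place (so single-bit fields are usually tested directly with val & FOO_SMASK). Below is a minimal sketch, illustrative only, of how such a field is decoded from a raw 64-bit CSR value; the helper name is made up, the raw value is assumed to have been read elsewhere, and the constants simply duplicate the CCE_REVISION_CHIP_REV_* defines above so the sketch stands alone.

/*
 * Illustrative sketch: decode the chip revision from a raw CCE_REVISION
 * value.  The helper name is hypothetical; the EX_* constants mirror the
 * CCE_REVISION_CHIP_REV_* defines above.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_CHIP_REV_MAJOR_MASK  0xFFull /* CCE_REVISION_CHIP_REV_MAJOR_MASK */
#define EX_CHIP_REV_MAJOR_SHIFT 8       /* CCE_REVISION_CHIP_REV_MAJOR_SHIFT */
#define EX_CHIP_REV_MINOR_MASK  0xFFull /* CCE_REVISION_CHIP_REV_MINOR_MASK */
#define EX_CHIP_REV_MINOR_SHIFT 0       /* CCE_REVISION_CHIP_REV_MINOR_SHIFT */

static void ex_print_chip_rev(uint64_t cce_revision)
{
	/* (value >> SHIFT) & MASK extracts the field into the low bits */
	unsigned int major = (cce_revision >> EX_CHIP_REV_MAJOR_SHIFT) &
			     EX_CHIP_REV_MAJOR_MASK;
	unsigned int minor = (cce_revision >> EX_CHIP_REV_MINOR_SHIFT) &
			     EX_CHIP_REV_MINOR_MASK;

	printf("chip revision %u.%u\n", major, minor);
}
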
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
new file mode 100644
index 000000000000..5f2293729cf9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/common.h
@@ -0,0 +1,415 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#ifndef _COMMON_H
52#define _COMMON_H
53
54#include <rdma/hfi/hfi1_user.h>
55
56/*
57 * This file contains defines, structures, etc. that are used
58 * to communicate between kernel and user code.
59 */
60
61/* version of protocol header (known to chip also). In the long run,
62 * we should be able to generate and accept a range of version numbers;
63 * for now we only accept one, and it's compiled in.
64 */
65#define IPS_PROTO_VERSION 2
66
67/*
68 * These are compile time constants that you may want to enable or disable
69 * if you are trying to debug problems with code or performance.
70 * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
71 * fast path code
72 * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
73 * traced in fast path code
74 * _HFI1_TRACING define as 0 if you want to remove all tracing in a
75 * compilation unit
76 */
77
78/*
79 * If a packet's QP[23:16] bits match this value, then it is
80 * a PSM packet and the hardware will expect a KDETH header
81 * following the BTH.
82 */
83#define DEFAULT_KDETH_QP 0x80
84
85/* driver/hw feature set bitmask */
86#define HFI1_CAP_USER_SHIFT 24
87#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1)
88/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
89#define HFI1_CAP_LOCKED_SHIFT 63
90#define HFI1_CAP_LOCKED_MASK 0x1ULL
91#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
92/* extra bits used between kernel and user processes */
93#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2)
94#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
95 HFI1_CAP_MISC_SHIFT)) - 1)
96
97#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
98#define HFI1_CAP_KCLEAR(cap) \
99 ({ \
100 hfi1_cap_mask &= ~HFI1_CAP_##cap; \
101 hfi1_cap_mask; \
102 })
103#define HFI1_CAP_USET(cap) \
104 ({ \
105 hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
106 hfi1_cap_mask; \
107 })
108#define HFI1_CAP_UCLEAR(cap) \
109 ({ \
110 hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
111 hfi1_cap_mask; \
112 })
113#define HFI1_CAP_SET(cap) \
114 ({ \
115 hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \
116 HFI1_CAP_USER_SHIFT)); \
117 hfi1_cap_mask; \
118 })
119#define HFI1_CAP_CLEAR(cap) \
120 ({ \
121 hfi1_cap_mask &= ~(HFI1_CAP_##cap | \
122 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
123 hfi1_cap_mask; \
124 })
125#define HFI1_CAP_LOCK() \
126 ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
127#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
128/*
129 * The set of capability bits that can be changed after initial load
130 * This set is the same for kernel and user contexts. However, for
131 * user contexts, the set can be further filtered by using the
132 * HFI1_CAP_RESERVED_MASK bits.
133 */
134#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \
135 HFI1_CAP_HDRSUPP | \
136 HFI1_CAP_MULTI_PKT_EGR | \
137 HFI1_CAP_NODROP_RHQ_FULL | \
138 HFI1_CAP_NODROP_EGR_FULL | \
139 HFI1_CAP_ALLOW_PERM_JKEY | \
140 HFI1_CAP_STATIC_RATE_CTRL | \
141 HFI1_CAP_PRINT_UNIMPL)
142/*
143 * A set of capability bits that are "global" and are not allowed to be
144 * set in the user bitmask.
145 */
146#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
147 HFI1_CAP_USE_SDMA_HEAD | \
148 HFI1_CAP_EXTENDED_PSN | \
149 HFI1_CAP_PRINT_UNIMPL | \
150 HFI1_CAP_QSFP_ENABLED | \
151 HFI1_CAP_NO_INTEGRITY | \
152 HFI1_CAP_PKEY_CHECK) << \
153 HFI1_CAP_USER_SHIFT)
154/*
155 * Set of capabilities that need to be enabled for kernel context in
156 * order to be allowed for user contexts as well.
157 */
158#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
159/* Default enabled capabilities (both kernel and user) */
160#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \
161 HFI1_CAP_NODROP_RHQ_FULL | \
162 HFI1_CAP_NODROP_EGR_FULL | \
163 HFI1_CAP_SDMA | \
164 HFI1_CAP_PRINT_UNIMPL | \
165 HFI1_CAP_STATIC_RATE_CTRL | \
166 HFI1_CAP_QSFP_ENABLED | \
167 HFI1_CAP_PKEY_CHECK | \
168 HFI1_CAP_MULTI_PKT_EGR | \
169 HFI1_CAP_EXTENDED_PSN | \
170 ((HFI1_CAP_HDRSUPP | \
171 HFI1_CAP_MULTI_PKT_EGR | \
172 HFI1_CAP_STATIC_RATE_CTRL | \
173 HFI1_CAP_PKEY_CHECK | \
174 HFI1_CAP_EARLY_CREDIT_RETURN) << \
175 HFI1_CAP_USER_SHIFT))
176/*
177 * A bitmask of kernel/global capabilities that should be communicated
178 * to user level processes.
179 */
180#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \
181 HFI1_CAP_EXTENDED_PSN | \
182 HFI1_CAP_PKEY_CHECK | \
183 HFI1_CAP_NO_INTEGRITY)
184
185#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
186
187#ifndef HFI1_KERN_TYPE
188#define HFI1_KERN_TYPE 0
189#endif
190
191/*
192 * Similarly, this is the kernel version going back to the user. It's
193 * slightly different, in that we want to tell if the driver was built as
194 * part of an Intel release, or from the driver from openfabrics.org,
195 * kernel.org, or a standard distribution, for support reasons.
196 * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
197 *
198 * It's returned by the driver to the user code during initialization in the
199 * spi_sw_version field of hfi1_base_info, so the user code can in turn
200 * check for compatibility with the kernel.
201*/
202#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
203
204/*
205 * Define the driver version number. This is something that refers only
206 * to the driver itself, not the software interfaces it supports.
207 */
208#ifndef HFI1_DRIVER_VERSION_BASE
209#define HFI1_DRIVER_VERSION_BASE "0.9-248"
210#endif
211
212/* create the final driver version string */
213#ifdef HFI1_IDSTR
214#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
215#else
216#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
217#endif
218
219/*
220 * Diagnostics can send a packet by writing the following
221 * struct to the diag packet special file.
222 *
223 * This allows a custom PBC qword, so that special modes and deliberate
224 * changes to CRCs can be used.
225 */
226#define _DIAG_PKT_VERS 1
227struct diag_pkt {
228 __u16 version; /* structure version */
229 __u16 unit; /* which device */
230 __u16 sw_index; /* send sw index to use */
231 __u16 len; /* data length, in bytes */
232 __u16 port; /* port number */
233 __u16 unused;
234 __u32 flags; /* call flags */
235 __u64 data; /* user data pointer */
236 __u64 pbc; /* PBC for the packet */
237};
238
239/* diag_pkt flags */
240#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */
241
242/*
243 * The next set of defines are for packet headers, and chip register
244 * and memory bits that are visible to and/or used by user-mode software.
245 */
246
247/*
248 * Receive Header Flags
249 */
250#define RHF_PKT_LEN_SHIFT 0
251#define RHF_PKT_LEN_MASK 0xfffull
252#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
253
254#define RHF_RCV_TYPE_SHIFT 12
255#define RHF_RCV_TYPE_MASK 0x7ull
256#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
257
258#define RHF_USE_EGR_BFR_SHIFT 15
259#define RHF_USE_EGR_BFR_MASK 0x1ull
260#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
261
262#define RHF_EGR_INDEX_SHIFT 16
263#define RHF_EGR_INDEX_MASK 0x7ffull
264#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
265
266#define RHF_DC_INFO_SHIFT 27
267#define RHF_DC_INFO_MASK 0x1ull
268#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
269
270#define RHF_RCV_SEQ_SHIFT 28
271#define RHF_RCV_SEQ_MASK 0xfull
272#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
273
274#define RHF_EGR_OFFSET_SHIFT 32
275#define RHF_EGR_OFFSET_MASK 0xfffull
276#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
277#define RHF_HDRQ_OFFSET_SHIFT 44
278#define RHF_HDRQ_OFFSET_MASK 0x1ffull
279#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
280#define RHF_K_HDR_LEN_ERR (0x1ull << 53)
281#define RHF_DC_UNC_ERR (0x1ull << 54)
282#define RHF_DC_ERR (0x1ull << 55)
283#define RHF_RCV_TYPE_ERR_SHIFT 56
284#define RHF_RCV_TYPE_ERR_MASK 0x7ul
285#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
286#define RHF_TID_ERR (0x1ull << 59)
287#define RHF_LEN_ERR (0x1ull << 60)
288#define RHF_ECC_ERR (0x1ull << 61)
289#define RHF_VCRC_ERR (0x1ull << 62)
290#define RHF_ICRC_ERR (0x1ull << 63)
291
292#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */
293
294/* RHF receive types */
295#define RHF_RCV_TYPE_EXPECTED 0
296#define RHF_RCV_TYPE_EAGER 1
297#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */
298#define RHF_RCV_TYPE_ERROR 3
299#define RHF_RCV_TYPE_BYPASS 4
300#define RHF_RCV_TYPE_INVALID5 5
301#define RHF_RCV_TYPE_INVALID6 6
302#define RHF_RCV_TYPE_INVALID7 7
303
304/* RHF receive type error - expected packet errors */
305#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2
306#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4
307
308/* RHF receive type error - eager packet errors */
309#define RHF_RTE_EAGER_NO_ERR 0x0
310
311/* RHF receive type error - IB packet errors */
312#define RHF_RTE_IB_NO_ERR 0x0
313
314/* RHF receive type error - error packet errors */
315#define RHF_RTE_ERROR_NO_ERR 0x0
316#define RHF_RTE_ERROR_OP_CODE_ERR 0x1
317#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
318#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3
319#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4
320#define RHF_RTE_ERROR_CONTEXT_ERR 0x5
321#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6
322
323/* RHF receive type error - bypass packet errors */
324#define RHF_RTE_BYPASS_NO_ERR 0x0
325
326/*
327 * This structure contains the first field common to all protocols
328 * that employ this chip.
329 */
330struct hfi1_message_header {
331 __be16 lrh[4];
332};
333
334/* IB - LRH header constants */
335#define HFI1_LRH_GRH 0x0003 /* 1st word of IB LRH - next header: GRH */
336#define HFI1_LRH_BTH 0x0002 /* 1st word of IB LRH - next header: BTH */
337
338/* misc. */
339#define SIZE_OF_CRC 1
340
341#define LIM_MGMT_P_KEY 0x7FFF
342#define FULL_MGMT_P_KEY 0xFFFF
343
344#define DEFAULT_P_KEY LIM_MGMT_P_KEY
345#define HFI1_PERMISSIVE_LID 0xFFFF
346#define HFI1_AETH_CREDIT_SHIFT 24
347#define HFI1_AETH_CREDIT_MASK 0x1F
348#define HFI1_AETH_CREDIT_INVAL 0x1F
349#define HFI1_MSN_MASK 0xFFFFFF
350#define HFI1_QPN_MASK 0xFFFFFF
351#define HFI1_FECN_SHIFT 31
352#define HFI1_FECN_MASK 1
353#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT)
354#define HFI1_BECN_SHIFT 30
355#define HFI1_BECN_MASK 1
356#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT)
357#define HFI1_MULTICAST_LID_BASE 0xC000
358
359static inline __u64 rhf_to_cpu(const __le32 *rbuf)
360{
361 return __le64_to_cpu(*((__le64 *)rbuf));
362}
363
364static inline u64 rhf_err_flags(u64 rhf)
365{
366 return rhf & RHF_ERROR_SMASK;
367}
368
369static inline u32 rhf_rcv_type(u64 rhf)
370{
371 return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
372}
373
374static inline u32 rhf_rcv_type_err(u64 rhf)
375{
376 return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
377}
378
379/* return size is in bytes, not DWORDs */
380static inline u32 rhf_pkt_len(u64 rhf)
381{
382 return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
383}
384
385static inline u32 rhf_egr_index(u64 rhf)
386{
387 return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
388}
389
390static inline u32 rhf_rcv_seq(u64 rhf)
391{
392 return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
393}
394
395/* returned offset is in DWORDS */
396static inline u32 rhf_hdrq_offset(u64 rhf)
397{
398 return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
399}
400
401static inline u64 rhf_use_egr_bfr(u64 rhf)
402{
403 return rhf & RHF_USE_EGR_BFR_SMASK;
404}
405
406static inline u64 rhf_dc_info(u64 rhf)
407{
408 return rhf & RHF_DC_INFO_SMASK;
409}
410
411static inline u32 rhf_egr_buf_offset(u64 rhf)
412{
413 return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
414}
415#endif /* _COMMON_H */
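(Editorial sketch, not part of the merged patch: the RHF accessors above are meant to compose in a receive path roughly as below. example_process_rhf() and the per-type handling are hypothetical; only the rhf_* helpers and RHF_RCV_TYPE_* values come from common.h.)

/* Hypothetical consumer of the common.h RHF accessors. */
static void example_process_rhf(const __le32 *rhf_addr)
{
	u64 rhf = rhf_to_cpu(rhf_addr);

	if (rhf_err_flags(rhf))
		return;	/* one of the RHF_*_ERR bits (63:53) is set */

	switch (rhf_rcv_type(rhf)) {
	case RHF_RCV_TYPE_EAGER:
		/*
		 * Payload is in the eager buffer named by rhf_egr_index(rhf);
		 * rhf_pkt_len(rhf) gives its length in bytes.
		 */
		break;
	case RHF_RCV_TYPE_EXPECTED:
		/* payload was placed directly by expected (TID) receive */
		break;
	default:
		break;
	}
}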
diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/staging/rdma/hfi1/cq.c
new file mode 100644
index 000000000000..4f046ffe7e60
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/cq.c
@@ -0,0 +1,558 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/err.h>
52#include <linux/slab.h>
53#include <linux/vmalloc.h>
54#include <linux/kthread.h>
55
56#include "verbs.h"
57#include "hfi.h"
58
59/**
60 * hfi1_cq_enter - add a new entry to the completion queue
61 * @cq: completion queue
62 * @entry: work completion entry to add
 64 * @solicited: true if @entry is a solicited entry
64 *
65 * This may be called with qp->s_lock held.
66 */
67void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited)
68{
69 struct hfi1_cq_wc *wc;
70 unsigned long flags;
71 u32 head;
72 u32 next;
73
74 spin_lock_irqsave(&cq->lock, flags);
75
76 /*
77 * Note that the head pointer might be writable by user processes.
78 * Take care to verify it is a sane value.
79 */
80 wc = cq->queue;
81 head = wc->head;
82 if (head >= (unsigned) cq->ibcq.cqe) {
83 head = cq->ibcq.cqe;
84 next = 0;
85 } else
86 next = head + 1;
87 if (unlikely(next == wc->tail)) {
88 spin_unlock_irqrestore(&cq->lock, flags);
89 if (cq->ibcq.event_handler) {
90 struct ib_event ev;
91
92 ev.device = cq->ibcq.device;
93 ev.element.cq = &cq->ibcq;
94 ev.event = IB_EVENT_CQ_ERR;
95 cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
96 }
97 return;
98 }
99 if (cq->ip) {
100 wc->uqueue[head].wr_id = entry->wr_id;
101 wc->uqueue[head].status = entry->status;
102 wc->uqueue[head].opcode = entry->opcode;
103 wc->uqueue[head].vendor_err = entry->vendor_err;
104 wc->uqueue[head].byte_len = entry->byte_len;
105 wc->uqueue[head].ex.imm_data =
106 (__u32 __force)entry->ex.imm_data;
107 wc->uqueue[head].qp_num = entry->qp->qp_num;
108 wc->uqueue[head].src_qp = entry->src_qp;
109 wc->uqueue[head].wc_flags = entry->wc_flags;
110 wc->uqueue[head].pkey_index = entry->pkey_index;
111 wc->uqueue[head].slid = entry->slid;
112 wc->uqueue[head].sl = entry->sl;
113 wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
114 wc->uqueue[head].port_num = entry->port_num;
115 /* Make sure entry is written before the head index. */
116 smp_wmb();
117 } else
118 wc->kqueue[head] = *entry;
119 wc->head = next;
120
121 if (cq->notify == IB_CQ_NEXT_COMP ||
122 (cq->notify == IB_CQ_SOLICITED &&
123 (solicited || entry->status != IB_WC_SUCCESS))) {
124 struct kthread_worker *worker;
125 /*
126 * This will cause send_complete() to be called in
127 * another thread.
128 */
129 smp_read_barrier_depends(); /* see hfi1_cq_exit */
130 worker = cq->dd->worker;
131 if (likely(worker)) {
132 cq->notify = IB_CQ_NONE;
133 cq->triggered++;
134 queue_kthread_work(worker, &cq->comptask);
135 }
136 }
137
138 spin_unlock_irqrestore(&cq->lock, flags);
139}
140
141/**
142 * hfi1_poll_cq - poll for work completion entries
143 * @ibcq: the completion queue to poll
144 * @num_entries: the maximum number of entries to return
145 * @entry: pointer to array where work completions are placed
146 *
147 * Returns the number of completion entries polled.
148 *
149 * This may be called from interrupt context. Also called by ib_poll_cq()
150 * in the generic verbs code.
151 */
152int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
153{
154 struct hfi1_cq *cq = to_icq(ibcq);
155 struct hfi1_cq_wc *wc;
156 unsigned long flags;
157 int npolled;
158 u32 tail;
159
160 /* The kernel can only poll a kernel completion queue */
161 if (cq->ip) {
162 npolled = -EINVAL;
163 goto bail;
164 }
165
166 spin_lock_irqsave(&cq->lock, flags);
167
168 wc = cq->queue;
169 tail = wc->tail;
170 if (tail > (u32) cq->ibcq.cqe)
171 tail = (u32) cq->ibcq.cqe;
172 for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
173 if (tail == wc->head)
174 break;
175 /* The kernel doesn't need a RMB since it has the lock. */
176 *entry = wc->kqueue[tail];
177 if (tail >= cq->ibcq.cqe)
178 tail = 0;
179 else
180 tail++;
181 }
182 wc->tail = tail;
183
184 spin_unlock_irqrestore(&cq->lock, flags);
185
186bail:
187 return npolled;
188}
189
190static void send_complete(struct kthread_work *work)
191{
192 struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask);
193
194 /*
195 * The completion handler will most likely rearm the notification
196 * and poll for all pending entries. If a new completion entry
 197 * is added while we are in this routine, queue_kthread_work()
198 * won't call us again until we return so we check triggered to
199 * see if we need to call the handler again.
200 */
201 for (;;) {
202 u8 triggered = cq->triggered;
203
204 /*
205 * IPoIB connected mode assumes the callback is from a
206 * soft IRQ. We simulate this by blocking "bottom halves".
207 * See the implementation for ipoib_cm_handle_tx_wc(),
208 * netif_tx_lock_bh() and netif_tx_lock().
209 */
210 local_bh_disable();
211 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
212 local_bh_enable();
213
214 if (cq->triggered == triggered)
215 return;
216 }
217}
218
219/**
220 * hfi1_create_cq - create a completion queue
221 * @ibdev: the device this completion queue is attached to
222 * @attr: creation attributes
223 * @context: unused by the driver
224 * @udata: user data for libibverbs.so
225 *
226 * Returns a pointer to the completion queue or negative errno values
227 * for failure.
228 *
229 * Called by ib_create_cq() in the generic verbs code.
230 */
231struct ib_cq *hfi1_create_cq(
232 struct ib_device *ibdev,
233 const struct ib_cq_init_attr *attr,
234 struct ib_ucontext *context,
235 struct ib_udata *udata)
236{
237 struct hfi1_ibdev *dev = to_idev(ibdev);
238 struct hfi1_cq *cq;
239 struct hfi1_cq_wc *wc;
240 struct ib_cq *ret;
241 u32 sz;
242 unsigned int entries = attr->cqe;
243
244 if (attr->flags)
245 return ERR_PTR(-EINVAL);
246
247 if (entries < 1 || entries > hfi1_max_cqes)
248 return ERR_PTR(-EINVAL);
249
250 /* Allocate the completion queue structure. */
251 cq = kmalloc(sizeof(*cq), GFP_KERNEL);
252 if (!cq)
253 return ERR_PTR(-ENOMEM);
254
255 /*
256 * Allocate the completion queue entries and head/tail pointers.
257 * This is allocated separately so that it can be resized and
258 * also mapped into user space.
259 * We need to use vmalloc() in order to support mmap and large
260 * numbers of entries.
261 */
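	/*
	 * A user CQ (udata present) stores ib_uverbs_wc entries that are
	 * mmap()ed into the process; a kernel CQ stores ib_wc entries.
	 * The ring holds one extra slot so a full queue (next == tail)
	 * can be told apart from an empty one (head == tail).
	 */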
262 sz = sizeof(*wc);
263 if (udata && udata->outlen >= sizeof(__u64))
264 sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
265 else
266 sz += sizeof(struct ib_wc) * (entries + 1);
267 wc = vmalloc_user(sz);
268 if (!wc) {
269 ret = ERR_PTR(-ENOMEM);
270 goto bail_cq;
271 }
272
273 /*
274 * Return the address of the WC as the offset to mmap.
275 * See hfi1_mmap() for details.
276 */
277 if (udata && udata->outlen >= sizeof(__u64)) {
278 int err;
279
280 cq->ip = hfi1_create_mmap_info(dev, sz, context, wc);
281 if (!cq->ip) {
282 ret = ERR_PTR(-ENOMEM);
283 goto bail_wc;
284 }
285
286 err = ib_copy_to_udata(udata, &cq->ip->offset,
287 sizeof(cq->ip->offset));
288 if (err) {
289 ret = ERR_PTR(err);
290 goto bail_ip;
291 }
292 } else
293 cq->ip = NULL;
294
295 spin_lock(&dev->n_cqs_lock);
296 if (dev->n_cqs_allocated == hfi1_max_cqs) {
297 spin_unlock(&dev->n_cqs_lock);
298 ret = ERR_PTR(-ENOMEM);
299 goto bail_ip;
300 }
301
302 dev->n_cqs_allocated++;
303 spin_unlock(&dev->n_cqs_lock);
304
305 if (cq->ip) {
306 spin_lock_irq(&dev->pending_lock);
307 list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
308 spin_unlock_irq(&dev->pending_lock);
309 }
310
311 /*
312 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
313 * The number of entries should be >= the number requested or return
314 * an error.
315 */
316 cq->dd = dd_from_dev(dev);
317 cq->ibcq.cqe = entries;
318 cq->notify = IB_CQ_NONE;
319 cq->triggered = 0;
320 spin_lock_init(&cq->lock);
321 init_kthread_work(&cq->comptask, send_complete);
322 wc->head = 0;
323 wc->tail = 0;
324 cq->queue = wc;
325
326 ret = &cq->ibcq;
327
328 goto done;
329
330bail_ip:
331 kfree(cq->ip);
332bail_wc:
333 vfree(wc);
334bail_cq:
335 kfree(cq);
336done:
337 return ret;
338}
339
340/**
341 * hfi1_destroy_cq - destroy a completion queue
342 * @ibcq: the completion queue to destroy.
343 *
344 * Returns 0 for success.
345 *
346 * Called by ib_destroy_cq() in the generic verbs code.
347 */
348int hfi1_destroy_cq(struct ib_cq *ibcq)
349{
350 struct hfi1_ibdev *dev = to_idev(ibcq->device);
351 struct hfi1_cq *cq = to_icq(ibcq);
352
353 flush_kthread_work(&cq->comptask);
354 spin_lock(&dev->n_cqs_lock);
355 dev->n_cqs_allocated--;
356 spin_unlock(&dev->n_cqs_lock);
357 if (cq->ip)
358 kref_put(&cq->ip->ref, hfi1_release_mmap_info);
359 else
360 vfree(cq->queue);
361 kfree(cq);
362
363 return 0;
364}
365
366/**
367 * hfi1_req_notify_cq - change the notification type for a completion queue
368 * @ibcq: the completion queue
369 * @notify_flags: the type of notification to request
370 *
371 * Returns 0 for success.
372 *
373 * This may be called from interrupt context. Also called by
374 * ib_req_notify_cq() in the generic verbs code.
375 */
376int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
377{
378 struct hfi1_cq *cq = to_icq(ibcq);
379 unsigned long flags;
380 int ret = 0;
381
382 spin_lock_irqsave(&cq->lock, flags);
383 /*
384 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
385 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
386 */
387 if (cq->notify != IB_CQ_NEXT_COMP)
388 cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
389
390 if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
391 cq->queue->head != cq->queue->tail)
392 ret = 1;
393
394 spin_unlock_irqrestore(&cq->lock, flags);
395
396 return ret;
397}
398
399/**
400 * hfi1_resize_cq - change the size of the CQ
401 * @ibcq: the completion queue
    * @cqe: the minimum number of entries the resized queue must support
    * @udata: if non-NULL, used to return the new mmap offset to user space
402 *
403 * Returns 0 for success.
404 */
405int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
406{
407 struct hfi1_cq *cq = to_icq(ibcq);
408 struct hfi1_cq_wc *old_wc;
409 struct hfi1_cq_wc *wc;
410 u32 head, tail, n;
411 int ret;
412 u32 sz;
413
414 if (cqe < 1 || cqe > hfi1_max_cqes) {
415 ret = -EINVAL;
416 goto bail;
417 }
418
419 /*
420 * Need to use vmalloc() if we want to support large #s of entries.
421 */
422 sz = sizeof(*wc);
423 if (udata && udata->outlen >= sizeof(__u64))
424 sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
425 else
426 sz += sizeof(struct ib_wc) * (cqe + 1);
427 wc = vmalloc_user(sz);
428 if (!wc) {
429 ret = -ENOMEM;
430 goto bail;
431 }
432
433 /* Check that we can write the offset to mmap. */
434 if (udata && udata->outlen >= sizeof(__u64)) {
435 __u64 offset = 0;
436
437 ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
438 if (ret)
439 goto bail_free;
440 }
441
442 spin_lock_irq(&cq->lock);
443 /*
444 * Make sure head and tail are sane since they
445 * might be user writable.
446 */
447 old_wc = cq->queue;
448 head = old_wc->head;
449 if (head > (u32) cq->ibcq.cqe)
450 head = (u32) cq->ibcq.cqe;
451 tail = old_wc->tail;
452 if (tail > (u32) cq->ibcq.cqe)
453 tail = (u32) cq->ibcq.cqe;
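	/* Count the entries currently queued; they must fit in the new ring. */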
454 if (head < tail)
455 n = cq->ibcq.cqe + 1 + head - tail;
456 else
457 n = head - tail;
458 if (unlikely((u32)cqe < n)) {
459 ret = -EINVAL;
460 goto bail_unlock;
461 }
462 for (n = 0; tail != head; n++) {
463 if (cq->ip)
464 wc->uqueue[n] = old_wc->uqueue[tail];
465 else
466 wc->kqueue[n] = old_wc->kqueue[tail];
467 if (tail == (u32) cq->ibcq.cqe)
468 tail = 0;
469 else
470 tail++;
471 }
472 cq->ibcq.cqe = cqe;
473 wc->head = n;
474 wc->tail = 0;
475 cq->queue = wc;
476 spin_unlock_irq(&cq->lock);
477
478 vfree(old_wc);
479
480 if (cq->ip) {
481 struct hfi1_ibdev *dev = to_idev(ibcq->device);
482 struct hfi1_mmap_info *ip = cq->ip;
483
484 hfi1_update_mmap_info(dev, ip, sz, wc);
485
486 /*
487 * Return the offset to mmap.
488 * See hfi1_mmap() for details.
489 */
490 if (udata && udata->outlen >= sizeof(__u64)) {
491 ret = ib_copy_to_udata(udata, &ip->offset,
492 sizeof(ip->offset));
493 if (ret)
494 goto bail;
495 }
496
497 spin_lock_irq(&dev->pending_lock);
498 if (list_empty(&ip->pending_mmaps))
499 list_add(&ip->pending_mmaps, &dev->pending_mmaps);
500 spin_unlock_irq(&dev->pending_lock);
501 }
502
503 ret = 0;
504 goto bail;
505
506bail_unlock:
507 spin_unlock_irq(&cq->lock);
508bail_free:
509 vfree(wc);
510bail:
511 return ret;
512}
513
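/*
 * Create the per-device completion worker: a single kthread, bound to the
 * first CPU of the device's NUMA node, that runs send_complete() for every
 * CQ on this device.
 */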
514int hfi1_cq_init(struct hfi1_devdata *dd)
515{
516 int ret = 0;
517 int cpu;
518 struct task_struct *task;
519
520 if (dd->worker)
521 return 0;
522 dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
523 if (!dd->worker)
524 return -ENOMEM;
525 init_kthread_worker(dd->worker);
526 task = kthread_create_on_node(
527 kthread_worker_fn,
528 dd->worker,
529 dd->assigned_node_id,
530 "hfi1_cq%d", dd->unit);
531 if (IS_ERR(task))
532 goto task_fail;
533 cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
534 kthread_bind(task, cpu);
535 wake_up_process(task);
536out:
537 return ret;
538task_fail:
539 ret = PTR_ERR(task);
540 kfree(dd->worker);
541 dd->worker = NULL;
542 goto out;
543}
544
545void hfi1_cq_exit(struct hfi1_devdata *dd)
546{
547 struct kthread_worker *worker;
548
549 worker = dd->worker;
550 if (!worker)
551 return;
552 /* blocks future queuing from send_complete() */
553 dd->worker = NULL;
554 smp_wmb(); /* See hfi1_cq_enter */
555 flush_kthread_worker(worker);
556 kthread_stop(worker->task);
557 kfree(worker);
558}
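(Editorial sketch, not part of the merged patch: kernel ULPs never call hfi1_poll_cq()/hfi1_req_notify_cq() directly; they go through the generic verbs wrappers, roughly as below. The handler name and the drain/re-arm policy are illustrative assumptions; the "returns 1" case of IB_CQ_REPORT_MISSED_EVENTS corresponds to the head != tail check in hfi1_req_notify_cq() above.)

#include <rdma/ib_verbs.h>

/* Hypothetical completion handler registered through ib_create_cq(). */
static void example_cq_handler(struct ib_cq *cq, void *cq_context)
{
	struct ib_wc wc;

	/* Drain, then re-arm and drain again to close the notification race. */
	do {
		while (ib_poll_cq(cq, 1, &wc) > 0) {
			if (wc.status != IB_WC_SUCCESS)
				pr_warn("wc error %d on wr_id %llu\n",
					wc.status, wc.wr_id);
		}
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
}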
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
new file mode 100644
index 000000000000..acd2269e9f14
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.c
@@ -0,0 +1,899 @@
1#ifdef CONFIG_DEBUG_FS
2/*
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2015 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * BSD LICENSE
21 *
22 * Copyright(c) 2015 Intel Corporation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 *
28 * - Redistributions of source code must retain the above copyright
29 * notice, this list of conditions and the following disclaimer.
30 * - Redistributions in binary form must reproduce the above copyright
31 * notice, this list of conditions and the following disclaimer in
32 * the documentation and/or other materials provided with the
33 * distribution.
34 * - Neither the name of Intel Corporation nor the names of its
35 * contributors may be used to endorse or promote products derived
36 * from this software without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
39 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
40 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
41 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
42 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 *
50 */
51#include <linux/debugfs.h>
52#include <linux/seq_file.h>
53#include <linux/kernel.h>
54#include <linux/export.h>
55
56#include "hfi.h"
57#include "debugfs.h"
58#include "device.h"
59#include "qp.h"
60#include "sdma.h"
61
62static struct dentry *hfi1_dbg_root;
63
64#define private2dd(file) (file_inode(file)->i_private)
65#define private2ppd(file) (file_inode(file)->i_private)
66
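/*
 * seq_file plumbing: for a debugfs file "name" the caller supplies
 * _name_seq_{start,next,stop,show}; DEBUGFS_SEQ_FILE_OPS bundles them into
 * _name_seq_ops, DEBUGFS_SEQ_FILE_OPEN generates the open routine that
 * stashes the inode private data in the seq_file, and DEBUGFS_FILE_OPS
 * produces the file_operations used at creation time.
 */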
67#define DEBUGFS_SEQ_FILE_OPS(name) \
68static const struct seq_operations _##name##_seq_ops = { \
69 .start = _##name##_seq_start, \
70 .next = _##name##_seq_next, \
71 .stop = _##name##_seq_stop, \
72 .show = _##name##_seq_show \
73}
74#define DEBUGFS_SEQ_FILE_OPEN(name) \
75static int _##name##_open(struct inode *inode, struct file *s) \
76{ \
77 struct seq_file *seq; \
78 int ret; \
79 ret = seq_open(s, &_##name##_seq_ops); \
80 if (ret) \
81 return ret; \
82 seq = s->private_data; \
83 seq->private = inode->i_private; \
84 return 0; \
85}
86
87#define DEBUGFS_FILE_OPS(name) \
88static const struct file_operations _##name##_file_ops = { \
89 .owner = THIS_MODULE, \
90 .open = _##name##_open, \
91 .read = seq_read, \
92 .llseek = seq_lseek, \
93 .release = seq_release \
94}
95
96#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
97do { \
98 struct dentry *ent; \
99 ent = debugfs_create_file(name, mode, parent, \
100 data, ops); \
101 if (!ent) \
102 pr_warn("create of %s failed\n", name); \
103} while (0)
104
105
106#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
107 DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
108
109static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
110__acquires(RCU)
111{
112 struct hfi1_opcode_stats_perctx *opstats;
113
114 rcu_read_lock();
115 if (*pos >= ARRAY_SIZE(opstats->stats))
116 return NULL;
117 return pos;
118}
119
120static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
121{
122 struct hfi1_opcode_stats_perctx *opstats;
123
124 ++*pos;
125 if (*pos >= ARRAY_SIZE(opstats->stats))
126 return NULL;
127 return pos;
128}
129
130
131static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
132__releases(RCU)
133{
134 rcu_read_unlock();
135}
136
137static int _opcode_stats_seq_show(struct seq_file *s, void *v)
138{
139 loff_t *spos = v;
140 loff_t i = *spos, j;
141 u64 n_packets = 0, n_bytes = 0;
142 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
143 struct hfi1_devdata *dd = dd_from_dev(ibd);
144
145 for (j = 0; j < dd->first_user_ctxt; j++) {
146 if (!dd->rcd[j])
147 continue;
148 n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
149 n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
150 }
151 if (!n_packets && !n_bytes)
152 return SEQ_SKIP;
153 seq_printf(s, "%02llx %llu/%llu\n", i,
154 (unsigned long long) n_packets,
155 (unsigned long long) n_bytes);
156
157 return 0;
158}
159
160DEBUGFS_SEQ_FILE_OPS(opcode_stats);
161DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
162DEBUGFS_FILE_OPS(opcode_stats);
163
164static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
165{
166 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
167 struct hfi1_devdata *dd = dd_from_dev(ibd);
168
169 if (!*pos)
170 return SEQ_START_TOKEN;
171 if (*pos >= dd->first_user_ctxt)
172 return NULL;
173 return pos;
174}
175
176static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
177{
178 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
179 struct hfi1_devdata *dd = dd_from_dev(ibd);
180
181 if (v == SEQ_START_TOKEN)
182 return pos;
183
184 ++*pos;
185 if (*pos >= dd->first_user_ctxt)
186 return NULL;
187 return pos;
188}
189
190static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
191{
192 /* nothing allocated */
193}
194
195static int _ctx_stats_seq_show(struct seq_file *s, void *v)
196{
197 loff_t *spos;
198 loff_t i, j;
199 u64 n_packets = 0;
200 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
201 struct hfi1_devdata *dd = dd_from_dev(ibd);
202
203 if (v == SEQ_START_TOKEN) {
204 seq_puts(s, "Ctx:npkts\n");
205 return 0;
206 }
207
208 spos = v;
209 i = *spos;
210
211 if (!dd->rcd[i])
212 return SEQ_SKIP;
213
214 for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
215 n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
216
217 if (!n_packets)
218 return SEQ_SKIP;
219
220 seq_printf(s, " %llu:%llu\n", i, n_packets);
221 return 0;
222}
223
224DEBUGFS_SEQ_FILE_OPS(ctx_stats);
225DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
226DEBUGFS_FILE_OPS(ctx_stats);
227
228static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
229__acquires(RCU)
230{
231 struct qp_iter *iter;
232 loff_t n = *pos;
233
234 rcu_read_lock();
235 iter = qp_iter_init(s->private);
236 if (!iter)
237 return NULL;
238
239 while (n--) {
240 if (qp_iter_next(iter)) {
241 kfree(iter);
242 return NULL;
243 }
244 }
245
246 return iter;
247}
248
249static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
250 loff_t *pos)
251{
252 struct qp_iter *iter = iter_ptr;
253
254 (*pos)++;
255
256 if (qp_iter_next(iter)) {
257 kfree(iter);
258 return NULL;
259 }
260
261 return iter;
262}
263
264static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
265__releases(RCU)
266{
267 rcu_read_unlock();
268}
269
270static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
271{
272 struct qp_iter *iter = iter_ptr;
273
274 if (!iter)
275 return 0;
276
277 qp_iter_print(s, iter);
278
279 return 0;
280}
281
282DEBUGFS_SEQ_FILE_OPS(qp_stats);
283DEBUGFS_SEQ_FILE_OPEN(qp_stats)
284DEBUGFS_FILE_OPS(qp_stats);
285
286static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
287__acquires(RCU)
288{
289 struct hfi1_ibdev *ibd;
290 struct hfi1_devdata *dd;
291
292 rcu_read_lock();
293 ibd = (struct hfi1_ibdev *)s->private;
294 dd = dd_from_dev(ibd);
295 if (!dd->per_sdma || *pos >= dd->num_sdma)
296 return NULL;
297 return pos;
298}
299
300static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
301{
302 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
303 struct hfi1_devdata *dd = dd_from_dev(ibd);
304
305 ++*pos;
306 if (!dd->per_sdma || *pos >= dd->num_sdma)
307 return NULL;
308 return pos;
309}
310
311
312static void _sdes_seq_stop(struct seq_file *s, void *v)
313__releases(RCU)
314{
315 rcu_read_unlock();
316}
317
318static int _sdes_seq_show(struct seq_file *s, void *v)
319{
320 struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
321 struct hfi1_devdata *dd = dd_from_dev(ibd);
322 loff_t *spos = v;
323 loff_t i = *spos;
324
325 sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
326 return 0;
327}
328
329DEBUGFS_SEQ_FILE_OPS(sdes);
330DEBUGFS_SEQ_FILE_OPEN(sdes)
331DEBUGFS_FILE_OPS(sdes);
332
333/* read the per-device counters */
334static ssize_t dev_counters_read(struct file *file, char __user *buf,
335 size_t count, loff_t *ppos)
336{
337 u64 *counters;
338 size_t avail;
339 struct hfi1_devdata *dd;
340 ssize_t rval;
341
342 rcu_read_lock();
343 dd = private2dd(file);
344 avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters);
345 rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
346 rcu_read_unlock();
347 return rval;
348}
349
350/* read the per-device counter names */
351static ssize_t dev_names_read(struct file *file, char __user *buf,
352 size_t count, loff_t *ppos)
353{
354 char *names;
355 size_t avail;
356 struct hfi1_devdata *dd;
357 ssize_t rval;
358
359 rcu_read_lock();
360 dd = private2dd(file);
361 avail = hfi1_read_cntrs(dd, *ppos, &names, NULL);
362 rval = simple_read_from_buffer(buf, count, ppos, names, avail);
363 rcu_read_unlock();
364 return rval;
365}
366
367struct counter_info {
368 char *name;
369 const struct file_operations ops;
370};
371
372/*
373 * Could use file_inode(file)->i_ino to figure out which file,
374 * instead of separate routine for each, but for now, this works...
375 */
376
377/* read the per-port names (same for each port) */
378static ssize_t portnames_read(struct file *file, char __user *buf,
379 size_t count, loff_t *ppos)
380{
381 char *names;
382 size_t avail;
383 struct hfi1_devdata *dd;
384 ssize_t rval;
385
386 rcu_read_lock();
387 dd = private2dd(file);
388 /* port number n/a here since names are constant */
389 avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL);
390 rval = simple_read_from_buffer(buf, count, ppos, names, avail);
391 rcu_read_unlock();
392 return rval;
393}
394
395/* read the per-port counters */
396static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
397 size_t count, loff_t *ppos)
398{
399 u64 *counters;
400 size_t avail;
401 struct hfi1_devdata *dd;
402 struct hfi1_pportdata *ppd;
403 ssize_t rval;
404
405 rcu_read_lock();
406 ppd = private2ppd(file);
407 dd = ppd->dd;
408 avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters);
409 rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
410 rcu_read_unlock();
411 return rval;
412}
413
414/*
415 * read the per-port QSFP data for ppd
416 */
417static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
418 size_t count, loff_t *ppos)
419{
420 struct hfi1_pportdata *ppd;
421 char *tmp;
422 int ret;
423
424 rcu_read_lock();
425 ppd = private2ppd(file);
426 tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
427 if (!tmp) {
428 rcu_read_unlock();
429 return -ENOMEM;
430 }
431
432 ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
433 if (ret > 0)
434 ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
435 rcu_read_unlock();
436 kfree(tmp);
437 return ret;
438}
439
440/* Do an i2c write operation on the chain for the given HFI. */
441static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
442 size_t count, loff_t *ppos, u32 target)
443{
444 struct hfi1_pportdata *ppd;
445 char *buff;
446 int ret;
447 int i2c_addr;
448 int offset;
449 int total_written;
450
451 rcu_read_lock();
452 ppd = private2ppd(file);
453
454 buff = kmalloc(count, GFP_KERNEL);
455 if (!buff) {
456 ret = -ENOMEM;
457 goto _return;
458 }
459
460 ret = copy_from_user(buff, buf, count);
461 if (ret > 0) {
462 ret = -EFAULT;
463 goto _free;
464 }
465
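	/*
	 * The file offset encodes the target: bits 23:16 hold the i2c
	 * slave address and bits 15:0 the byte offset within that device.
	 * The read path below uses the same encoding.
	 */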
466 i2c_addr = (*ppos >> 16) & 0xff;
467 offset = *ppos & 0xffff;
468
469 total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
470 if (total_written < 0) {
471 ret = total_written;
472 goto _free;
473 }
474
475 *ppos += total_written;
476
477 ret = total_written;
478
479 _free:
480 kfree(buff);
481 _return:
482 rcu_read_unlock();
483 return ret;
484}
485
486/* Do an i2c write operation on chain for HFI 0. */
487static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
488 size_t count, loff_t *ppos)
489{
490 return __i2c_debugfs_write(file, buf, count, ppos, 0);
491}
492
493/* Do an i2c write operation on chain for HFI 1. */
494static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
495 size_t count, loff_t *ppos)
496{
497 return __i2c_debugfs_write(file, buf, count, ppos, 1);
498}
499
500/* Do an i2c read operation on the chain for the given HFI. */
501static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
502 size_t count, loff_t *ppos, u32 target)
503{
504 struct hfi1_pportdata *ppd;
505 char *buff;
506 int ret;
507 int i2c_addr;
508 int offset;
509 int total_read;
510
511 rcu_read_lock();
512 ppd = private2ppd(file);
513
514 buff = kmalloc(count, GFP_KERNEL);
515 if (!buff) {
516 ret = -ENOMEM;
517 goto _return;
518 }
519
520 i2c_addr = (*ppos >> 16) & 0xff;
521 offset = *ppos & 0xffff;
522
523 total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
524 if (total_read < 0) {
525 ret = total_read;
526 goto _free;
527 }
528
529 *ppos += total_read;
530
531 ret = copy_to_user(buf, buff, total_read);
532 if (ret > 0) {
533 ret = -EFAULT;
534 goto _free;
535 }
536
537 ret = total_read;
538
539 _free:
540 kfree(buff);
541 _return:
542 rcu_read_unlock();
543 return ret;
544}
545
546/* Do an i2c read operation on chain for HFI 0. */
547static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
548 size_t count, loff_t *ppos)
549{
550 return __i2c_debugfs_read(file, buf, count, ppos, 0);
551}
552
553/* Do an i2c read operation on chain for HFI 1. */
554static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
555 size_t count, loff_t *ppos)
556{
557 return __i2c_debugfs_read(file, buf, count, ppos, 1);
558}
559
560/* Do a QSFP write operation on the i2c chain for the given HFI. */
561static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
562 size_t count, loff_t *ppos, u32 target)
563{
564 struct hfi1_pportdata *ppd;
565 char *buff;
566 int ret;
567 int total_written;
568
569 rcu_read_lock();
570 if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
571 ret = -EINVAL;
572 goto _return;
573 }
574
575 ppd = private2ppd(file);
576
577 buff = kmalloc(count, GFP_KERNEL);
578 if (!buff) {
579 ret = -ENOMEM;
580 goto _return;
581 }
582
583 ret = copy_from_user(buff, buf, count);
584 if (ret > 0) {
585 ret = -EFAULT;
586 goto _free;
587 }
588
589 total_written = qsfp_write(ppd, target, *ppos, buff, count);
590 if (total_written < 0) {
591 ret = total_written;
592 goto _free;
593 }
594
595 *ppos += total_written;
596
597 ret = total_written;
598
599 _free:
600 kfree(buff);
601 _return:
602 rcu_read_unlock();
603 return ret;
604}
605
606/* Do a QSFP write operation on i2c chain for HFI 0. */
607static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
608 size_t count, loff_t *ppos)
609{
610 return __qsfp_debugfs_write(file, buf, count, ppos, 0);
611}
612
613/* Do a QSFP write operation on i2c chain for HFI 1. */
614static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
615 size_t count, loff_t *ppos)
616{
617 return __qsfp_debugfs_write(file, buf, count, ppos, 1);
618}
619
620/* Do a QSFP read operation on the i2c chain for the given HFI. */
621static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
622 size_t count, loff_t *ppos, u32 target)
623{
624 struct hfi1_pportdata *ppd;
625 char *buff;
626 int ret;
627 int total_read;
628
629 rcu_read_lock();
630 if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
631 ret = -EINVAL;
632 goto _return;
633 }
634
635 ppd = private2ppd(file);
636
637 buff = kmalloc(count, GFP_KERNEL);
638 if (!buff) {
639 ret = -ENOMEM;
640 goto _return;
641 }
642
643 total_read = qsfp_read(ppd, target, *ppos, buff, count);
644 if (total_read < 0) {
645 ret = total_read;
646 goto _free;
647 }
648
649 *ppos += total_read;
650
651 ret = copy_to_user(buf, buff, total_read);
652 if (ret > 0) {
653 ret = -EFAULT;
654 goto _free;
655 }
656
657 ret = total_read;
658
659 _free:
660 kfree(buff);
661 _return:
662 rcu_read_unlock();
663 return ret;
664}
665
666/* Do a QSFP read operation on i2c chain for HFI 0. */
667static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
668 size_t count, loff_t *ppos)
669{
670 return __qsfp_debugfs_read(file, buf, count, ppos, 0);
671}
672
673/* Do a QSFP read operation on i2c chain for HFI 1. */
674static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
675 size_t count, loff_t *ppos)
676{
677 return __qsfp_debugfs_read(file, buf, count, ppos, 1);
678}
679
680#define DEBUGFS_OPS(nm, readroutine, writeroutine) \
681{ \
682 .name = nm, \
683 .ops = { \
684 .read = readroutine, \
685 .write = writeroutine, \
686 .llseek = generic_file_llseek, \
687 }, \
688}
689
690static const struct counter_info cntr_ops[] = {
691 DEBUGFS_OPS("counter_names", dev_names_read, NULL),
692 DEBUGFS_OPS("counters", dev_counters_read, NULL),
693 DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
694};
695
696static const struct counter_info port_cntr_ops[] = {
697 DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
698 DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write),
699 DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write),
700 DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
701 DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write),
702 DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write),
703};
704
705void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
706{
707 char name[sizeof("port0counters") + 1];
708 char link[10];
709 struct hfi1_devdata *dd = dd_from_dev(ibd);
710 struct hfi1_pportdata *ppd;
711 int unit = dd->unit;
712 int i, j;
713
714 if (!hfi1_dbg_root)
715 return;
716 snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
717 snprintf(link, sizeof(link), "%d", unit);
718 ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
719 if (!ibd->hfi1_ibdev_dbg) {
720 pr_warn("create of %s failed\n", name);
721 return;
722 }
723 ibd->hfi1_ibdev_link =
724 debugfs_create_symlink(link, hfi1_dbg_root, name);
725 if (!ibd->hfi1_ibdev_link) {
726 pr_warn("create of %s symlink failed\n", name);
727 return;
728 }
729 DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
730 DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
731 DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
732 DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
733 /* dev counter files */
734 for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
735 DEBUGFS_FILE_CREATE(cntr_ops[i].name,
736 ibd->hfi1_ibdev_dbg,
737 dd,
738 &cntr_ops[i].ops, S_IRUGO);
739 /* per port files */
740 for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
741 for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
742 snprintf(name,
743 sizeof(name),
744 port_cntr_ops[i].name,
745 j + 1);
746 DEBUGFS_FILE_CREATE(name,
747 ibd->hfi1_ibdev_dbg,
748 ppd,
749 &port_cntr_ops[i].ops,
750 port_cntr_ops[i].ops.write == NULL ?
751 S_IRUGO : S_IRUGO|S_IWUSR);
752 }
753}
754
755void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
756{
757 if (!hfi1_dbg_root)
758 goto out;
759 debugfs_remove(ibd->hfi1_ibdev_link);
760 debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
761out:
762 ibd->hfi1_ibdev_dbg = NULL;
763 synchronize_rcu();
764}
765
766/*
767 * driver stats field names, one line per stat, single string. Used by
768 * programs like hfistats to print the stats in a way which works for
769 * different versions of drivers, without changing program source.
 770 * If hfi1_ib_stats changes, this needs to change as well. Names need to be
 771 * 12 chars or less (without newline) for proper display by the hfistats utility.
772 */
773static const char * const hfi1_statnames[] = {
774 /* must be element 0*/
775 "KernIntr",
776 "ErrorIntr",
777 "Tx_Errs",
778 "Rcv_Errs",
779 "H/W_Errs",
780 "NoPIOBufs",
781 "CtxtsOpen",
782 "RcvLen_Errs",
783 "EgrBufFull",
784 "EgrHdrFull"
785};
786
787static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
788__acquires(RCU)
789{
790 rcu_read_lock();
791 if (*pos >= ARRAY_SIZE(hfi1_statnames))
792 return NULL;
793 return pos;
794}
795
796static void *_driver_stats_names_seq_next(
797 struct seq_file *s,
798 void *v,
799 loff_t *pos)
800{
801 ++*pos;
802 if (*pos >= ARRAY_SIZE(hfi1_statnames))
803 return NULL;
804 return pos;
805}
806
807static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
808__releases(RCU)
809{
810 rcu_read_unlock();
811}
812
813static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
814{
815 loff_t *spos = v;
816
817 seq_printf(s, "%s\n", hfi1_statnames[*spos]);
818 return 0;
819}
820
821DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
822DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
823DEBUGFS_FILE_OPS(driver_stats_names);
824
825static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
826__acquires(RCU)
827{
828 rcu_read_lock();
829 if (*pos >= ARRAY_SIZE(hfi1_statnames))
830 return NULL;
831 return pos;
832}
833
834static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
835{
836 ++*pos;
837 if (*pos >= ARRAY_SIZE(hfi1_statnames))
838 return NULL;
839 return pos;
840}
841
842static void _driver_stats_seq_stop(struct seq_file *s, void *v)
843__releases(RCU)
844{
845 rcu_read_unlock();
846}
847
848static u64 hfi1_sps_ints(void)
849{
850 unsigned long flags;
851 struct hfi1_devdata *dd;
852 u64 sps_ints = 0;
853
854 spin_lock_irqsave(&hfi1_devs_lock, flags);
855 list_for_each_entry(dd, &hfi1_dev_list, list) {
856 sps_ints += get_all_cpu_total(dd->int_counter);
857 }
858 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
859 return sps_ints;
860}
861
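/*
 * Note: the "driver_stats" file is binary. Each show() emits one raw u64,
 * in the same order as the names listed by "driver_stats_names" above.
 */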
862static int _driver_stats_seq_show(struct seq_file *s, void *v)
863{
864 loff_t *spos = v;
865 char *buffer;
866 u64 *stats = (u64 *)&hfi1_stats;
867 size_t sz = seq_get_buf(s, &buffer);
868
869 if (sz < sizeof(u64))
870 return SEQ_SKIP;
871 /* special case for interrupts */
872 if (*spos == 0)
873 *(u64 *)buffer = hfi1_sps_ints();
874 else
875 *(u64 *)buffer = stats[*spos];
876 seq_commit(s, sizeof(u64));
877 return 0;
878}
879
880DEBUGFS_SEQ_FILE_OPS(driver_stats);
881DEBUGFS_SEQ_FILE_OPEN(driver_stats)
882DEBUGFS_FILE_OPS(driver_stats);
883
884void hfi1_dbg_init(void)
885{
886 hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
887 if (!hfi1_dbg_root)
888 pr_warn("init of debugfs failed\n");
889 DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
890 DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
891}
892
893void hfi1_dbg_exit(void)
894{
895 debugfs_remove_recursive(hfi1_dbg_root);
896 hfi1_dbg_root = NULL;
897}
898
899#endif
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
new file mode 100644
index 000000000000..92d6fe146714
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.h
@@ -0,0 +1,78 @@
1#ifndef _HFI1_DEBUGFS_H
2#define _HFI1_DEBUGFS_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53struct hfi1_ibdev;
54#ifdef CONFIG_DEBUG_FS
55void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
56void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
57void hfi1_dbg_init(void);
58void hfi1_dbg_exit(void);
59#else
60static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
61{
62}
63
64static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
65{
66}
67
68static inline void hfi1_dbg_init(void)
69{
70}
71
72static inline void hfi1_dbg_exit(void)
73{
74}
75
76#endif
77
78#endif /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
new file mode 100644
index 000000000000..07c87a87775f
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.c
@@ -0,0 +1,142 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/cdev.h>
52#include <linux/module.h>
53#include <linux/device.h>
54#include <linux/fs.h>
55
56#include "hfi.h"
57#include "device.h"
58
59static struct class *class;
60static dev_t hfi1_dev;
61
62int hfi1_cdev_init(int minor, const char *name,
63 const struct file_operations *fops,
64 struct cdev *cdev, struct device **devp)
65{
66 const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
67 struct device *device = NULL;
68 int ret;
69
70 cdev_init(cdev, fops);
71 cdev->owner = THIS_MODULE;
72 kobject_set_name(&cdev->kobj, name);
73
74 ret = cdev_add(cdev, dev, 1);
75 if (ret < 0) {
76 pr_err("Could not add cdev for minor %d, %s (err %d)\n",
77 minor, name, -ret);
78 goto done;
79 }
80
81 device = device_create(class, NULL, dev, NULL, "%s", name);
82 if (!IS_ERR(device))
83 goto done;
84 ret = PTR_ERR(device);
85 device = NULL;
86 pr_err("Could not create device for minor %d, %s (err %d)\n",
87 minor, name, -ret);
88 cdev_del(cdev);
89done:
90 *devp = device;
91 return ret;
92}
93
94void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
95{
96 struct device *device = *devp;
97
98 if (device) {
99 device_unregister(device);
100 *devp = NULL;
101
102 cdev_del(cdev);
103 }
104}
105
106static const char *hfi1_class_name = "hfi1";
107
108const char *class_name(void)
109{
110 return hfi1_class_name;
111}
112
113int __init dev_init(void)
114{
115 int ret;
116
117 ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
118 if (ret < 0) {
119 pr_err("Could not allocate chrdev region (err %d)\n", -ret);
120 goto done;
121 }
122
123 class = class_create(THIS_MODULE, class_name());
124 if (IS_ERR(class)) {
125 ret = PTR_ERR(class);
126 pr_err("Could not create device class (err %d)\n", -ret);
127 unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
128 }
129
130done:
131 return ret;
132}
133
134void dev_cleanup(void)
135{
136 if (class) {
137 class_destroy(class);
138 class = NULL;
139 }
140
141 unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
142}
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
new file mode 100644
index 000000000000..98caecd3d807
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.h
@@ -0,0 +1,61 @@
1#ifndef _HFI1_DEVICE_H
2#define _HFI1_DEVICE_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53int hfi1_cdev_init(int minor, const char *name,
54 const struct file_operations *fops,
55 struct cdev *cdev, struct device **devp);
56void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
57const char *class_name(void);
58int __init dev_init(void);
59void dev_cleanup(void);
60
61#endif /* _HFI1_DEVICE_H */
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
new file mode 100644
index 000000000000..6777d6b659cf
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/diag.c
@@ -0,0 +1,1873 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51/*
52 * This file contains support for diagnostic functions. It is accessed by
53 * opening the hfi1_diag device, normally minor number 129. Diagnostic use
54 * of the chip may render the chip or board unusable until the driver
55 * is unloaded, or in some cases, until the system is rebooted.
56 *
57 * Accesses to the chip through this interface are not similar to going
58 * through the /sys/bus/pci resource mmap interface.
59 */
60
61#include <linux/io.h>
62#include <linux/pci.h>
63#include <linux/poll.h>
64#include <linux/vmalloc.h>
65#include <linux/export.h>
66#include <linux/fs.h>
67#include <linux/uaccess.h>
68#include <linux/module.h>
69#include <rdma/ib_smi.h>
70#include "hfi.h"
71#include "device.h"
72#include "common.h"
73#include "trace.h"
74
75#undef pr_fmt
76#define pr_fmt(fmt) DRIVER_NAME ": " fmt
77#define snoop_dbg(fmt, ...) \
78 hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
79
80/* Snoop option mask */
81#define SNOOP_DROP_SEND (1 << 0)
82#define SNOOP_USE_METADATA (1 << 1)
83
84static u8 snoop_flags;
85
86/*
87 * Extract packet length from LRH header.
 88 * Why & 0x7FF? The length field is only 11 bits; mask off any bogus high
 89 * bits in case they were not zeroed. The value is in dwords, so multiply by 4 to get the size in bytes.
90 */
91#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2)
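/* Example: lrh[2] of 0x0012 -> length 18 dwords -> 18 << 2 = 72 bytes. */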
92
93enum hfi1_filter_status {
94 HFI1_FILTER_HIT,
95 HFI1_FILTER_ERR,
96 HFI1_FILTER_MISS
97};
98
99/* snoop processing functions */
100rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
101 [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
102 [RHF_RCV_TYPE_EAGER] = snoop_recv_handler,
103 [RHF_RCV_TYPE_IB] = snoop_recv_handler,
104 [RHF_RCV_TYPE_ERROR] = snoop_recv_handler,
105 [RHF_RCV_TYPE_BYPASS] = snoop_recv_handler,
106 [RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
107 [RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
108 [RHF_RCV_TYPE_INVALID7] = process_receive_invalid
109};
110
111/* Snoop packet structure */
112struct snoop_packet {
113 struct list_head list;
114 u32 total_len;
115 u8 data[];
116};
117
118/* Do not make these an enum or it will blow up the capture_md */
119#define PKT_DIR_EGRESS 0x0
120#define PKT_DIR_INGRESS 0x1
121
122/* Packet capture metadata returned to the user with the packet. */
123struct capture_md {
124 u8 port;
125 u8 dir;
126 u8 reserved[6];
127 union {
128 u64 pbc;
129 u64 rhf;
130 } u;
131};
132
133static atomic_t diagpkt_count = ATOMIC_INIT(0);
134static struct cdev diagpkt_cdev;
135static struct device *diagpkt_device;
136
137static ssize_t diagpkt_write(struct file *fp, const char __user *data,
138 size_t count, loff_t *off);
139
140static const struct file_operations diagpkt_file_ops = {
141 .owner = THIS_MODULE,
142 .write = diagpkt_write,
143 .llseek = noop_llseek,
144};
145
146/*
147 * This is used for communication with user space for snoop extended IOCTLs
148 */
149struct hfi1_link_info {
150 __be64 node_guid;
151 u8 port_mode;
152 u8 port_state;
153 u16 link_speed_active;
154 u16 link_width_active;
155 u16 vl15_init;
156 u8 port_number;
157 /*
158 * Add padding to make this a full IB SMP payload. Note: changing the
159 * size of this structure will make the IOCTLs created with _IOWR
160 * change.
161 * Be sure to run tests on all IOCTLs when making changes to this
162 * structure.
163 */
164 u8 res[47];
165};
166
167/*
168 * This starts our ioctl sequence numbers *way* off from the ones
169 * defined in ib_core.
170 */
171#define SNOOP_CAPTURE_VERSION 0x1
172
173#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl-number.txt */
174#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
175#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
176
177#define HFI1_SNOOP_IOCGETLINKSTATE \
178 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
179#define HFI1_SNOOP_IOCSETLINKSTATE \
180 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+1)
181#define HFI1_SNOOP_IOCCLEARQUEUE \
182 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+2)
183#define HFI1_SNOOP_IOCCLEARFILTER \
184 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+3)
185#define HFI1_SNOOP_IOCSETFILTER \
186 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+4)
187#define HFI1_SNOOP_IOCGETVERSION \
188 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+5)
189#define HFI1_SNOOP_IOCSET_OPTS \
190 _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6)
191
192/*
193 * These offsets +6/+7 could change, but these are already known and used
194 * IOCTL numbers so don't change them without a good reason.
195 */
196#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
197 _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6, \
198 struct hfi1_link_info)
199#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
200 _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+7, \
201 struct hfi1_link_info)
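/*
 * Note: the commands defined with _IO() above encode no argument size or
 * direction, while the two _EXTRA commands use _IOWR() and embed
 * sizeof(struct hfi1_link_info) in the ioctl number; that is why changing
 * the structure changes those numbers.
 */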
202
203static int hfi1_snoop_open(struct inode *in, struct file *fp);
204static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
205 size_t pkt_len, loff_t *off);
206static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
207 size_t count, loff_t *off);
208static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
209static unsigned int hfi1_snoop_poll(struct file *fp,
210 struct poll_table_struct *wait);
211static int hfi1_snoop_release(struct inode *in, struct file *fp);
212
213struct hfi1_packet_filter_command {
214 int opcode;
215 int length;
216 void *value_ptr;
217};
218
219/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
220#define HFI1_SNOOP_INGRESS 0x1
221#define HFI1_SNOOP_EGRESS 0x2
222
223enum hfi1_packet_filter_opcodes {
224 FILTER_BY_LID,
225 FILTER_BY_DLID,
226 FILTER_BY_MAD_MGMT_CLASS,
227 FILTER_BY_QP_NUMBER,
228 FILTER_BY_PKT_TYPE,
229 FILTER_BY_SERVICE_LEVEL,
230 FILTER_BY_PKEY,
231 FILTER_BY_DIRECTION,
232};
233
234static const struct file_operations snoop_file_ops = {
235 .owner = THIS_MODULE,
236 .open = hfi1_snoop_open,
237 .read = hfi1_snoop_read,
238 .unlocked_ioctl = hfi1_ioctl,
239 .poll = hfi1_snoop_poll,
240 .write = hfi1_snoop_write,
241 .release = hfi1_snoop_release
242};
243
244struct hfi1_filter_array {
245 int (*filter)(void *, void *, void *);
246};
247
248static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
249static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
250static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
251 void *value);
252static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
253static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
254 void *value);
255static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
256 void *value);
257static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
258static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
259
260static struct hfi1_filter_array hfi1_filters[] = {
261 { hfi1_filter_lid },
262 { hfi1_filter_dlid },
263 { hfi1_filter_mad_mgmt_class },
264 { hfi1_filter_qp_number },
265 { hfi1_filter_ibpacket_type },
266 { hfi1_filter_ib_service_level },
267 { hfi1_filter_ib_pkey },
268 { hfi1_filter_direction },
269};
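/*
 * The order of entries in hfi1_filters[] must match
 * enum hfi1_packet_filter_opcodes: the opcode passed in with
 * HFI1_SNOOP_IOCSETFILTER is used directly as an index into this array.
 */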
270
271#define HFI1_MAX_FILTERS ARRAY_SIZE(hfi1_filters)
272#define HFI1_DIAG_MINOR_BASE 129
273
274static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
275
276int hfi1_diag_add(struct hfi1_devdata *dd)
277{
278 char name[16];
279 int ret = 0;
280
281 snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
282 dd->unit);
283 /*
284 * Do this for each device as opposed to the normal diagpkt
285 * interface which is one per host
286 */
287 ret = hfi1_snoop_add(dd, name);
288 if (ret)
289 dd_dev_err(dd, "Unable to init snoop/capture device");
290
291 snprintf(name, sizeof(name), "%s_diagpkt", class_name());
292 if (atomic_inc_return(&diagpkt_count) == 1) {
293 ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
294 &diagpkt_file_ops, &diagpkt_cdev,
295 &diagpkt_device);
296 }
297
298 return ret;
299}
300
301/* this must be called w/ dd->snoop_in_lock held */
302static void drain_snoop_list(struct list_head *queue)
303{
304 struct list_head *pos, *q;
305 struct snoop_packet *packet;
306
307 list_for_each_safe(pos, q, queue) {
308 packet = list_entry(pos, struct snoop_packet, list);
309 list_del(pos);
310 kfree(packet);
311 }
312}
313
314static void hfi1_snoop_remove(struct hfi1_devdata *dd)
315{
316 unsigned long flags = 0;
317
318 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
319 drain_snoop_list(&dd->hfi1_snoop.queue);
320 hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
321 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
322}
323
324void hfi1_diag_remove(struct hfi1_devdata *dd)
325{
326
327 hfi1_snoop_remove(dd);
328 if (atomic_dec_and_test(&diagpkt_count))
329 hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
330 hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
331}
332
333
334/*
335 * Allocated structure shared between the credit return mechanism and
336 * diagpkt_send().
337 */
338struct diagpkt_wait {
339 struct completion credits_returned;
340 int code;
341 atomic_t count;
342};
343
344/*
345 * When each side is finished with the structure, they call this.
346 * The last user frees the structure.
347 */
348static void put_diagpkt_wait(struct diagpkt_wait *wait)
349{
350 if (atomic_dec_and_test(&wait->count))
351 kfree(wait);
352}
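/*
 * The count is initialized to 2 in diagpkt_send(): one reference for the
 * sender and one for the credit return callback, so whichever side
 * finishes last frees the structure.
 */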
353
354/*
355 * Callback from the credit return code. Signal the completion, which
356 * will let diagpkt_send() continue.
357 */
358static void diagpkt_complete(void *arg, int code)
359{
360 struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
361
362 wait->code = code;
363 complete(&wait->credits_returned);
364 put_diagpkt_wait(wait); /* finished with the structure */
365}
366
367/**
368 * diagpkt_send - send a packet
369 * @dp: diag packet descriptor
370 */
371static ssize_t diagpkt_send(struct diag_pkt *dp)
372{
373 struct hfi1_devdata *dd;
374 struct send_context *sc;
375 struct pio_buf *pbuf;
376 u32 *tmpbuf = NULL;
377 ssize_t ret = 0;
378 u32 pkt_len, total_len;
379 pio_release_cb credit_cb = NULL;
380 void *credit_arg = NULL;
381 struct diagpkt_wait *wait = NULL;
382
383 dd = hfi1_lookup(dp->unit);
384 if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
385 ret = -ENODEV;
386 goto bail;
387 }
388 if (!(dd->flags & HFI1_INITTED)) {
389 /* no hardware, freeze, etc. */
390 ret = -ENODEV;
391 goto bail;
392 }
393
394 if (dp->version != _DIAG_PKT_VERS) {
395 dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
396 dp->version);
397 ret = -EINVAL;
398 goto bail;
399 }
400
401 /* send count must be an exact number of dwords */
402 if (dp->len & 3) {
403 ret = -EINVAL;
404 goto bail;
405 }
406
407 /* there is only port 1 */
408 if (dp->port != 1) {
409 ret = -EINVAL;
410 goto bail;
411 }
412
413 /* need a valid context */
414 if (dp->sw_index >= dd->num_send_contexts) {
415 ret = -EINVAL;
416 goto bail;
417 }
418 /* can only use kernel contexts */
419 if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
420 ret = -EINVAL;
421 goto bail;
422 }
423 /* must be allocated */
424 sc = dd->send_contexts[dp->sw_index].sc;
425 if (!sc) {
426 ret = -EINVAL;
427 goto bail;
428 }
429 /* must be enabled */
430 if (!(sc->flags & SCF_ENABLED)) {
431 ret = -EINVAL;
432 goto bail;
433 }
434
435 /* allocate a buffer and copy the data in */
436 tmpbuf = vmalloc(dp->len);
437 if (!tmpbuf) {
438 ret = -ENOMEM;
439 goto bail;
440 }
441
442 if (copy_from_user(tmpbuf,
443 (const void __user *) (unsigned long) dp->data,
444 dp->len)) {
445 ret = -EFAULT;
446 goto bail;
447 }
448
449 /*
450 * pkt_len is how much data we have to write, including header and data.
451 * total_len is the length of the packet in dwords plus the PBC; it should
452 * not include the CRC.
453 */
454 pkt_len = dp->len >> 2;
455 total_len = pkt_len + 2; /* PBC + packet */
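	/*
	 * Example: a 64 byte payload gives pkt_len = 16 dwords and
	 * total_len = 18 dwords (the PBC itself is 2 dwords, i.e. 8 bytes).
	 */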
456
457 /* if 0, fill in a default */
458 if (dp->pbc == 0) {
459 struct hfi1_pportdata *ppd = dd->pport;
460
461 hfi1_cdbg(PKT, "Generating PBC");
462 dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
463 } else {
464 hfi1_cdbg(PKT, "Using passed in PBC");
465 }
466
467 hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
468
469 /*
470 * The caller wants to wait until the packet is sent and to
471 * check for errors. The best we can do is wait until
472 * the buffer credits are returned and check if any packet
473 * error has occurred. If there are any late errors, this
474 * could miss it. If there are other senders who generate
475 * an error, this may find it. However, in general, it
476 * should catch most.
477 */
478 if (dp->flags & F_DIAGPKT_WAIT) {
479 /* always force a credit return */
480 dp->pbc |= PBC_CREDIT_RETURN;
481 /* turn on credit return interrupts */
482 sc_add_credit_return_intr(sc);
483 wait = kmalloc(sizeof(*wait), GFP_KERNEL);
484 if (!wait) {
485 ret = -ENOMEM;
486 goto bail;
487 }
488 init_completion(&wait->credits_returned);
489 atomic_set(&wait->count, 2);
490 wait->code = PRC_OK;
491
492 credit_cb = diagpkt_complete;
493 credit_arg = wait;
494 }
495
496 pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
497 if (!pbuf) {
498 /*
499 * No send buffer means no credit callback. Undo
500 * the wait set-up that was done above. We free wait
501 * because the callback will never be called.
502 */
503 if (dp->flags & F_DIAGPKT_WAIT) {
504 sc_del_credit_return_intr(sc);
505 kfree(wait);
506 wait = NULL;
507 }
508 ret = -ENOSPC;
509 goto bail;
510 }
511
512 pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
513 /* no flush needed as the HW knows the packet size */
514
515 ret = sizeof(*dp);
516
517 if (dp->flags & F_DIAGPKT_WAIT) {
518 /* wait for credit return */
519 ret = wait_for_completion_interruptible(
520 &wait->credits_returned);
521 /*
522 * If the wait returns an error, the wait was interrupted,
523 * e.g. with a ^C in the user program. The callback is
524 * still pending. This is OK as the wait structure is
525 * kmalloc'ed and the structure will free itself when
526 * all users are done with it.
527 *
528 * A context disable occurs on a send context restart, so
529 * include that in the list of errors below to check for.
530 * NOTE: PRC_FILL_ERR is at best informational and cannot
531 * be depended on.
532 */
533 if (!ret && (((wait->code & PRC_STATUS_ERR)
534 || (wait->code & PRC_FILL_ERR)
535 || (wait->code & PRC_SC_DISABLE))))
536 ret = -EIO;
537
538 put_diagpkt_wait(wait); /* finished with the structure */
539 sc_del_credit_return_intr(sc);
540 }
541
542bail:
543 vfree(tmpbuf);
544 return ret;
545}
546
547static ssize_t diagpkt_write(struct file *fp, const char __user *data,
548 size_t count, loff_t *off)
549{
550 struct hfi1_devdata *dd;
551 struct send_context *sc;
552 u8 vl;
553
554 struct diag_pkt dp;
555
556 if (count != sizeof(dp))
557 return -EINVAL;
558
559 if (copy_from_user(&dp, data, sizeof(dp)))
560 return -EFAULT;
561
562 /*
563 * The Send Context is derived from the PbcVL value
564 * if PBC is populated
565 */
566 if (dp.pbc) {
567 dd = hfi1_lookup(dp.unit);
568 if (dd == NULL)
569 return -ENODEV;
570 vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
571 sc = dd->vld[vl].sc;
572 if (sc) {
573 dp.sw_index = sc->sw_index;
574 hfi1_cdbg(
575 PKT,
576 "Packet sent over VL %d via Send Context %u(%u)",
577 vl, sc->sw_index, sc->hw_context);
578 }
579 }
580
581 return diagpkt_send(&dp);
582}
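/*
 * User space drives this interface by writing exactly one struct diag_pkt
 * per write() call; dp.data points to the payload buffer and dp.len gives
 * its size in bytes, which diagpkt_send() requires to be a multiple of 4.
 */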
583
584static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
585{
586 int ret = 0;
587
588 dd->hfi1_snoop.mode_flag = 0;
589 spin_lock_init(&dd->hfi1_snoop.snoop_lock);
590 INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
591 init_waitqueue_head(&dd->hfi1_snoop.waitq);
592
593 ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
594 &snoop_file_ops,
595 &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
596
597 if (ret) {
598 dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
599 hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
600 &dd->hfi1_snoop.class_dev);
601 }
602
603 return ret;
604}
605
606static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
607{
608 int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
609 struct hfi1_devdata *dd = NULL;
610
611 dd = hfi1_lookup(unit);
612 return dd;
613
614}
615
616/* clear or restore send context integrity checks */
617static void adjust_integrity_checks(struct hfi1_devdata *dd)
618{
619 struct send_context *sc;
620 unsigned long sc_flags;
621 int i;
622
623 spin_lock_irqsave(&dd->sc_lock, sc_flags);
624 for (i = 0; i < dd->num_send_contexts; i++) {
625 int enable;
626
627 sc = dd->send_contexts[i].sc;
628
629 if (!sc)
630 continue; /* not allocated */
631
632 enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
633 dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
634
635 set_pio_integrity(sc);
636
637 if (enable) /* take HFI_CAP_* flags into account */
638 hfi1_init_ctxt(sc);
639 }
640 spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
641}
642
643static int hfi1_snoop_open(struct inode *in, struct file *fp)
644{
645 int ret;
646 int mode_flag = 0;
647 unsigned long flags = 0;
648 struct hfi1_devdata *dd;
649 struct list_head *queue;
650
651 mutex_lock(&hfi1_mutex);
652
653 dd = hfi1_dd_from_sc_inode(in);
654 if (dd == NULL) {
655 ret = -ENODEV;
656 goto bail;
657 }
658
659 /*
660 * File mode determines snoop or capture. Some existing user
661 * applications expect to be able to open the capture device RDWR
662 * because they expect a dedicated capture device. For this reason we
663 * support a module param to force capture mode even if the file open
664 * mode matches snoop.
665 */
666 if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
667 snoop_dbg("Capture Enabled");
668 mode_flag = HFI1_PORT_CAPTURE_MODE;
669 } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
670 snoop_dbg("Snoop Enabled");
671 mode_flag = HFI1_PORT_SNOOP_MODE;
672 } else {
673 snoop_dbg("Invalid");
674 ret = -EINVAL;
675 goto bail;
676 }
677 queue = &dd->hfi1_snoop.queue;
678
679 /*
680 * We are not supporting snoop and capture at the same time.
681 */
682 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
683 if (dd->hfi1_snoop.mode_flag) {
684 ret = -EBUSY;
685 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
686 goto bail;
687 }
688
689 dd->hfi1_snoop.mode_flag = mode_flag;
690 drain_snoop_list(queue);
691
692 dd->hfi1_snoop.filter_callback = NULL;
693 dd->hfi1_snoop.filter_value = NULL;
694
695 /*
696 * Send side packet integrity checks are not helpful when snooping so
697 * disable and re-enable when we stop snooping.
698 */
699 if (mode_flag == HFI1_PORT_SNOOP_MODE) {
700 /* clear after snoop mode is on */
701 adjust_integrity_checks(dd); /* clear */
702
703 /*
704 * We also do not want to be doing the DLID LMC check for
705 * ingressed packets.
706 */
707 dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
708 write_csr(dd, DCC_CFG_PORT_CONFIG1,
709 (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
710 }
711
712 /*
713 * As soon as we set these function pointers the recv and send handlers
714 * are active. This is a race condition so we must make sure to drain
715 * the queue and init filter values above. Technically we should add
716 * locking here, but the worst that can happen is that on receive a packet
717 * gets allocated and then blocks on the snoop_lock before being added to
718 * the queue. The same goes for send.
719 */
720 dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
721 dd->process_pio_send = snoop_send_pio_handler;
722 dd->process_dma_send = snoop_send_pio_handler;
723 dd->pio_inline_send = snoop_inline_pio_send;
724
725 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
726 ret = 0;
727
728bail:
729 mutex_unlock(&hfi1_mutex);
730
731 return ret;
732}
733
734static int hfi1_snoop_release(struct inode *in, struct file *fp)
735{
736 unsigned long flags = 0;
737 struct hfi1_devdata *dd;
738 int mode_flag;
739
740 dd = hfi1_dd_from_sc_inode(in);
741 if (dd == NULL)
742 return -ENODEV;
743
744 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
745
746 /* clear the snoop mode before re-adjusting send context CSRs */
747 mode_flag = dd->hfi1_snoop.mode_flag;
748 dd->hfi1_snoop.mode_flag = 0;
749
750 /*
751 * Drain the queue and clear the filters; we are done with them. Don't
752 * forget to restore the packet integrity checks.
753 */
754 drain_snoop_list(&dd->hfi1_snoop.queue);
755 if (mode_flag == HFI1_PORT_SNOOP_MODE) {
756 /* restore after snoop mode is clear */
757 adjust_integrity_checks(dd); /* restore */
758
759 /*
760 * Also should probably reset the DCC_CONFIG1 register for DLID
761 * checking on incoming packets again. Use the value saved when
762 * opening the snoop device.
763 */
764 write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
765 }
766
767 dd->hfi1_snoop.filter_callback = NULL;
768 kfree(dd->hfi1_snoop.filter_value);
769 dd->hfi1_snoop.filter_value = NULL;
770
771 /*
772 * User is done snooping and capturing, return control to the normal
773 * handler. Re-enable SDMA handling.
774 */
775 dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
776 dd->process_pio_send = hfi1_verbs_send_pio;
777 dd->process_dma_send = hfi1_verbs_send_dma;
778 dd->pio_inline_send = pio_copy;
779
780 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
781
782 snoop_dbg("snoop/capture device released");
783
784 return 0;
785}
786
787static unsigned int hfi1_snoop_poll(struct file *fp,
788 struct poll_table_struct *wait)
789{
790 int ret = 0;
791 unsigned long flags = 0;
792
793 struct hfi1_devdata *dd;
794
795 dd = hfi1_dd_from_sc_inode(fp->f_inode);
796 if (dd == NULL)
797 return -ENODEV;
798
799 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
800
801 poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
802 if (!list_empty(&dd->hfi1_snoop.queue))
803 ret |= POLLIN | POLLRDNORM;
804
805 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
806 return ret;
807
808}
809
810static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
811 size_t count, loff_t *off)
812{
813 struct diag_pkt dpkt;
814 struct hfi1_devdata *dd;
815 size_t ret;
816 u8 byte_two, sl, sc5, sc4, vl, byte_one;
817 struct send_context *sc;
818 u32 len;
819 u64 pbc;
820 struct hfi1_ibport *ibp;
821 struct hfi1_pportdata *ppd;
822
823 dd = hfi1_dd_from_sc_inode(fp->f_inode);
824 if (dd == NULL)
825 return -ENODEV;
826
827 ppd = dd->pport;
828 snoop_dbg("received %lu bytes from user", count);
829
830 memset(&dpkt, 0, sizeof(struct diag_pkt));
831 dpkt.version = _DIAG_PKT_VERS;
832 dpkt.unit = dd->unit;
833 dpkt.port = 1;
834
835 if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
836 /*
837 * We need to generate the PBC rather than letting diagpkt_send do it;
838 * to do this we need the VL and the length in dwords.
839 * The VL can be determined by using the SL to look up the SC and
840 * then converting the SC into a VL. The exception to this is
841 * packets which come from an SMI queue pair.
842 * Since we can't detect anything about the QP here we have to
843 * rely on the SC. If it is 0xF then we assume it is SMI and
844 * do not look at the SL.
845 */
846 if (copy_from_user(&byte_one, data, 1))
847 return -EINVAL;
848
849 if (copy_from_user(&byte_two, data+1, 1))
850 return -EINVAL;
851
852 sc4 = (byte_one >> 4) & 0xf;
853 if (sc4 == 0xF) {
854 snoop_dbg("Detected VL15 packet ignoring SL in packet");
855 vl = sc4;
856 } else {
857 sl = (byte_two >> 4) & 0xf;
858 ibp = to_iport(&dd->verbs_dev.ibdev, 1);
859 sc5 = ibp->sl_to_sc[sl];
860 vl = sc_to_vlt(dd, sc5);
861 if (vl != sc4) {
862 snoop_dbg("VL %d does not match SC %d of packet",
863 vl, sc4);
864 return -EINVAL;
865 }
866 }
867
868 sc = dd->vld[vl].sc; /* Look up the context based on VL */
869 if (sc) {
870 dpkt.sw_index = sc->sw_index;
871 snoop_dbg("Sending on context %u(%u)", sc->sw_index,
872 sc->hw_context);
873 } else {
874 snoop_dbg("Could not find context for vl %d", vl);
875 return -EINVAL;
876 }
877
878 len = (count >> 2) + 2; /* Add in PBC */
879 pbc = create_pbc(ppd, 0, 0, vl, len);
880 } else {
881 if (copy_from_user(&pbc, data, sizeof(pbc)))
882 return -EINVAL;
883 vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
884 sc = dd->vld[vl].sc; /* Look up the context based on VL */
885 if (sc) {
886 dpkt.sw_index = sc->sw_index;
887 } else {
888 snoop_dbg("Could not find context for vl %d", vl);
889 return -EINVAL;
890 }
891 data += sizeof(pbc);
892 count -= sizeof(pbc);
893 }
894 dpkt.len = count;
895 dpkt.data = (unsigned long)data;
896
897 snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
898 (pbc >> 12) & 0xf,
899 (pbc & 0xfff));
900
901 dpkt.pbc = pbc;
902 ret = diagpkt_send(&dpkt);
903 /*
904 * diagpkt_send only returns number of bytes in the diagpkt so patch
905 * that up here before returning.
906 */
907 if (ret == sizeof(dpkt))
908 return count;
909
910 return ret;
911}
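/*
 * Write path summary: by default the first two header bytes of the user
 * buffer are examined to derive and validate the VL (an upper nibble of
 * 0xF is treated as VL15/SMI), and a PBC is generated here. When the
 * SNOOP_USE_METADATA option is set, the buffer instead begins with an
 * 8 byte caller-supplied PBC, followed by the packet itself.
 */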
912
913static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
914 size_t pkt_len, loff_t *off)
915{
916 ssize_t ret = 0;
917 unsigned long flags = 0;
918 struct snoop_packet *packet = NULL;
919 struct hfi1_devdata *dd;
920
921 dd = hfi1_dd_from_sc_inode(fp->f_inode);
922 if (dd == NULL)
923 return -ENODEV;
924
925 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
926
927 while (list_empty(&dd->hfi1_snoop.queue)) {
928 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
929
930 if (fp->f_flags & O_NONBLOCK)
931 return -EAGAIN;
932
933 if (wait_event_interruptible(
934 dd->hfi1_snoop.waitq,
935 !list_empty(&dd->hfi1_snoop.queue)))
936 return -EINTR;
937
938 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
939 }
940
941 if (!list_empty(&dd->hfi1_snoop.queue)) {
942 packet = list_entry(dd->hfi1_snoop.queue.next,
943 struct snoop_packet, list);
944 list_del(&packet->list);
945 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
946 if (pkt_len >= packet->total_len) {
947 if (copy_to_user(data, packet->data,
948 packet->total_len))
949 ret = -EFAULT;
950 else
951 ret = packet->total_len;
952 } else
953 ret = -EINVAL;
954
955 kfree(packet);
956 } else
957 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
958
959 return ret;
960}
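/*
 * Each read() returns exactly one queued packet, copied out as a single
 * contiguous blob (metadata, if any, then headers, then payload). The
 * user buffer must be at least packet->total_len bytes or the read fails
 * with -EINVAL; either way the packet is consumed from the queue.
 */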
961
962static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
963{
964 struct hfi1_devdata *dd;
965 void *filter_value = NULL;
966 long ret = 0;
967 int value = 0;
968 u8 physState = 0;
969 u8 linkState = 0;
970 u16 devState = 0;
971 unsigned long flags = 0;
972 unsigned long *argp = NULL;
973 struct hfi1_packet_filter_command filter_cmd = {0};
974 int mode_flag = 0;
975 struct hfi1_pportdata *ppd = NULL;
976 unsigned int index;
977 struct hfi1_link_info link_info;
978
979 dd = hfi1_dd_from_sc_inode(fp->f_inode);
980 if (dd == NULL)
981 return -ENODEV;
982
983 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
984
985 mode_flag = dd->hfi1_snoop.mode_flag;
986
987 if (((_IOC_DIR(cmd) & _IOC_READ)
988 && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)))
989 || ((_IOC_DIR(cmd) & _IOC_WRITE)
990 && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) {
991 ret = -EFAULT;
992 } else if (!capable(CAP_SYS_ADMIN)) {
993 ret = -EPERM;
994 } else if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
995 (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
996 (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
997 (cmd != HFI1_SNOOP_IOCSETFILTER)) {
998 /* Capture devices are allowed only 3 operations:
999 * 1. Clear capture queue
1000 * 2. Clear capture filter
1001 * 3. Set capture filter
1002 * Others are invalid.
1003 */
1004 ret = -EINVAL;
1005 } else {
1006 switch (cmd) {
1007 case HFI1_SNOOP_IOCSETLINKSTATE:
1008 snoop_dbg("HFI1_SNOOP_IOCSETLINKSTATE is not valid");
1009 ret = -EINVAL;
1010 break;
1011
1012 case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
1013 memset(&link_info, 0, sizeof(link_info));
1014
1015 ret = copy_from_user(&link_info,
1016 (struct hfi1_link_info __user *)arg,
1017 sizeof(link_info));
1018 if (ret)
1019 break;
1020
1021 value = link_info.port_state;
1022 index = link_info.port_number;
1023 if (index > dd->num_pports - 1) {
1024 ret = -EINVAL;
1025 break;
1026 }
1027
1028 ppd = &dd->pport[index];
1029 if (!ppd) {
1030 ret = -EINVAL;
1031 break;
1032 }
1033
1034 /* What we want to transition to */
1035 physState = (value >> 4) & 0xF;
1036 linkState = value & 0xF;
1037 snoop_dbg("Setting link state 0x%x", value);
1038
1039 switch (linkState) {
1040 case IB_PORT_NOP:
1041 if (physState == 0)
1042 break;
1043 /* fall through */
1044 case IB_PORT_DOWN:
1045 switch (physState) {
1046 case 0:
1047 devState = HLS_DN_DOWNDEF;
1048 break;
1049 case 2:
1050 devState = HLS_DN_POLL;
1051 break;
1052 case 3:
1053 devState = HLS_DN_DISABLE;
1054 break;
1055 default:
1056 ret = -EINVAL;
1057 goto done;
1058 }
1059 ret = set_link_state(ppd, devState);
1060 break;
1061 case IB_PORT_ARMED:
1062 ret = set_link_state(ppd, HLS_UP_ARMED);
1063 if (!ret)
1064 send_idle_sma(dd, SMA_IDLE_ARM);
1065 break;
1066 case IB_PORT_ACTIVE:
1067 ret = set_link_state(ppd, HLS_UP_ACTIVE);
1068 if (!ret)
1069 send_idle_sma(dd, SMA_IDLE_ACTIVE);
1070 break;
1071 default:
1072 ret = -EINVAL;
1073 break;
1074 }
1075
1076 if (ret)
1077 break;
1078 /* fall through */
1079 case HFI1_SNOOP_IOCGETLINKSTATE:
1080 case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
1081 if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
1082 memset(&link_info, 0, sizeof(link_info));
1083 ret = copy_from_user(&link_info,
1084 (struct hfi1_link_info __user *)arg,
1085 sizeof(link_info));
1086 index = link_info.port_number;
1087 } else {
1088 ret = __get_user(index, (int __user *) arg);
1089 if (ret != 0)
1090 break;
1091 }
1092
1093 if (index > dd->num_pports - 1) {
1094 ret = -EINVAL;
1095 break;
1096 }
1097
1098 ppd = &dd->pport[index];
1099 if (!ppd) {
1100 ret = -EINVAL;
1101 break;
1102 }
1103 value = hfi1_ibphys_portstate(ppd);
1104 value <<= 4;
1105 value |= driver_lstate(ppd);
1106
1107 snoop_dbg("Link port | Link State: %d", value);
1108
1109 if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
1110 (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
1111 link_info.port_state = value;
1112 link_info.node_guid = cpu_to_be64(ppd->guid);
1113 link_info.link_speed_active =
1114 ppd->link_speed_active;
1115 link_info.link_width_active =
1116 ppd->link_width_active;
1117 ret = copy_to_user(
1118 (struct hfi1_link_info __user *)arg,
1119 &link_info, sizeof(link_info));
1120 } else {
1121 ret = __put_user(value, (int __user *)arg);
1122 }
1123 break;
1124
1125 case HFI1_SNOOP_IOCCLEARQUEUE:
1126 snoop_dbg("Clearing snoop queue");
1127 drain_snoop_list(&dd->hfi1_snoop.queue);
1128 break;
1129
1130 case HFI1_SNOOP_IOCCLEARFILTER:
1131 snoop_dbg("Clearing filter");
1132 if (dd->hfi1_snoop.filter_callback) {
1133 /* Drain packets first */
1134 drain_snoop_list(&dd->hfi1_snoop.queue);
1135 dd->hfi1_snoop.filter_callback = NULL;
1136 }
1137 kfree(dd->hfi1_snoop.filter_value);
1138 dd->hfi1_snoop.filter_value = NULL;
1139 break;
1140
1141 case HFI1_SNOOP_IOCSETFILTER:
1142 snoop_dbg("Setting filter");
1143 /* just copy command structure */
1144 argp = (unsigned long *)arg;
1145 ret = copy_from_user(&filter_cmd, (void __user *)argp,
1146 sizeof(filter_cmd));
1147 if (ret < 0) {
1148 pr_alert("Error copying filter command\n");
1149 break;
1150 }
1151 if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
1152 pr_alert("Invalid opcode in request\n");
1153 ret = -EINVAL;
1154 break;
1155 }
1156
1157 snoop_dbg("Opcode %d Len %d Ptr %p",
1158 filter_cmd.opcode, filter_cmd.length,
1159 filter_cmd.value_ptr);
1160
1161 filter_value = kzalloc(
1162 filter_cmd.length * sizeof(u8),
1163 GFP_KERNEL);
1164 if (!filter_value) {
1165 pr_alert("Not enough memory\n");
1166 ret = -ENOMEM;
1167 break;
1168 }
1169 /* copy remaining data from userspace */
1170 ret = copy_from_user((u8 *)filter_value,
1171 (void __user *)filter_cmd.value_ptr,
1172 filter_cmd.length);
1173 if (ret < 0) {
1174 kfree(filter_value);
1175 pr_alert("Error copying filter data\n");
1176 break;
1177 }
1178 /* Drain packets first */
1179 drain_snoop_list(&dd->hfi1_snoop.queue);
1180 dd->hfi1_snoop.filter_callback =
1181 hfi1_filters[filter_cmd.opcode].filter;
1182 /* just in case we see back to back sets */
1183 kfree(dd->hfi1_snoop.filter_value);
1184 dd->hfi1_snoop.filter_value = filter_value;
1185
1186 break;
1187 case HFI1_SNOOP_IOCGETVERSION:
1188 value = SNOOP_CAPTURE_VERSION;
1189 snoop_dbg("Getting version: %d", value);
1190 ret = __put_user(value, (int __user *)arg);
1191 break;
1192 case HFI1_SNOOP_IOCSET_OPTS:
1193 snoop_flags = 0;
1194 ret = __get_user(value, (int __user *) arg);
1195 if (ret != 0)
1196 break;
1197
1198 snoop_dbg("Setting snoop option %d", value);
1199 if (value & SNOOP_DROP_SEND)
1200 snoop_flags |= SNOOP_DROP_SEND;
1201 if (value & SNOOP_USE_METADATA)
1202 snoop_flags |= SNOOP_USE_METADATA;
1203 break;
1204 default:
1205 ret = -ENOTTY;
1206 break;
1207 }
1208 }
1209done:
1210 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
1211 return ret;
1212}
1213
1214static void snoop_list_add_tail(struct snoop_packet *packet,
1215 struct hfi1_devdata *dd)
1216{
1217 unsigned long flags = 0;
1218
1219 spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
1220 if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
1221 (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
1222 list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
1223 snoop_dbg("Added packet to list");
1224 }
1225
1226 /*
1227 * Technically the snoop device could have been closed while we were
1228 * waiting on the above lock, and it is gone now. The snoop mode_flag
1229 * will prevent us from adding the packet to the queue though.
1230 */
1231
1232 spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
1233 wake_up_interruptible(&dd->hfi1_snoop.waitq);
1234}
1235
1236static inline int hfi1_filter_check(void *val, const char *msg)
1237{
1238 if (!val) {
1239 snoop_dbg("Error invalid %s value for filter", msg);
1240 return HFI1_FILTER_ERR;
1241 }
1242 return 0;
1243}
1244
1245static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
1246{
1247 struct hfi1_ib_header *hdr;
1248 int ret;
1249
1250 ret = hfi1_filter_check(ibhdr, "header");
1251 if (ret)
1252 return ret;
1253 ret = hfi1_filter_check(value, "user");
1254 if (ret)
1255 return ret;
1256 hdr = (struct hfi1_ib_header *)ibhdr;
1257
1258 if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
1259 return HFI1_FILTER_HIT; /* matched */
1260
1261 return HFI1_FILTER_MISS; /* Not matched */
1262}
1263
1264static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
1265{
1266 struct hfi1_ib_header *hdr;
1267 int ret;
1268
1269 ret = hfi1_filter_check(ibhdr, "header");
1270 if (ret)
1271 return ret;
1272 ret = hfi1_filter_check(value, "user");
1273 if (ret)
1274 return ret;
1275
1276 hdr = (struct hfi1_ib_header *)ibhdr;
1277
1278 if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
1279 return HFI1_FILTER_HIT;
1280
1281 return HFI1_FILTER_MISS;
1282}
1283
1284/* Not valid for outgoing packets; the send handler passes NULL for data */
1285static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
1286 void *value)
1287{
1288 struct hfi1_ib_header *hdr;
1289 struct hfi1_other_headers *ohdr = NULL;
1290 struct ib_smp *smp = NULL;
1291 u32 qpn = 0;
1292 int ret;
1293
1294 ret = hfi1_filter_check(ibhdr, "header");
1295 if (ret)
1296 return ret;
1297 ret = hfi1_filter_check(packet_data, "packet_data");
1298 if (ret)
1299 return ret;
1300 ret = hfi1_filter_check(value, "user");
1301 if (ret)
1302 return ret;
1303
1304 hdr = (struct hfi1_ib_header *)ibhdr;
1305
1306 /* Check for GRH */
1307 if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
1308 ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
1309 else
1310 ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
1311
1312 qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
1313 if (qpn <= 1) {
1314 smp = (struct ib_smp *)packet_data;
1315 if (*((u8 *)value) == smp->mgmt_class)
1316 return HFI1_FILTER_HIT;
1317 else
1318 return HFI1_FILTER_MISS;
1319 }
1320 return HFI1_FILTER_ERR;
1321}
1322
1323static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
1324{
1325
1326 struct hfi1_ib_header *hdr;
1327 struct hfi1_other_headers *ohdr = NULL;
1328 int ret;
1329
1330 ret = hfi1_filter_check(ibhdr, "header");
1331 if (ret)
1332 return ret;
1333 ret = hfi1_filter_check(value, "user");
1334 if (ret)
1335 return ret;
1336
1337 hdr = (struct hfi1_ib_header *)ibhdr;
1338
1339 /* Check for GRH */
1340 if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
1341 ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
1342 else
1343 ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
1344 if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
1345 return HFI1_FILTER_HIT;
1346
1347 return HFI1_FILTER_MISS;
1348}
1349
1350static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
1351 void *value)
1352{
1353 u32 lnh = 0;
1354 u8 opcode = 0;
1355 struct hfi1_ib_header *hdr;
1356 struct hfi1_other_headers *ohdr = NULL;
1357 int ret;
1358
1359 ret = hfi1_filter_check(ibhdr, "header");
1360 if (ret)
1361 return ret;
1362 ret = hfi1_filter_check(value, "user");
1363 if (ret)
1364 return ret;
1365
1366 hdr = (struct hfi1_ib_header *)ibhdr;
1367
1368 lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
1369
1370 if (lnh == HFI1_LRH_BTH)
1371 ohdr = &hdr->u.oth;
1372 else if (lnh == HFI1_LRH_GRH)
1373 ohdr = &hdr->u.l.oth;
1374 else
1375 return HFI1_FILTER_ERR;
1376
1377 opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
1378
1379 if (*((u8 *)value) == ((opcode >> 5) & 0x7))
1380 return HFI1_FILTER_HIT;
1381
1382 return HFI1_FILTER_MISS;
1383}
1384
1385static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
1386 void *value)
1387{
1388 struct hfi1_ib_header *hdr;
1389 int ret;
1390
1391 ret = hfi1_filter_check(ibhdr, "header");
1392 if (ret)
1393 return ret;
1394 ret = hfi1_filter_check(value, "user");
1395 if (ret)
1396 return ret;
1397
1398 hdr = (struct hfi1_ib_header *)ibhdr;
1399
1400 if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
1401 return HFI1_FILTER_HIT;
1402
1403 return HFI1_FILTER_MISS;
1404}
1405
1406static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
1407{
1408
1409 u32 lnh = 0;
1410 struct hfi1_ib_header *hdr;
1411 struct hfi1_other_headers *ohdr = NULL;
1412 int ret;
1413
1414 ret = hfi1_filter_check(ibhdr, "header");
1415 if (ret)
1416 return ret;
1417 ret = hfi1_filter_check(value, "user");
1418 if (ret)
1419 return ret;
1420
1421 hdr = (struct hfi1_ib_header *)ibhdr;
1422
1423 lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
1424 if (lnh == HFI1_LRH_BTH)
1425 ohdr = &hdr->u.oth;
1426 else if (lnh == HFI1_LRH_GRH)
1427 ohdr = &hdr->u.l.oth;
1428 else
1429 return HFI1_FILTER_ERR;
1430
1431 /* The P_Key is a 16-bit entity; however, the top-most bit indicates
1432 * the type of membership: 0 for limited and 1 for full.
1433 * Limited members cannot accept information from other
1434 * limited members, but communication is allowed between
1435 * every other combination of membership.
1436 * Hence we omit the top-most bit when comparing while filtering.
1437 */
1438
1439 if ((*(u16 *)value & 0x7FFF) ==
1440 ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
1441 return HFI1_FILTER_HIT;
1442
1443 return HFI1_FILTER_MISS;
1444}
1445
1446/*
1447 * If packet_data is NULL then this is coming from one of the send functions.
1448 * Thus we know whether it is an ingress or egress packet.
1449 */
1450static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
1451{
1452 u8 user_dir = *(u8 *)value;
1453 int ret;
1454
1455 ret = hfi1_filter_check(value, "user");
1456 if (ret)
1457 return ret;
1458
1459 if (packet_data) {
1460 /* Incoming packet */
1461 if (user_dir & HFI1_SNOOP_INGRESS)
1462 return HFI1_FILTER_HIT;
1463 } else {
1464 /* Outgoing packet */
1465 if (user_dir & HFI1_SNOOP_EGRESS)
1466 return HFI1_FILTER_HIT;
1467 }
1468
1469 return HFI1_FILTER_MISS;
1470}
1471
1472/*
1473 * Allocate a snoop packet: the structure that is stored in the ring buffer,
1474 * not to be confused with an hfi1 packet type.
1475 */
1476static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
1477 u32 data_len,
1478 u32 md_len)
1479{
1480
1481 struct snoop_packet *packet = NULL;
1482
1483 packet = kzalloc(sizeof(struct snoop_packet) + hdr_len + data_len
1484 + md_len,
1485 GFP_ATOMIC | __GFP_NOWARN);
1486 if (likely(packet))
1487 INIT_LIST_HEAD(&packet->list);
1488
1489
1490 return packet;
1491}
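/*
 * The flexible data[] array of the returned packet is laid out as
 * [capture_md (md_len bytes)][headers (hdr_len)][payload (data_len)],
 * which is exactly what hfi1_snoop_read() copies back to user space.
 */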
1492
1493/*
1494 * Instead of having snoop and capture code intermixed with the recv functions
1495 * (both the interrupt handler and hfi1_ib_rcv()), we hijack the call and land
1496 * in here for snoop/capture; if snooping is not enabled the call goes through
1497 * as before. This gives us a single point to contain all of the snoop recv
1498 * logic. There is nothing special that needs to happen for bypass packets.
1499 * This routine should not try to look into the packet; it just copies it.
1500 * There is no guarantee for filters when it comes to bypass packets as there
1501 * is no specific support. Bottom line: this routine does not even know what
1502 * a bypass packet is.
1503 */
1504int snoop_recv_handler(struct hfi1_packet *packet)
1505{
1506 struct hfi1_pportdata *ppd = packet->rcd->ppd;
1507 struct hfi1_ib_header *hdr = packet->hdr;
1508 int header_size = packet->hlen;
1509 void *data = packet->ebuf;
1510 u32 tlen = packet->tlen;
1511 struct snoop_packet *s_packet = NULL;
1512 int ret;
1513 int snoop_mode = 0;
1514 u32 md_len = 0;
1515 struct capture_md md;
1516
1517 snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
1518 data);
1519
1520 trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
1521 data);
1522
1523 if (!ppd->dd->hfi1_snoop.filter_callback) {
1524 snoop_dbg("filter not set");
1525 ret = HFI1_FILTER_HIT;
1526 } else {
1527 ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
1528 ppd->dd->hfi1_snoop.filter_value);
1529 }
1530
1531 switch (ret) {
1532 case HFI1_FILTER_ERR:
1533 snoop_dbg("Error in filter call");
1534 break;
1535 case HFI1_FILTER_MISS:
1536 snoop_dbg("Filter Miss");
1537 break;
1538 case HFI1_FILTER_HIT:
1539
1540 if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
1541 snoop_mode = 1;
1542 if ((snoop_mode == 0) ||
1543 unlikely(snoop_flags & SNOOP_USE_METADATA))
1544 md_len = sizeof(struct capture_md);
1545
1546
1547 s_packet = allocate_snoop_packet(header_size,
1548 tlen - header_size,
1549 md_len);
1550
1551 if (unlikely(s_packet == NULL)) {
1552 dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
1553 break;
1554 }
1555
1556 if (md_len > 0) {
1557 memset(&md, 0, sizeof(struct capture_md));
1558 md.port = 1;
1559 md.dir = PKT_DIR_INGRESS;
1560 md.u.rhf = packet->rhf;
1561 memcpy(s_packet->data, &md, md_len);
1562 }
1563
1564 /* We should always have a header */
1565 if (hdr) {
1566 memcpy(s_packet->data + md_len, hdr, header_size);
1567 } else {
1568 dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
1569 kfree(s_packet);
1570 break;
1571 }
1572
1573 /*
1574 * Packets with no data are possible. If there is no data, there is no
1575 * need to take care of the last 4 bytes which are normally included
1576 * with data buffers and are counted in tlen. Since we kzalloc the
1577 * buffer we do not need to set any values, but if we decide not to
1578 * use kzalloc we should zero them.
1579 */
1580 if (data)
1581 memcpy(s_packet->data + header_size + md_len, data,
1582 tlen - header_size);
1583
1584 s_packet->total_len = tlen + md_len;
1585 snoop_list_add_tail(s_packet, ppd->dd);
1586
1587 /*
1588 * If we are snooping the packet not capturing then throw away
1589 * after adding to the list.
1590 */
1591 snoop_dbg("Capturing packet");
1592 if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
1593 snoop_dbg("Throwing packet away");
1594 /*
1595 * If we are dropping the packet we still may need to
1596 * handle the case where error flags are set, this is
1597 * normally done by the type specific handler but that
1598 * won't be called in this case.
1599 */
1600 if (unlikely(rhf_err_flags(packet->rhf)))
1601 handle_eflags(packet);
1602
1603 /* throw the packet on the floor */
1604 return RHF_RCV_CONTINUE;
1605 }
1606 break;
1607 default:
1608 break;
1609 }
1610
1611 /*
1612 * We do not care what type of packet came in here - just pass it off
1613 * to the normal handler.
1614 */
1615 return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
1616 (packet);
1617}
1618
1619/*
1620 * Handle snooping and capturing packets when sdma is being used.
1621 */
1622int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
1623 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1624 u32 plen, u32 dwords, u64 pbc)
1625{
1626 pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
1627 snoop_dbg("Unsupported Operation");
1628 return hfi1_verbs_send_dma(qp, ibhdr, hdrwords, ss, len, plen, dwords,
1629 0);
1630}
1631
1632/*
1633 * Handle snooping and capturing packets when pio is being used. Does not handle
1634 * bypass packets. The only way to send a bypass packet currently is to use the
1635 * diagpkt interface. When that interface is enabled, snoop/capture is not.
1636 */
1637int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
1638 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1639 u32 plen, u32 dwords, u64 pbc)
1640{
1641 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1642 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1643 struct snoop_packet *s_packet = NULL;
1644 u32 *hdr = (u32 *)&ahdr->ibh;
1645 u32 length = 0;
1646 struct hfi1_sge_state temp_ss;
1647 void *data = NULL;
1648 void *data_start = NULL;
1649 int ret;
1650 int snoop_mode = 0;
1651 int md_len = 0;
1652 struct capture_md md;
1653 u32 vl;
1654 u32 hdr_len = hdrwords << 2;
1655 u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh);
1656
1657 md.u.pbc = 0;
1658
1659 snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
1660 hdrwords, len, plen, dwords, tlen);
1661 if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
1662 snoop_mode = 1;
1663 if ((snoop_mode == 0) ||
1664 unlikely(snoop_flags & SNOOP_USE_METADATA))
1665 md_len = sizeof(struct capture_md);
1666
1667 /* not using ss->total_len as arg 2 b/c that does not count CRC */
1668 s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
1669
1670 if (unlikely(s_packet == NULL)) {
1671 dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
1672 goto out;
1673 }
1674
1675 s_packet->total_len = tlen + md_len;
1676
1677 if (md_len > 0) {
1678 memset(&md, 0, sizeof(struct capture_md));
1679 md.port = 1;
1680 md.dir = PKT_DIR_EGRESS;
1681 if (likely(pbc == 0)) {
1682 vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12;
1683 md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
1684 } else {
1685 md.u.pbc = 0;
1686 }
1687 memcpy(s_packet->data, &md, md_len);
1688 } else {
1689 md.u.pbc = pbc;
1690 }
1691
1692 /* Copy header */
1693 if (likely(hdr)) {
1694 memcpy(s_packet->data + md_len, hdr, hdr_len);
1695 } else {
1696 dd_dev_err(ppd->dd,
1697 "Unable to copy header to snoop/capture packet\n");
1698 kfree(s_packet);
1699 goto out;
1700 }
1701
1702 if (ss) {
1703 data = s_packet->data + hdr_len + md_len;
1704 data_start = data;
1705
1706 /*
1707 * Copy SGE State
1708 * The update_sge() function below will not modify the
1709 * individual SGEs in the array. It will make a copy each time
1710 * and operate on that. So we only need to copy this instance
1711 * and it won't impact PIO.
1712 */
1713 temp_ss = *ss;
1714 length = len;
1715
1716 snoop_dbg("Need to copy %d bytes", length);
1717 while (length) {
1718 void *addr = temp_ss.sge.vaddr;
1719 u32 slen = temp_ss.sge.length;
1720
1721 if (slen > length) {
1722 slen = length;
1723 snoop_dbg("slen %d > len %d", slen, length);
1724 }
1725 snoop_dbg("copy %d to %p", slen, addr);
1726 memcpy(data, addr, slen);
1727 update_sge(&temp_ss, slen);
1728 length -= slen;
1729 data += slen;
1730 snoop_dbg("data is now %p bytes left %d", data, length);
1731 }
1732 snoop_dbg("Completed SGE copy");
1733 }
1734
1735 /*
1736 * Why do the filter check down here? Because the event tracing has its
1737 * own filtering and we need to have walked the SGE list first.
1738 */
1739 if (!ppd->dd->hfi1_snoop.filter_callback) {
1740 snoop_dbg("filter not set\n");
1741 ret = HFI1_FILTER_HIT;
1742 } else {
1743 ret = ppd->dd->hfi1_snoop.filter_callback(
1744 &ahdr->ibh,
1745 NULL,
1746 ppd->dd->hfi1_snoop.filter_value);
1747 }
1748
1749 switch (ret) {
1750 case HFI1_FILTER_ERR:
1751 snoop_dbg("Error in filter call");
1752 /* fall through */
1753 case HFI1_FILTER_MISS:
1754 snoop_dbg("Filter Miss");
1755 kfree(s_packet);
1756 break;
1757 case HFI1_FILTER_HIT:
1758 snoop_dbg("Capturing packet");
1759 snoop_list_add_tail(s_packet, ppd->dd);
1760
1761 if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
1762 (ppd->dd->hfi1_snoop.mode_flag &
1763 HFI1_PORT_SNOOP_MODE))) {
1764 unsigned long flags;
1765
1766 snoop_dbg("Dropping packet");
1767 if (qp->s_wqe) {
1768 spin_lock_irqsave(&qp->s_lock, flags);
1769 hfi1_send_complete(
1770 qp,
1771 qp->s_wqe,
1772 IB_WC_SUCCESS);
1773 spin_unlock_irqrestore(&qp->s_lock, flags);
1774 } else if (qp->ibqp.qp_type == IB_QPT_RC) {
1775 spin_lock_irqsave(&qp->s_lock, flags);
1776 hfi1_rc_send_complete(qp, &ahdr->ibh);
1777 spin_unlock_irqrestore(&qp->s_lock, flags);
1778 }
1779 return 0;
1780 }
1781 break;
1782 default:
1783 kfree(s_packet);
1784 break;
1785 }
1786out:
1787 return hfi1_verbs_send_pio(qp, ahdr, hdrwords, ss, len, plen, dwords,
1788 md.u.pbc);
1789}
1790
1791/*
1792 * Callers of this must pass an hfi1_ib_header type for the from ptr. Currently
1793 * this can be used anywhere, but the intention is for inline ACKs for RC and
1794 * CCA packets. We don't restrict this usage though.
1795 */
1796void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
1797 u64 pbc, const void *from, size_t count)
1798{
1799 int snoop_mode = 0;
1800 int md_len = 0;
1801 struct capture_md md;
1802 struct snoop_packet *s_packet = NULL;
1803
1804 /*
1805 * count is in dwords so we need to convert to bytes.
1806 * We also need to account for CRC which would be tacked on by hardware.
1807 */
1808 int packet_len = (count << 2) + 4;
1809 int ret;
1810
1811 snoop_dbg("ACK OUT: len %d", packet_len);
1812
1813 if (!dd->hfi1_snoop.filter_callback) {
1814 snoop_dbg("filter not set");
1815 ret = HFI1_FILTER_HIT;
1816 } else {
1817 ret = dd->hfi1_snoop.filter_callback(
1818 (struct hfi1_ib_header *)from,
1819 NULL,
1820 dd->hfi1_snoop.filter_value);
1821 }
1822
1823 switch (ret) {
1824 case HFI1_FILTER_ERR:
1825 snoop_dbg("Error in filter call");
1826 /* fall through */
1827 case HFI1_FILTER_MISS:
1828 snoop_dbg("Filter Miss");
1829 break;
1830 case HFI1_FILTER_HIT:
1831 snoop_dbg("Capturing packet");
1832 if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
1833 snoop_mode = 1;
1834 if ((snoop_mode == 0) ||
1835 unlikely(snoop_flags & SNOOP_USE_METADATA))
1836 md_len = sizeof(struct capture_md);
1837
1838 s_packet = allocate_snoop_packet(packet_len, 0, md_len);
1839
1840 if (unlikely(s_packet == NULL)) {
1841 dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
1842 goto inline_pio_out;
1843 }
1844
1845 s_packet->total_len = packet_len + md_len;
1846
1847 /* Fill in the metadata for the packet */
1848 if (md_len > 0) {
1849 memset(&md, 0, sizeof(struct capture_md));
1850 md.port = 1;
1851 md.dir = PKT_DIR_EGRESS;
1852 md.u.pbc = pbc;
1853 memcpy(s_packet->data, &md, md_len);
1854 }
1855
1856 /* Add the packet data which is a single buffer */
1857 memcpy(s_packet->data + md_len, from, packet_len);
1858
1859 snoop_list_add_tail(s_packet, dd);
1860
1861 if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
1862 snoop_dbg("Dropping packet");
1863 return;
1864 }
1865 break;
1866 default:
1867 break;
1868 }
1869
1870inline_pio_out:
1871 pio_copy(dd, pbuf, pbc, from, count);
1872
1873}
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
new file mode 100644
index 000000000000..e03bd735173c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/dma.c
@@ -0,0 +1,186 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/types.h>
51#include <linux/scatterlist.h>
52
53#include "verbs.h"
54
55#define BAD_DMA_ADDRESS ((u64) 0)
56
57/*
58 * The following functions implement driver specific replacements
59 * for the ib_dma_*() functions.
60 *
61 * These functions return kernel virtual addresses instead of
62 * device bus addresses since the driver uses the CPU to copy
63 * data instead of using hardware DMA.
64 */
65
66static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
67{
68 return dma_addr == BAD_DMA_ADDRESS;
69}
70
71static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
72 size_t size, enum dma_data_direction direction)
73{
74 if (WARN_ON(!valid_dma_direction(direction)))
75 return BAD_DMA_ADDRESS;
76
77 return (u64) cpu_addr;
78}
79
80static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
81 enum dma_data_direction direction)
82{
83 /* This is a stub, nothing to be done here */
84}
85
86static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
87 unsigned long offset, size_t size,
88 enum dma_data_direction direction)
89{
90 u64 addr;
91
92 if (WARN_ON(!valid_dma_direction(direction)))
93 return BAD_DMA_ADDRESS;
94
95 if (offset + size > PAGE_SIZE)
96 return BAD_DMA_ADDRESS;
97
98 addr = (u64) page_address(page);
99 if (addr)
100 addr += offset;
101
102 return addr;
103}
104
105static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
106 enum dma_data_direction direction)
107{
108 /* This is a stub, nothing to be done here */
109}
110
111static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
112 int nents, enum dma_data_direction direction)
113{
114 struct scatterlist *sg;
115 u64 addr;
116 int i;
117 int ret = nents;
118
119 if (WARN_ON(!valid_dma_direction(direction)))
120 return BAD_DMA_ADDRESS;
121
122 for_each_sg(sgl, sg, nents, i) {
123 addr = (u64) page_address(sg_page(sg));
124 if (!addr) {
125 ret = 0;
126 break;
127 }
128 sg->dma_address = addr + sg->offset;
129#ifdef CONFIG_NEED_SG_DMA_LENGTH
130 sg->dma_length = sg->length;
131#endif
132 }
133 return ret;
134}
135
136static void hfi1_unmap_sg(struct ib_device *dev,
137 struct scatterlist *sg, int nents,
138 enum dma_data_direction direction)
139{
140 /* This is a stub, nothing to be done here */
141}
142
143static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
144 size_t size, enum dma_data_direction dir)
145{
146}
147
148static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
149 size_t size,
150 enum dma_data_direction dir)
151{
152}
153
154static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
155 u64 *dma_handle, gfp_t flag)
156{
157 struct page *p;
158 void *addr = NULL;
159
160 p = alloc_pages(flag, get_order(size));
161 if (p)
162 addr = page_address(p);
163 if (dma_handle)
164 *dma_handle = (u64) addr;
165 return addr;
166}
167
168static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
169 void *cpu_addr, u64 dma_handle)
170{
171 free_pages((unsigned long) cpu_addr, get_order(size));
172}
173
174struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
175 .mapping_error = hfi1_mapping_error,
176 .map_single = hfi1_dma_map_single,
177 .unmap_single = hfi1_dma_unmap_single,
178 .map_page = hfi1_dma_map_page,
179 .unmap_page = hfi1_dma_unmap_page,
180 .map_sg = hfi1_map_sg,
181 .unmap_sg = hfi1_unmap_sg,
182 .sync_single_for_cpu = hfi1_sync_single_for_cpu,
183 .sync_single_for_device = hfi1_sync_single_for_device,
184 .alloc_coherent = hfi1_dma_alloc_coherent,
185 .free_coherent = hfi1_dma_free_coherent
186};
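/*
 * These ops are meant to be installed as the ib_device's dma_ops so that
 * core verbs code calling ib_dma_map_*() gets plain kernel virtual
 * addresses back, matching the driver's CPU-copy data path.
 */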
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
new file mode 100644
index 000000000000..c0a59001e5cd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -0,0 +1,1241 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/spinlock.h>
52#include <linux/pci.h>
53#include <linux/io.h>
54#include <linux/delay.h>
55#include <linux/netdevice.h>
56#include <linux/vmalloc.h>
57#include <linux/module.h>
58#include <linux/prefetch.h>
59
60#include "hfi.h"
61#include "trace.h"
62#include "qp.h"
63#include "sdma.h"
64
65#undef pr_fmt
66#define pr_fmt(fmt) DRIVER_NAME ": " fmt
67
68/*
69 * The size has to be longer than this string, so we can append
70 * board/chip information to it in the initialization code.
71 */
72const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
73
74DEFINE_SPINLOCK(hfi1_devs_lock);
75LIST_HEAD(hfi1_dev_list);
76DEFINE_MUTEX(hfi1_mutex); /* general driver use */
77
78unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
79module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
80MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
81
82unsigned int hfi1_cu = 1;
83module_param_named(cu, hfi1_cu, uint, S_IRUGO);
84MODULE_PARM_DESC(cu, "Credit return units");
85
86unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
87static int hfi1_caps_set(const char *, const struct kernel_param *);
88static int hfi1_caps_get(char *, const struct kernel_param *);
89static const struct kernel_param_ops cap_ops = {
90 .set = hfi1_caps_set,
91 .get = hfi1_caps_get
92};
93module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
94MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
95
96MODULE_LICENSE("Dual BSD/GPL");
97MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
98MODULE_VERSION(HFI1_DRIVER_VERSION);
99
100/*
101 * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
102 */
103#define MAX_PKT_RECV 64
104#define EGR_HEAD_UPDATE_THRESHOLD 16
105
106struct hfi1_ib_stats hfi1_stats;
107
108static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
109{
110 int ret = 0;
111 unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
112 cap_mask = *cap_mask_ptr, value, diff,
113 write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
114 HFI1_CAP_WRITABLE_MASK);
115
116 ret = kstrtoul(val, 0, &value);
117 if (ret) {
118 pr_warn("Invalid module parameter value for 'cap_mask'\n");
119 goto done;
120 }
121 /* Get the changed bits (except the locked bit) */
122 diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
123
124 /* Remove any bits that are not allowed to change after driver load */
125 if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
126 pr_warn("Ignoring non-writable capability bits %#lx\n",
127 diff & ~write_mask);
128 diff &= write_mask;
129 }
130
131 /* Mask off any reserved bits */
132 diff &= ~HFI1_CAP_RESERVED_MASK;
133 /* Clear any previously set and changing bits */
134 cap_mask &= ~diff;
135 /* Update the bits with the new capability */
136 cap_mask |= (value & diff);
137 /* Check for any kernel/user restrictions */
138 diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
139 ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
140 cap_mask &= ~diff;
141 /* Set the bitmask to the final set */
142 *cap_mask_ptr = cap_mask;
143done:
144 return ret;
145}
146
147static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
148{
149 unsigned long cap_mask = *(unsigned long *)kp->arg;
150
151 cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
152 cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
153
154 return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
155}
156
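/*
 * Editor's note: a self-contained illustration (not part of this patch) of
 * the cap_mask update sequence in hfi1_caps_set() above.  The mask values
 * below are hypothetical stand-ins chosen only to show the core
 * diff/clear/set bit manipulation; the driver additionally applies the
 * write mask only once the capability set is locked, and enforces the
 * kernel/user "must have" pairing, which this sketch omits.
 */
#include <stdio.h>

#define EX_WRITE_MASK    0x00ffUL	/* assumed: bits allowed to change */
#define EX_LOCKED_BIT    0x8000UL	/* assumed: "locked" marker bit */
#define EX_RESERVED_MASK 0x0f00UL	/* assumed: bits that may never change */

static unsigned long ex_caps_set(unsigned long cur, unsigned long value)
{
	/* changed bits, ignoring the locked marker */
	unsigned long diff = value ^ (cur & ~EX_LOCKED_BIT);

	diff &= EX_WRITE_MASK;		/* keep only writable bits */
	diff &= ~EX_RESERVED_MASK;	/* drop reserved bits */
	cur &= ~diff;			/* clear the bits that are changing */
	cur |= (value & diff);		/* set them to the requested value */
	return cur;
}

int main(void)
{
	printf("0x%lx\n", ex_caps_set(0x8034UL, 0x00ffUL));	/* prints 0x80ff */
	return 0;
}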
157const char *get_unit_name(int unit)
158{
159 static char iname[16];
160
161 snprintf(iname, sizeof(iname), DRIVER_NAME"_%u", unit);
162 return iname;
163}
164
165/*
166 * Return count of units with at least one port ACTIVE.
167 */
168int hfi1_count_active_units(void)
169{
170 struct hfi1_devdata *dd;
171 struct hfi1_pportdata *ppd;
172 unsigned long flags;
173 int pidx, nunits_active = 0;
174
175 spin_lock_irqsave(&hfi1_devs_lock, flags);
176 list_for_each_entry(dd, &hfi1_dev_list, list) {
177 if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
178 continue;
179 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
180 ppd = dd->pport + pidx;
181 if (ppd->lid && ppd->linkup) {
182 nunits_active++;
183 break;
184 }
185 }
186 }
187 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
188 return nunits_active;
189}
190
191/*
192 * Return count of all units, optionally return in arguments
193 * the number of usable (present) units, and the number of
194 * ports that are up.
195 */
196int hfi1_count_units(int *npresentp, int *nupp)
197{
198 int nunits = 0, npresent = 0, nup = 0;
199 struct hfi1_devdata *dd;
200 unsigned long flags;
201 int pidx;
202 struct hfi1_pportdata *ppd;
203
204 spin_lock_irqsave(&hfi1_devs_lock, flags);
205
206 list_for_each_entry(dd, &hfi1_dev_list, list) {
207 nunits++;
208 if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
209 npresent++;
210 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
211 ppd = dd->pport + pidx;
212 if (ppd->lid && ppd->linkup)
213 nup++;
214 }
215 }
216
217 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
218
219 if (npresentp)
220 *npresentp = npresent;
221 if (nupp)
222 *nupp = nup;
223
224 return nunits;
225}
226
227/*
228 * Get address of eager buffer from its index (allocated in chunks, not
229 * contiguous).
230 */
231static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
232 u8 *update)
233{
234 u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
235
236 *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
237 return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
238 (offset * RCV_BUF_BLOCK_SIZE));
239}
240
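/*
 * Editor's note: a standalone sketch (not part of this patch) of the
 * chunked eager-buffer addressing used by get_egrbuf() above: the RHF
 * supplies a chunk index and a block offset within that chunk, and the
 * address is chunk_base[idx] + offset * block_size.  The base addresses
 * and block size here are made up purely for illustration.
 */
#include <stdio.h>

#define EX_BLOCK_SIZE 64ULL

int main(void)
{
	/* pretend chunk base addresses, as the driver keeps in egrbufs.rcvtids[] */
	unsigned long long ex_chunk_base[4] = { 0x1000, 0x9000, 0x11000, 0x19000 };
	unsigned int idx = 2, offset = 5;
	unsigned long long addr = ex_chunk_base[idx] + offset * EX_BLOCK_SIZE;

	printf("eager buffer at 0x%llx\n", addr);	/* 0x11000 + 5 * 64 = 0x11140 */
	return 0;
}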
241/*
242 * Validate and encode a given RcvArray Buffer size.
243 * The function will check whether the given size falls within
244 * allowed size ranges for the respective type and, optionally,
245 * return the proper encoding.
246 */
247inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
248{
249 if (unlikely(!IS_ALIGNED(size, PAGE_SIZE)))
250 return 0;
251 if (unlikely(size < MIN_EAGER_BUFFER))
252 return 0;
253 if (size >
254 (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
255 return 0;
256 if (encoded)
257 *encoded = ilog2(size / PAGE_SIZE) + 1;
258 return 1;
259}
260
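/*
 * Editor's note: a standalone sketch (not part of this patch) of the
 * RcvArray buffer-size encoding used by hfi1_rcvbuf_validate() above:
 * ilog2(size / PAGE_SIZE) + 1.  PAGE_SIZE is assumed to be 4 KiB here
 * purely for illustration.
 */
#include <stdio.h>

#define EX_PAGE_SIZE 4096u

static unsigned int ex_encode(unsigned int size)
{
	unsigned int pages = size / EX_PAGE_SIZE, log = 0;

	while (pages > 1) {		/* integer log2, like ilog2() */
		pages >>= 1;
		log++;
	}
	return log + 1;
}

int main(void)
{
	/* 4 KiB -> 1, 8 KiB -> 2, 64 KiB -> 5 */
	printf("%u %u %u\n", ex_encode(4096), ex_encode(8192), ex_encode(65536));
	return 0;
}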
261static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
262 struct hfi1_packet *packet)
263{
264 struct hfi1_message_header *rhdr = packet->hdr;
265 u32 rte = rhf_rcv_type_err(packet->rhf);
266 int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
267 struct hfi1_ibport *ibp = &ppd->ibport_data;
268
269 if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
270 return;
271
272 if (packet->rhf & RHF_TID_ERR) {
273 /* For TIDERR and RC QPs preemptively schedule a NAK */
274 struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
275 struct hfi1_other_headers *ohdr = NULL;
276 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
277 u16 lid = be16_to_cpu(hdr->lrh[1]);
278 u32 qp_num;
279 u32 rcv_flags = 0;
280
281 /* Sanity check packet */
282 if (tlen < 24)
283 goto drop;
284
285 /* Check for GRH */
286 if (lnh == HFI1_LRH_BTH)
287 ohdr = &hdr->u.oth;
288 else if (lnh == HFI1_LRH_GRH) {
289 u32 vtf;
290
291 ohdr = &hdr->u.l.oth;
292 if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
293 goto drop;
294 vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
295 if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
296 goto drop;
297 rcv_flags |= HFI1_HAS_GRH;
298 } else
299 goto drop;
300
301 /* Get the destination QP number. */
302 qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
303 if (lid < HFI1_MULTICAST_LID_BASE) {
304 struct hfi1_qp *qp;
305
306 rcu_read_lock();
307 qp = hfi1_lookup_qpn(ibp, qp_num);
308 if (!qp) {
309 rcu_read_unlock();
310 goto drop;
311 }
312
313 /*
314 * Handle only RC QPs - for other QP types drop error
315 * packet.
316 */
317 spin_lock(&qp->r_lock);
318
319 /* Check for valid receive state. */
320 if (!(ib_hfi1_state_ops[qp->state] &
321 HFI1_PROCESS_RECV_OK)) {
322 ibp->n_pkt_drops++;
323 }
324
325 switch (qp->ibqp.qp_type) {
326 case IB_QPT_RC:
327 hfi1_rc_hdrerr(
328 rcd,
329 hdr,
330 rcv_flags,
331 qp);
332 break;
333 default:
334 /* For now don't handle any other QP types */
335 break;
336 }
337
338 spin_unlock(&qp->r_lock);
339 rcu_read_unlock();
340 } /* Unicast QP */
341 } /* Valid packet with TIDErr */
342
343 /* handle "RcvTypeErr" flags */
344 switch (rte) {
345 case RHF_RTE_ERROR_OP_CODE_ERR:
346 {
347 u32 opcode;
348 void *ebuf = NULL;
349 __be32 *bth = NULL;
350
351 if (rhf_use_egr_bfr(packet->rhf))
352 ebuf = packet->ebuf;
353
354 if (ebuf == NULL)
355 goto drop; /* this should never happen */
356
357 if (lnh == HFI1_LRH_BTH)
358 bth = (__be32 *)ebuf;
359 else if (lnh == HFI1_LRH_GRH)
360 bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
361 else
362 goto drop;
363
364 opcode = be32_to_cpu(bth[0]) >> 24;
365 opcode &= 0xff;
366
367 if (opcode == IB_OPCODE_CNP) {
368 /*
369 * Only in pre-B0 h/w is the CNP_OPCODE handled
370 * via this code path (errata 291394).
371 */
372 struct hfi1_qp *qp = NULL;
373 u32 lqpn, rqpn;
374 u16 rlid;
375 u8 svc_type, sl, sc5;
376
377 sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
378 if (rhf_dc_info(packet->rhf))
379 sc5 |= 0x10;
380 sl = ibp->sc_to_sl[sc5];
381
382 lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK;
383 rcu_read_lock();
384 qp = hfi1_lookup_qpn(ibp, lqpn);
385 if (qp == NULL) {
386 rcu_read_unlock();
387 goto drop;
388 }
389
390 switch (qp->ibqp.qp_type) {
391 case IB_QPT_UD:
392 rlid = 0;
393 rqpn = 0;
394 svc_type = IB_CC_SVCTYPE_UD;
395 break;
396 case IB_QPT_UC:
397 rlid = be16_to_cpu(rhdr->lrh[3]);
398 rqpn = qp->remote_qpn;
399 svc_type = IB_CC_SVCTYPE_UC;
400 break;
401 default:
402 goto drop;
403 }
404
405 process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
406 rcu_read_unlock();
407 }
408
409 packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
410 break;
411 }
412 default:
413 break;
414 }
415
416drop:
417 return;
418}
419
420static inline void init_packet(struct hfi1_ctxtdata *rcd,
421 struct hfi1_packet *packet)
422{
423
424 packet->rsize = rcd->rcvhdrqentsize; /* words */
425 packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
426 packet->rcd = rcd;
427 packet->updegr = 0;
428 packet->etail = -1;
429 packet->rhf_addr = (__le32 *) rcd->rcvhdrq + rcd->head +
430 rcd->dd->rhf_offset;
431 packet->rhf = rhf_to_cpu(packet->rhf_addr);
432 packet->rhqoff = rcd->head;
433 packet->numpkt = 0;
434 packet->rcv_flags = 0;
435}
436
437#ifndef CONFIG_PRESCAN_RXQ
438static void prescan_rxq(struct hfi1_packet *packet) {}
439#else /* CONFIG_PRESCAN_RXQ */
440static int prescan_receive_queue;
441
442static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr,
443 struct hfi1_other_headers *ohdr,
444 u64 rhf, struct ib_grh *grh)
445{
446 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
447 u32 bth1;
448 u8 sc5, svc_type;
449 int is_fecn, is_becn;
450
451 switch (qp->ibqp.qp_type) {
452 case IB_QPT_UD:
453 svc_type = IB_CC_SVCTYPE_UD;
454 break;
455 case IB_QPT_UC: /* LATER */
456 case IB_QPT_RC: /* LATER */
457 default:
458 return;
459 }
460
461 is_fecn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
462 HFI1_FECN_MASK;
463 is_becn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
464 HFI1_BECN_MASK;
465
466 sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
467 if (rhf_dc_info(rhf))
468 sc5 |= 0x10;
469
470 if (is_fecn) {
471 u32 src_qpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
472 u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
473 u16 dlid = be16_to_cpu(hdr->lrh[1]);
474 u16 slid = be16_to_cpu(hdr->lrh[3]);
475
476 return_cnp(ibp, qp, src_qpn, pkey, dlid, slid, sc5, grh);
477 }
478
479 if (is_becn) {
480 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
481 u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
482 u8 sl = ibp->sc_to_sl[sc5];
483
484 process_becn(ppd, sl, 0, lqpn, 0, svc_type);
485 }
486
487 /* turn off BECN, or FECN */
488 bth1 = be32_to_cpu(ohdr->bth[1]);
489 bth1 &= ~(HFI1_FECN_MASK << HFI1_FECN_SHIFT);
490 bth1 &= ~(HFI1_BECN_MASK << HFI1_BECN_SHIFT);
491 ohdr->bth[1] = cpu_to_be32(bth1);
492}
493
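/*
 * Editor's note: a self-contained sketch (not part of this patch) of the
 * FECN/BECN handling in process_ecn() above: test the two congestion bits
 * carried in BTH word 1, then clear them so later processing does not see
 * them again.  The bit positions are illustrative stand-ins for
 * HFI1_FECN_SHIFT and HFI1_BECN_SHIFT.
 */
#include <stdio.h>

#define EX_FECN_SHIFT 31
#define EX_BECN_SHIFT 30

int main(void)
{
	unsigned int bth1 = 0xc0001234u;		/* both congestion bits set */
	int is_fecn = (bth1 >> EX_FECN_SHIFT) & 1;
	int is_becn = (bth1 >> EX_BECN_SHIFT) & 1;

	bth1 &= ~(1u << EX_FECN_SHIFT);			/* toggle the notifications off */
	bth1 &= ~(1u << EX_BECN_SHIFT);
	printf("fecn=%d becn=%d bth1=0x%x\n", is_fecn, is_becn, bth1);
	return 0;
}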
494struct ps_mdata {
495 struct hfi1_ctxtdata *rcd;
496 u32 rsize;
497 u32 maxcnt;
498 u32 ps_head;
499 u32 ps_tail;
500 u32 ps_seq;
501};
502
503static inline void init_ps_mdata(struct ps_mdata *mdata,
504 struct hfi1_packet *packet)
505{
506 struct hfi1_ctxtdata *rcd = packet->rcd;
507
508 mdata->rcd = rcd;
509 mdata->rsize = packet->rsize;
510 mdata->maxcnt = packet->maxcnt;
511
512 if (rcd->ps_state.initialized == 0) {
513 mdata->ps_head = packet->rhqoff;
514 rcd->ps_state.initialized++;
515 } else
516 mdata->ps_head = rcd->ps_state.ps_head;
517
518 if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
519 mdata->ps_tail = packet->hdrqtail;
520 mdata->ps_seq = 0; /* not used with DMA_RTAIL */
521 } else {
522 mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
523 mdata->ps_seq = rcd->seq_cnt;
524 }
525}
526
527static inline int ps_done(struct ps_mdata *mdata, u64 rhf)
528{
529 if (HFI1_CAP_IS_KSET(DMA_RTAIL))
530 return mdata->ps_head == mdata->ps_tail;
531 return mdata->ps_seq != rhf_rcv_seq(rhf);
532}
533
534static inline void update_ps_mdata(struct ps_mdata *mdata)
535{
536 struct hfi1_ctxtdata *rcd = mdata->rcd;
537
538 mdata->ps_head += mdata->rsize;
539 if (mdata->ps_head > mdata->maxcnt)
540 mdata->ps_head = 0;
541 rcd->ps_state.ps_head = mdata->ps_head;
542 if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
543 if (++mdata->ps_seq > 13)
544 mdata->ps_seq = 1;
545 }
546}
547
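/*
 * Editor's note: an illustrative sketch (not part of this patch) of the
 * header-queue walk performed by update_ps_mdata() above when DMA_RTAIL
 * is not in use: the head advances by the entry size, wraps past maxcnt,
 * and the expected sequence number cycles through 1..13.  The sizes here
 * are made up for illustration.
 */
#include <stdio.h>

struct ex_walk {
	unsigned int head, rsize, maxcnt, seq;
};

static void ex_advance(struct ex_walk *w)
{
	w->head += w->rsize;
	if (w->head > w->maxcnt)
		w->head = 0;		/* wrap the circular header queue */
	if (++w->seq > 13)
		w->seq = 1;		/* sequence numbers run 1..13 */
}

int main(void)
{
	struct ex_walk w = { .head = 0, .rsize = 32, .maxcnt = 128, .seq = 1 };
	int i;

	for (i = 0; i < 6; i++) {
		printf("head %u seq %u\n", w.head, w.seq);
		ex_advance(&w);
	}
	return 0;
}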
548/*
549 * prescan_rxq - search through the receive queue looking for packets
550 * containing Explicit Congestion Notifications (FECNs or BECNs).
551 * When an ECN is found, process the Congestion Notification, and toggle
552 * it off.
553 */
554static void prescan_rxq(struct hfi1_packet *packet)
555{
556 struct hfi1_ctxtdata *rcd = packet->rcd;
557 struct ps_mdata mdata;
558
559 if (!prescan_receive_queue)
560 return;
561
562 init_ps_mdata(&mdata, packet);
563
564 while (1) {
565 struct hfi1_devdata *dd = rcd->dd;
566 struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
567 __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head +
568 dd->rhf_offset;
569 struct hfi1_qp *qp;
570 struct hfi1_ib_header *hdr;
571 struct hfi1_other_headers *ohdr;
572 struct ib_grh *grh = NULL;
573 u64 rhf = rhf_to_cpu(rhf_addr);
574 u32 etype = rhf_rcv_type(rhf), qpn;
575 int is_ecn = 0;
576 u8 lnh;
577
578 if (ps_done(&mdata, rhf))
579 break;
580
581 if (etype != RHF_RCV_TYPE_IB)
582 goto next;
583
584 hdr = (struct hfi1_ib_header *)
585 hfi1_get_msgheader(dd, rhf_addr);
586 lnh = be16_to_cpu(hdr->lrh[0]) & 3;
587
588 if (lnh == HFI1_LRH_BTH)
589 ohdr = &hdr->u.oth;
590 else if (lnh == HFI1_LRH_GRH) {
591 ohdr = &hdr->u.l.oth;
592 grh = &hdr->u.l.grh;
593 } else
594 goto next; /* just in case */
595
596 is_ecn |= be32_to_cpu(ohdr->bth[1]) &
597 (HFI1_FECN_MASK << HFI1_FECN_SHIFT);
598 is_ecn |= be32_to_cpu(ohdr->bth[1]) &
599 (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
600
601 if (!is_ecn)
602 goto next;
603
604 qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
605 rcu_read_lock();
606 qp = hfi1_lookup_qpn(ibp, qpn);
607
608 if (qp == NULL) {
609 rcu_read_unlock();
610 goto next;
611 }
612
613 process_ecn(qp, hdr, ohdr, rhf, grh);
614 rcu_read_unlock();
615next:
616 update_ps_mdata(&mdata);
617 }
618}
619#endif /* CONFIG_PRESCAN_RXQ */
620
621#define RCV_PKT_OK 0x0
622#define RCV_PKT_MAX 0x1
623
624static inline int process_rcv_packet(struct hfi1_packet *packet)
625{
626 int ret = RCV_PKT_OK;
627
628 packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
629 packet->rhf_addr);
630 packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
631 packet->etype = rhf_rcv_type(packet->rhf);
632 /* total length */
633 packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
634 /* retrieve eager buffer details */
635 packet->ebuf = NULL;
636 if (rhf_use_egr_bfr(packet->rhf)) {
637 packet->etail = rhf_egr_index(packet->rhf);
638 packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
639 &packet->updegr);
640 /*
641 * Prefetch the contents of the eager buffer. It is
642 * OK to send a negative length to prefetch_range().
643 * The +2 is the size of the RHF.
644 */
645 prefetch_range(packet->ebuf,
646 packet->tlen - ((packet->rcd->rcvhdrqentsize -
647 (rhf_hdrq_offset(packet->rhf)+2)) * 4));
648 }
649
650 /*
651 * Call a type specific handler for the packet. We
652 * should be able to trust that etype won't be beyond
653 * the range of valid indexes. If so something is really
654 * wrong and we can probably just let things come
655 * crashing down. There is no need to eat another
656 * comparison in this performance critical code.
657 */
658 packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
659 packet->numpkt++;
660
661 /* Set up for the next packet */
662 packet->rhqoff += packet->rsize;
663 if (packet->rhqoff >= packet->maxcnt)
664 packet->rhqoff = 0;
665
666 if (packet->numpkt == MAX_PKT_RECV) {
667 ret = RCV_PKT_MAX;
668 this_cpu_inc(*packet->rcd->dd->rcv_limit);
669 }
670
671 packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff +
672 packet->rcd->dd->rhf_offset;
673 packet->rhf = rhf_to_cpu(packet->rhf_addr);
674
675 return ret;
676}
677
678static inline void process_rcv_update(int last, struct hfi1_packet *packet)
679{
680 /*
681 * Update head regs etc. every 16 packets, if not the last pkt,
682 * to help prevent rcvhdrq overflows when many packets
683 * are processed and the queue is nearly full.
684 * Don't request an interrupt for intermediate updates.
685 */
686 if (!last && !(packet->numpkt & 0xf)) {
687 update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
688 packet->etail, 0, 0);
689 packet->updegr = 0;
690 }
691 packet->rcv_flags = 0;
692}
693
694static inline void finish_packet(struct hfi1_packet *packet)
695{
696
697 /*
698 * Nothing we need to free for the packet.
699 *
700 * The only thing we need to do is a final update and call for an
701 * interrupt.
702 */
703 update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
704 packet->etail, rcv_intr_dynamic, packet->numpkt);
705
706}
707
708static inline void process_rcv_qp_work(struct hfi1_packet *packet)
709{
710
711 struct hfi1_ctxtdata *rcd;
712 struct hfi1_qp *qp, *nqp;
713
714 rcd = packet->rcd;
715 rcd->head = packet->rhqoff;
716
717 /*
718 * Iterate over all QPs waiting to respond.
719 * The list won't change since the IRQ is only run on one CPU.
720 */
721 list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
722 list_del_init(&qp->rspwait);
723 if (qp->r_flags & HFI1_R_RSP_NAK) {
724 qp->r_flags &= ~HFI1_R_RSP_NAK;
725 hfi1_send_rc_ack(rcd, qp, 0);
726 }
727 if (qp->r_flags & HFI1_R_RSP_SEND) {
728 unsigned long flags;
729
730 qp->r_flags &= ~HFI1_R_RSP_SEND;
731 spin_lock_irqsave(&qp->s_lock, flags);
732 if (ib_hfi1_state_ops[qp->state] &
733 HFI1_PROCESS_OR_FLUSH_SEND)
734 hfi1_schedule_send(qp);
735 spin_unlock_irqrestore(&qp->s_lock, flags);
736 }
737 if (atomic_dec_and_test(&qp->refcount))
738 wake_up(&qp->wait);
739 }
740}
741
742/*
743 * Handle receive interrupts when using the no dma rtail option.
744 */
745void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd)
746{
747 u32 seq;
748 int last = 0;
749 struct hfi1_packet packet;
750
751 init_packet(rcd, &packet);
752 seq = rhf_rcv_seq(packet.rhf);
753 if (seq != rcd->seq_cnt)
754 goto bail;
755
756 prescan_rxq(&packet);
757
758 while (!last) {
759 last = process_rcv_packet(&packet);
760 seq = rhf_rcv_seq(packet.rhf);
761 if (++rcd->seq_cnt > 13)
762 rcd->seq_cnt = 1;
763 if (seq != rcd->seq_cnt)
764 last = 1;
765 process_rcv_update(last, &packet);
766 }
767 process_rcv_qp_work(&packet);
768bail:
769 finish_packet(&packet);
770}
771
772void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd)
773{
774 u32 hdrqtail;
775 int last = 0;
776 struct hfi1_packet packet;
777
778 init_packet(rcd, &packet);
779 hdrqtail = get_rcvhdrtail(rcd);
780 if (packet.rhqoff == hdrqtail)
781 goto bail;
782 smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
783
784 prescan_rxq(&packet);
785
786 while (!last) {
787 last = process_rcv_packet(&packet);
788 if (packet.rhqoff == hdrqtail)
789 last = 1;
790 process_rcv_update(last, &packet);
791 }
792 process_rcv_qp_work(&packet);
793bail:
794 finish_packet(&packet);
795
796}
797
798static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
799{
800 int i;
801
802 for (i = 0; i < dd->first_user_ctxt; i++)
803 dd->rcd[i]->do_interrupt =
804 &handle_receive_interrupt_nodma_rtail;
805}
806
807static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
808{
809 int i;
810
811 for (i = 0; i < dd->first_user_ctxt; i++)
812 dd->rcd[i]->do_interrupt =
813 &handle_receive_interrupt_dma_rtail;
814}
815
816/*
817 * handle_receive_interrupt - receive a packet
818 * @rcd: the context
819 *
820 * Called from interrupt handler for errors or receive interrupt.
821 * This is the slow path interrupt handler.
822 */
823void handle_receive_interrupt(struct hfi1_ctxtdata *rcd)
824{
825
826 struct hfi1_devdata *dd = rcd->dd;
827 u32 hdrqtail;
828 int last = 0, needset = 1;
829 struct hfi1_packet packet;
830
831 init_packet(rcd, &packet);
832
833 if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
834 u32 seq = rhf_rcv_seq(packet.rhf);
835
836 if (seq != rcd->seq_cnt)
837 goto bail;
838 hdrqtail = 0;
839 } else {
840 hdrqtail = get_rcvhdrtail(rcd);
841 if (packet.rhqoff == hdrqtail)
842 goto bail;
843 smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
844 }
845
846 prescan_rxq(&packet);
847
848 while (!last) {
849
850 if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet,
851 DROP_PACKET_OFF) == DROP_PACKET_ON)) {
852 dd->do_drop = 0;
853
854 /* On to the next packet */
855 packet.rhqoff += packet.rsize;
856 packet.rhf_addr = (__le32 *) rcd->rcvhdrq +
857 packet.rhqoff +
858 dd->rhf_offset;
859 packet.rhf = rhf_to_cpu(packet.rhf_addr);
860
861 } else {
862 last = process_rcv_packet(&packet);
863 }
864
865 if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
866 u32 seq = rhf_rcv_seq(packet.rhf);
867
868 if (++rcd->seq_cnt > 13)
869 rcd->seq_cnt = 1;
870 if (seq != rcd->seq_cnt)
871 last = 1;
872 if (needset) {
873 dd_dev_info(dd,
874 "Switching to NO_DMA_RTAIL\n");
875 set_all_nodma_rtail(dd);
876 needset = 0;
877 }
878 } else {
879 if (packet.rhqoff == hdrqtail)
880 last = 1;
881 if (needset) {
882 dd_dev_info(dd,
883 "Switching to DMA_RTAIL\n");
884 set_all_dma_rtail(dd);
885 needset = 0;
886 }
887 }
888
889 process_rcv_update(last, &packet);
890 }
891
892 process_rcv_qp_work(&packet);
893
894bail:
895 /*
896 * Always write head at end, and setup rcv interrupt, even
897 * if no packets were processed.
898 */
899 finish_packet(&packet);
900}
901
902/*
903 * Convert a given MTU size to the on-wire MAD packet enumeration.
904 * Return 'default_if_bad' if the size is invalid.
905 */
906int mtu_to_enum(u32 mtu, int default_if_bad)
907{
908 switch (mtu) {
909 case 0: return OPA_MTU_0;
910 case 256: return OPA_MTU_256;
911 case 512: return OPA_MTU_512;
912 case 1024: return OPA_MTU_1024;
913 case 2048: return OPA_MTU_2048;
914 case 4096: return OPA_MTU_4096;
915 case 8192: return OPA_MTU_8192;
916 case 10240: return OPA_MTU_10240;
917 }
918 return default_if_bad;
919}
920
921u16 enum_to_mtu(int mtu)
922{
923 switch (mtu) {
924 case OPA_MTU_0: return 0;
925 case OPA_MTU_256: return 256;
926 case OPA_MTU_512: return 512;
927 case OPA_MTU_1024: return 1024;
928 case OPA_MTU_2048: return 2048;
929 case OPA_MTU_4096: return 4096;
930 case OPA_MTU_8192: return 8192;
931 case OPA_MTU_10240: return 10240;
932 default: return 0xffff;
933 }
934}
935
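/*
 * Editor's note: a minimal standalone sketch (not part of this patch) of
 * the MTU <-> enumeration mapping pattern used by mtu_to_enum() and
 * enum_to_mtu() above, with a plain table standing in for the OPA_MTU_*
 * constants; sizes not in the table fall back to a caller-chosen default,
 * mirroring the 'default_if_bad' argument.
 */
#include <stdio.h>

static const unsigned int ex_mtus[] = { 0, 256, 512, 1024, 2048, 4096, 8192, 10240 };

static int ex_mtu_to_slot(unsigned int mtu, int default_if_bad)
{
	unsigned int i;

	for (i = 0; i < sizeof(ex_mtus) / sizeof(ex_mtus[0]); i++)
		if (ex_mtus[i] == mtu)
			return (int)i;
	return default_if_bad;
}

int main(void)
{
	/* 4096 is in the table; 5000 is not and yields the default (-1) */
	printf("%d %d\n", ex_mtu_to_slot(4096, -1), ex_mtu_to_slot(5000, -1));
	return 0;
}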
936/*
937 * set_mtu - set the MTU
938 * @ppd: the per port data
939 *
940 * We can handle "any" incoming size; the issue here is whether we
941 * need to restrict our outgoing size. We do not deal with what happens
942 * to programs that are already running when the size changes.
943 */
944int set_mtu(struct hfi1_pportdata *ppd)
945{
946 struct hfi1_devdata *dd = ppd->dd;
947 int i, drain, ret = 0, is_up = 0;
948
949 ppd->ibmtu = 0;
950 for (i = 0; i < ppd->vls_supported; i++)
951 if (ppd->ibmtu < dd->vld[i].mtu)
952 ppd->ibmtu = dd->vld[i].mtu;
953 ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
954
955 mutex_lock(&ppd->hls_lock);
956 if (ppd->host_link_state == HLS_UP_INIT
957 || ppd->host_link_state == HLS_UP_ARMED
958 || ppd->host_link_state == HLS_UP_ACTIVE)
959 is_up = 1;
960
961 drain = !is_ax(dd) && is_up;
962
963 if (drain)
964 /*
965 * MTU is specified per-VL. To ensure that no packet gets
966 * stuck (due, e.g., to the MTU for the packet's VL being
967 * reduced), empty the per-VL FIFOs before adjusting MTU.
968 */
969 ret = stop_drain_data_vls(dd);
970
971 if (ret) {
972 dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
973 __func__);
974 goto err;
975 }
976
977 hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
978
979 if (drain)
980 open_fill_data_vls(dd); /* reopen all VLs */
981
982err:
983 mutex_unlock(&ppd->hls_lock);
984
985 return ret;
986}
987
988int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
989{
990 struct hfi1_devdata *dd = ppd->dd;
991
992 ppd->lid = lid;
993 ppd->lmc = lmc;
994 hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
995
996 dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
997
998 return 0;
999}
1000
1001/*
1002 * The following deals with the "obviously simple" task of overriding the state
1003 * of the LEDs, which normally indicate link physical and logical status.
1004 * The complications arise in dealing with different hardware mappings
1005 * and the board-dependent routine being called from interrupts.
1006 * And then there's the requirement to _flash_ them.
1007 */
1008#define LED_OVER_FREQ_SHIFT 8
1009#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
1010/* Below is "non-zero" to force override, but both actual LEDs are off */
1011#define LED_OVER_BOTH_OFF (8)
1012
1013static void run_led_override(unsigned long opaque)
1014{
1015 struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
1016 struct hfi1_devdata *dd = ppd->dd;
1017 int timeoff;
1018 int ph_idx;
1019
1020 if (!(dd->flags & HFI1_INITTED))
1021 return;
1022
1023 ph_idx = ppd->led_override_phase++ & 1;
1024 ppd->led_override = ppd->led_override_vals[ph_idx];
1025 timeoff = ppd->led_override_timeoff;
1026
1027 /*
1028 * don't re-fire the timer if user asked for it to be off; we let
1029 * it fire one more time after they turn it off to simplify
1030 */
1031 if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
1032 mod_timer(&ppd->led_override_timer, jiffies + timeoff);
1033}
1034
1035void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val)
1036{
1037 struct hfi1_devdata *dd = ppd->dd;
1038 int timeoff, freq;
1039
1040 if (!(dd->flags & HFI1_INITTED))
1041 return;
1042
1043 /* First check if we are blinking. If not, use 1HZ polling */
1044 timeoff = HZ;
1045 freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
1046
1047 if (freq) {
1048 /* For blink, set each phase from one nybble of val */
1049 ppd->led_override_vals[0] = val & 0xF;
1050 ppd->led_override_vals[1] = (val >> 4) & 0xF;
1051 timeoff = (HZ << 4)/freq;
1052 } else {
1053 /* Non-blink set both phases the same. */
1054 ppd->led_override_vals[0] = val & 0xF;
1055 ppd->led_override_vals[1] = val & 0xF;
1056 }
1057 ppd->led_override_timeoff = timeoff;
1058
1059 /*
1060 * If the timer has not already been started, do so. Use a "quick"
1061 * timeout so the function will be called soon, to look at our request.
1062 */
1063 if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
1064 /* Need to start timer */
1065 init_timer(&ppd->led_override_timer);
1066 ppd->led_override_timer.function = run_led_override;
1067 ppd->led_override_timer.data = (unsigned long) ppd;
1068 ppd->led_override_timer.expires = jiffies + 1;
1069 add_timer(&ppd->led_override_timer);
1070 } else {
1071 if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
1072 mod_timer(&ppd->led_override_timer, jiffies + 1);
1073 atomic_dec(&ppd->led_override_timer_active);
1074 }
1075}
1076
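/*
 * Editor's note: an illustrative sketch (not part of this patch) of the LED
 * override encoding handled by hfi1_set_led_override() above: the low two
 * nybbles of 'val' give the two blink phases and bits 8..15 give the blink
 * frequency.  HZ is assumed to be 250 here only to make the arithmetic
 * concrete.
 */
#include <stdio.h>

#define EX_HZ 250u
#define EX_FREQ_SHIFT 8
#define EX_FREQ_MASK (0xffu << EX_FREQ_SHIFT)

int main(void)
{
	unsigned int val = (4u << EX_FREQ_SHIFT) | 0x1f;	/* freq 4, phases 0xf and 0x1 */
	unsigned int freq = (val & EX_FREQ_MASK) >> EX_FREQ_SHIFT;
	unsigned int phase0 = val & 0xf, phase1 = (val >> 4) & 0xf;
	unsigned int timeoff = freq ? (EX_HZ << 4) / freq : EX_HZ;

	printf("phase0=%#x phase1=%#x timeoff=%u jiffies\n", phase0, phase1, timeoff);
	return 0;
}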
1077/**
1078 * hfi1_reset_device - reset the chip if possible
1079 * @unit: the device to reset
1080 *
1081 * Whether or not reset is successful, we attempt to re-initialize the chip
1082 * (that is, much like a driver unload/reload). We clear the INITTED flag
1083 * so that the various entry points will fail until we reinitialize. For
1084 * now, we only allow this if no user contexts are open that use chip resources.
1085 */
1086int hfi1_reset_device(int unit)
1087{
1088 int ret, i;
1089 struct hfi1_devdata *dd = hfi1_lookup(unit);
1090 struct hfi1_pportdata *ppd;
1091 unsigned long flags;
1092 int pidx;
1093
1094 if (!dd) {
1095 ret = -ENODEV;
1096 goto bail;
1097 }
1098
1099 dd_dev_info(dd, "Reset on unit %u requested\n", unit);
1100
1101 if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
1102 dd_dev_info(dd,
1103 "Invalid unit number %u or not initialized or not present\n",
1104 unit);
1105 ret = -ENXIO;
1106 goto bail;
1107 }
1108
1109 spin_lock_irqsave(&dd->uctxt_lock, flags);
1110 if (dd->rcd)
1111 for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
1112 if (!dd->rcd[i] || !dd->rcd[i]->cnt)
1113 continue;
1114 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1115 ret = -EBUSY;
1116 goto bail;
1117 }
1118 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1119
1120 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1121 ppd = dd->pport + pidx;
1122 if (atomic_read(&ppd->led_override_timer_active)) {
1123 /* Need to stop LED timer, _then_ shut off LEDs */
1124 del_timer_sync(&ppd->led_override_timer);
1125 atomic_set(&ppd->led_override_timer_active, 0);
1126 }
1127
1128 /* Shut off LEDs after we are sure timer is not running */
1129 ppd->led_override = LED_OVER_BOTH_OFF;
1130 }
1131 if (dd->flags & HFI1_HAS_SEND_DMA)
1132 sdma_exit(dd);
1133
1134 hfi1_reset_cpu_counters(dd);
1135
1136 ret = hfi1_init(dd, 1);
1137
1138 if (ret)
1139 dd_dev_err(dd,
1140 "Reinitialize unit %u after reset failed with %d\n",
1141 unit, ret);
1142 else
1143 dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
1144 unit);
1145
1146bail:
1147 return ret;
1148}
1149
1150void handle_eflags(struct hfi1_packet *packet)
1151{
1152 struct hfi1_ctxtdata *rcd = packet->rcd;
1153 u32 rte = rhf_rcv_type_err(packet->rhf);
1154
1155 dd_dev_err(rcd->dd,
1156 "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
1157 rcd->ctxt, packet->rhf,
1158 packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
1159 packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
1160 packet->rhf & RHF_DC_ERR ? "dc " : "",
1161 packet->rhf & RHF_TID_ERR ? "tid " : "",
1162 packet->rhf & RHF_LEN_ERR ? "len " : "",
1163 packet->rhf & RHF_ECC_ERR ? "ecc " : "",
1164 packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
1165 packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
1166 rte);
1167
1168 rcv_hdrerr(rcd, rcd->ppd, packet);
1169}
1170
1171/*
1172 * The following functions are called by the interrupt handler. They are type
1173 * specific handlers for each packet type.
1174 */
1175int process_receive_ib(struct hfi1_packet *packet)
1176{
1177 trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
1178 packet->rcd->ctxt,
1179 rhf_err_flags(packet->rhf),
1180 RHF_RCV_TYPE_IB,
1181 packet->hlen,
1182 packet->tlen,
1183 packet->updegr,
1184 rhf_egr_index(packet->rhf));
1185
1186 if (unlikely(rhf_err_flags(packet->rhf))) {
1187 handle_eflags(packet);
1188 return RHF_RCV_CONTINUE;
1189 }
1190
1191 hfi1_ib_rcv(packet);
1192 return RHF_RCV_CONTINUE;
1193}
1194
1195int process_receive_bypass(struct hfi1_packet *packet)
1196{
1197 if (unlikely(rhf_err_flags(packet->rhf)))
1198 handle_eflags(packet);
1199
1200 dd_dev_err(packet->rcd->dd,
1201 "Bypass packets are not supported in normal operation. Dropping\n");
1202 return RHF_RCV_CONTINUE;
1203}
1204
1205int process_receive_error(struct hfi1_packet *packet)
1206{
1207 handle_eflags(packet);
1208
1209 if (unlikely(rhf_err_flags(packet->rhf)))
1210 dd_dev_err(packet->rcd->dd,
1211 "Unhandled error packet received. Dropping.\n");
1212
1213 return RHF_RCV_CONTINUE;
1214}
1215
1216int kdeth_process_expected(struct hfi1_packet *packet)
1217{
1218 if (unlikely(rhf_err_flags(packet->rhf)))
1219 handle_eflags(packet);
1220
1221 dd_dev_err(packet->rcd->dd,
1222 "Unhandled expected packet received. Dropping.\n");
1223 return RHF_RCV_CONTINUE;
1224}
1225
1226int kdeth_process_eager(struct hfi1_packet *packet)
1227{
1228 if (unlikely(rhf_err_flags(packet->rhf)))
1229 handle_eflags(packet);
1230
1231 dd_dev_err(packet->rcd->dd,
1232 "Unhandled eager packet received. Dropping.\n");
1233 return RHF_RCV_CONTINUE;
1234}
1235
1236int process_receive_invalid(struct hfi1_packet *packet)
1237{
1238 dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
1239 rhf_rcv_type(packet->rhf));
1240 return RHF_RCV_CONTINUE;
1241}
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
new file mode 100644
index 000000000000..b61d3ae93ed1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.c
@@ -0,0 +1,475 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/delay.h>
51#include "hfi.h"
52#include "common.h"
53#include "eprom.h"
54
55/*
56 * The EPROM is logically divided into two partitions:
57 * partition 0: the first 128K, visible from PCI ROM BAR
58 * partition 1: the rest
59 */
60#define P0_SIZE (128 * 1024)
61#define P1_START P0_SIZE
62
63/* largest erase size supported by the controller */
64#define SIZE_32KB (32 * 1024)
65#define MASK_32KB (SIZE_32KB - 1)
66
67/* controller page size, in bytes */
68#define EP_PAGE_SIZE 256
69#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
70
71/* controller commands */
72#define CMD_SHIFT 24
73#define CMD_NOP (0)
74#define CMD_PAGE_PROGRAM(addr) ((0x02 << CMD_SHIFT) | addr)
75#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr)
76#define CMD_READ_SR1 ((0x05 << CMD_SHIFT))
77#define CMD_WRITE_ENABLE ((0x06 << CMD_SHIFT))
78#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
79#define CMD_CHIP_ERASE ((0x60 << CMD_SHIFT))
80#define CMD_READ_MANUF_DEV_ID ((0x90 << CMD_SHIFT))
81#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT))
82
83/* controller interface speeds */
84#define EP_SPEED_FULL 0x2 /* full speed */
85
86/* controller status register 1 bits */
87#define SR1_BUSY 0x1ull /* the BUSY bit in SR1 */
88
89/* sleep length while waiting for controller */
90#define WAIT_SLEEP_US 100 /* must be larger than 5 (see usage) */
91#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US))
92
93/* GPIO pins */
94#define EPROM_WP_N (1ull << 14) /* EPROM write line */
95
96/*
97 * Use the EP mutex to guard against other callers from within the driver.
98 * Also covers usage of eprom_available.
99 */
100static DEFINE_MUTEX(eprom_mutex);
101static int eprom_available; /* default: not available */
102
103/*
104 * Turn on external enable line that allows writing on the flash.
105 */
106static void write_enable(struct hfi1_devdata *dd)
107{
108 /* raise signal */
109 write_csr(dd, ASIC_GPIO_OUT,
110 read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
111 /* raise enable */
112 write_csr(dd, ASIC_GPIO_OE,
113 read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
114}
115
116/*
117 * Turn off external enable line that allows writing on the flash.
118 */
119static void write_disable(struct hfi1_devdata *dd)
120{
121 /* lower signal */
122 write_csr(dd, ASIC_GPIO_OUT,
123 read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
124 /* lower enable */
125 write_csr(dd, ASIC_GPIO_OE,
126 read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
127}
128
129/*
130 * Wait for the device to become not busy. Must be called after all
131 * write or erase operations.
132 */
133static int wait_for_not_busy(struct hfi1_devdata *dd)
134{
135 unsigned long count = 0;
136 u64 reg;
137 int ret = 0;
138
139 /* starts page mode */
140 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
141 while (1) {
142 udelay(WAIT_SLEEP_US);
143 usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
144 count++;
145 reg = read_csr(dd, ASIC_EEP_DATA);
146 if ((reg & SR1_BUSY) == 0)
147 break;
148 /* 200s is the largest time for a 128Mb device */
149 if (count > COUNT_DELAY_SEC(200)) {
150 dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
151 ret = -ETIMEDOUT;
152 break; /* break, not goto - must stop page mode */
153 }
154 }
155
156 /* stop page mode with a NOP */
157 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
158
159 return ret;
160}
161
162/*
163 * Read the device ID from the SPI controller.
164 */
165static u32 read_device_id(struct hfi1_devdata *dd)
166{
167 /* read the Manufacture Device ID */
168 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
169 return (u32)read_csr(dd, ASIC_EEP_DATA);
170}
171
172/*
173 * Erase the whole flash.
174 */
175static int erase_chip(struct hfi1_devdata *dd)
176{
177 int ret;
178
179 write_enable(dd);
180
181 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
182 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
183 ret = wait_for_not_busy(dd);
184
185 write_disable(dd);
186
187 return ret;
188}
189
190/*
191 * Erase a range using the 32KB erase command.
192 */
193static int erase_32kb_range(struct hfi1_devdata *dd, u32 start, u32 end)
194{
195 int ret = 0;
196
197 if (end < start)
198 return -EINVAL;
199
200 if ((start & MASK_32KB) || (end & MASK_32KB)) {
201 dd_dev_err(dd,
202 "%s: non-aligned range (0x%x,0x%x) for a 32KB erase\n",
203 __func__, start, end);
204 return -EINVAL;
205 }
206
207 write_enable(dd);
208
209 for (; start < end; start += SIZE_32KB) {
210 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
211 write_csr(dd, ASIC_EEP_ADDR_CMD,
212 CMD_SECTOR_ERASE_32KB(start));
213 ret = wait_for_not_busy(dd);
214 if (ret)
215 goto done;
216 }
217
218done:
219 write_disable(dd);
220
221 return ret;
222}
223
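/*
 * Editor's note: a standalone sketch (not part of this patch) of the 32 KB
 * sector walk in erase_32kb_range() above: both ends of the range must be
 * 32 KB aligned and the range is then covered one sector at a time.
 */
#include <stdio.h>

#define EX_SIZE_32KB (32u * 1024u)
#define EX_MASK_32KB (EX_SIZE_32KB - 1)

static int ex_erase_range(unsigned int start, unsigned int end)
{
	if (end < start || (start & EX_MASK_32KB) || (end & EX_MASK_32KB))
		return -1;				/* reject mis-aligned ranges */

	for (; start < end; start += EX_SIZE_32KB)
		printf("erase sector at 0x%x\n", start);	/* stand-in for the CSR writes */
	return 0;
}

int main(void)
{
	return ex_erase_range(0, 128 * 1024) ? 1 : 0;	/* partition 0: four sectors */
}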
224/*
225 * Read a 256 byte (64 dword) EPROM page.
226 * All callers have verified the offset is at a page boundary.
227 */
228static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
229{
230 int i;
231
232 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
233 for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++)
234 result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
235 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
236}
237
238/*
239 * Read length bytes starting at offset. Copy to user address addr.
240 */
241static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
242{
243 u32 offset;
244 u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
245 int ret = 0;
246
247 /* reject anything not on an EPROM page boundary */
248 if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
249 return -EINVAL;
250
251 for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
252 read_page(dd, start + offset, buffer);
253 if (copy_to_user((void __user *)(addr + offset),
254 buffer, EP_PAGE_SIZE)) {
255 ret = -EFAULT;
256 goto done;
257 }
258 }
259
260done:
261 return ret;
262}
263
264/*
265 * Write a 256 byte (64 dword) EPROM page.
266 * All callers have verified the offset is at a page boundary.
267 */
268static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
269{
270 int i;
271
272 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
273 write_csr(dd, ASIC_EEP_DATA, data[0]);
274 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
275 for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++)
276 write_csr(dd, ASIC_EEP_DATA, data[i]);
277 /* will close the open page */
278 return wait_for_not_busy(dd);
279}
280
281/*
282 * Write length bytes starting at offset. Read from user address addr.
283 */
284static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
285{
286 u32 offset;
287 u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
288 int ret = 0;
289
290 /* reject anything not on an EPROM page boundary */
291 if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
292 return -EINVAL;
293
294 write_enable(dd);
295
296 for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
297 if (copy_from_user(buffer, (void __user *)(addr + offset),
298 EP_PAGE_SIZE)) {
299 ret = -EFAULT;
300 goto done;
301 }
302 ret = write_page(dd, start + offset, buffer);
303 if (ret)
304 goto done;
305 }
306
307done:
308 write_disable(dd);
309 return ret;
310}
311
312/*
313 * Perform the given operation on the EPROM. Called from user space. The
314 * user credentials have already been checked.
315 *
316 * Return 0 on success, -ERRNO on error
317 */
318int handle_eprom_command(const struct hfi1_cmd *cmd)
319{
320 struct hfi1_devdata *dd;
321 u32 dev_id;
322 int ret = 0;
323
324 /*
325 * The EPROM is per-device, so use unit 0 as that will always
326 * exist.
327 */
328 dd = hfi1_lookup(0);
329 if (!dd) {
330 pr_err("%s: cannot find unit 0!\n", __func__);
331 return -EINVAL;
332 }
333
334 /* lock against other callers touching the ASIC block */
335 mutex_lock(&eprom_mutex);
336
337 /* some platforms do not have an EPROM */
338 if (!eprom_available) {
339 ret = -ENOSYS;
340 goto done_asic;
341 }
342
343 /* lock against the other HFI on another OS */
344 ret = acquire_hw_mutex(dd);
345 if (ret) {
346 dd_dev_err(dd,
347 "%s: unable to acquire hw mutex, no EPROM support\n",
348 __func__);
349 goto done_asic;
350 }
351
352 dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
353 __func__, cmd->type, cmd->len, cmd->addr);
354
355 switch (cmd->type) {
356 case HFI1_CMD_EP_INFO:
357 if (cmd->len != sizeof(u32)) {
358 ret = -ERANGE;
359 break;
360 }
361 dev_id = read_device_id(dd);
362 /* addr points to a u32 user buffer */
363 if (copy_to_user((void __user *)cmd->addr, &dev_id,
364 sizeof(u32)))
365 ret = -EFAULT;
366 break;
367 case HFI1_CMD_EP_ERASE_CHIP:
368 ret = erase_chip(dd);
369 break;
370 case HFI1_CMD_EP_ERASE_P0:
371 if (cmd->len != P0_SIZE) {
372 ret = -ERANGE;
373 break;
374 }
375 ret = erase_32kb_range(dd, 0, cmd->len);
376 break;
377 case HFI1_CMD_EP_ERASE_P1:
378 /* check for overflow */
379 if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
380 ret = -ERANGE;
381 break;
382 }
383 ret = erase_32kb_range(dd, P1_START, P1_START + cmd->len);
384 break;
385 case HFI1_CMD_EP_READ_P0:
386 if (cmd->len != P0_SIZE) {
387 ret = -ERANGE;
388 break;
389 }
390 ret = read_length(dd, 0, cmd->len, cmd->addr);
391 break;
392 case HFI1_CMD_EP_READ_P1:
393 /* check for overflow */
394 if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
395 ret = -ERANGE;
396 break;
397 }
398 ret = read_length(dd, P1_START, cmd->len, cmd->addr);
399 break;
400 case HFI1_CMD_EP_WRITE_P0:
401 if (cmd->len > P0_SIZE) {
402 ret = -ERANGE;
403 break;
404 }
405 ret = write_length(dd, 0, cmd->len, cmd->addr);
406 break;
407 case HFI1_CMD_EP_WRITE_P1:
408 /* check for overflow */
409 if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
410 ret = -ERANGE;
411 break;
412 }
413 ret = write_length(dd, P1_START, cmd->len, cmd->addr);
414 break;
415 default:
416 dd_dev_err(dd, "%s: unexpected command %d\n",
417 __func__, cmd->type);
418 ret = -EINVAL;
419 break;
420 }
421
422 release_hw_mutex(dd);
423done_asic:
424 mutex_unlock(&eprom_mutex);
425 return ret;
426}
427
428/*
429 * Initialize the EPROM handler.
430 */
431int eprom_init(struct hfi1_devdata *dd)
432{
433 int ret = 0;
434
435 /* only the discrete chip has an EPROM, nothing to do */
436 if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
437 return 0;
438
439 /* lock against other callers */
440 mutex_lock(&eprom_mutex);
441 if (eprom_available) /* already initialized */
442 goto done_asic;
443
444 /*
445 * Lock against the other HFI on another OS - the mutex above
446 * would have caught anything in this driver. It is OK if
447 * both OSes reset the EPROM - as long as they don't do it at
448 * the same time.
449 */
450 ret = acquire_hw_mutex(dd);
451 if (ret) {
452 dd_dev_err(dd,
453 "%s: unable to acquire hw mutex, no EPROM support\n",
454 __func__);
455 goto done_asic;
456 }
457
458 /* reset EPROM to be sure it is in a good state */
459
460 /* set reset */
461 write_csr(dd, ASIC_EEP_CTL_STAT,
462 ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
463 /* clear reset, set speed */
464 write_csr(dd, ASIC_EEP_CTL_STAT,
465 EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
466
467 /* wake the device with command "release powerdown NoID" */
468 write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
469
470 eprom_available = 1;
471 release_hw_mutex(dd);
472done_asic:
473 mutex_unlock(&eprom_mutex);
474 return ret;
475}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
new file mode 100644
index 000000000000..64a64276be81
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.h
@@ -0,0 +1,55 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51struct hfi1_cmd;
52struct hfi1_devdata;
53
54int eprom_init(struct hfi1_devdata *dd);
55int handle_eprom_command(const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
new file mode 100644
index 000000000000..469861750b76
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -0,0 +1,2140 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/pci.h>
51#include <linux/poll.h>
52#include <linux/cdev.h>
53#include <linux/swap.h>
54#include <linux/vmalloc.h>
55#include <linux/highmem.h>
56#include <linux/io.h>
57#include <linux/jiffies.h>
58#include <asm/pgtable.h>
59#include <linux/delay.h>
60#include <linux/export.h>
61#include <linux/module.h>
62#include <linux/cred.h>
63#include <linux/uio.h>
64
65#include "hfi.h"
66#include "pio.h"
67#include "device.h"
68#include "common.h"
69#include "trace.h"
70#include "user_sdma.h"
71#include "eprom.h"
72
73#undef pr_fmt
74#define pr_fmt(fmt) DRIVER_NAME ": " fmt
75
76#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
77
78/*
79 * File operation functions
80 */
81static int hfi1_file_open(struct inode *, struct file *);
82static int hfi1_file_close(struct inode *, struct file *);
83static ssize_t hfi1_file_write(struct file *, const char __user *,
84 size_t, loff_t *);
85static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
86static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
87static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
88
89static u64 kvirt_to_phys(void *);
90static int assign_ctxt(struct file *, struct hfi1_user_info *);
91static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
92static int user_init(struct file *);
93static int get_ctxt_info(struct file *, void __user *, __u32);
94static int get_base_info(struct file *, void __user *, __u32);
95static int setup_ctxt(struct file *);
96static int setup_subctxt(struct hfi1_ctxtdata *);
97static int get_user_context(struct file *, struct hfi1_user_info *,
98 int, unsigned);
99static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
100static int allocate_ctxt(struct file *, struct hfi1_devdata *,
101 struct hfi1_user_info *);
102static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
103static unsigned int poll_next(struct file *, struct poll_table_struct *);
104static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
105static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
106static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
107static int vma_fault(struct vm_area_struct *, struct vm_fault *);
108static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
109static int exp_tid_free(struct file *, struct hfi1_tid_info *);
110static void unlock_exp_tids(struct hfi1_ctxtdata *);
111
112static const struct file_operations hfi1_file_ops = {
113 .owner = THIS_MODULE,
114 .write = hfi1_file_write,
115 .write_iter = hfi1_write_iter,
116 .open = hfi1_file_open,
117 .release = hfi1_file_close,
118 .poll = hfi1_poll,
119 .mmap = hfi1_file_mmap,
120 .llseek = noop_llseek,
121};
122
123static struct vm_operations_struct vm_ops = {
124 .fault = vma_fault,
125};
126
127/*
128 * Types of memories mapped into user processes' space
129 */
130enum mmap_types {
131 PIO_BUFS = 1,
132 PIO_BUFS_SOP,
133 PIO_CRED,
134 RCV_HDRQ,
135 RCV_EGRBUF,
136 UREGS,
137 EVENTS,
138 STATUS,
139 RTAIL,
140 SUBCTXT_UREGS,
141 SUBCTXT_RCV_HDRQ,
142 SUBCTXT_EGRBUF,
143 SDMA_COMP
144};
145
146/*
147 * Masks and offsets defining the mmap tokens
148 */
149#define HFI1_MMAP_OFFSET_MASK 0xfffULL
150#define HFI1_MMAP_OFFSET_SHIFT 0
151#define HFI1_MMAP_SUBCTXT_MASK 0xfULL
152#define HFI1_MMAP_SUBCTXT_SHIFT 12
153#define HFI1_MMAP_CTXT_MASK 0xffULL
154#define HFI1_MMAP_CTXT_SHIFT 16
155#define HFI1_MMAP_TYPE_MASK 0xfULL
156#define HFI1_MMAP_TYPE_SHIFT 24
157#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL
158#define HFI1_MMAP_MAGIC_SHIFT 32
159
160#define HFI1_MMAP_MAGIC 0xdabbad00
161
162#define HFI1_MMAP_TOKEN_SET(field, val) \
163 (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
164#define HFI1_MMAP_TOKEN_GET(field, token) \
165 (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
166#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \
167 (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
168 HFI1_MMAP_TOKEN_SET(TYPE, type) | \
169 HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
170 HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
171 HFI1_MMAP_TOKEN_SET(OFFSET, ((unsigned long)addr & ~PAGE_MASK)))
172
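/*
 * Editor's note: a self-contained sketch (not part of this patch) showing
 * how the mmap token layout above packs the magic, type, context, subcontext
 * and page-offset fields into one 64-bit value and how a receiver decodes
 * them again.  The example field values are hypothetical.
 */
#include <stdio.h>

#define EX_OFFSET_MASK   0xfffULL
#define EX_OFFSET_SHIFT  0
#define EX_SUBCTXT_MASK  0xfULL
#define EX_SUBCTXT_SHIFT 12
#define EX_CTXT_MASK     0xffULL
#define EX_CTXT_SHIFT    16
#define EX_TYPE_MASK     0xfULL
#define EX_TYPE_SHIFT    24
#define EX_MAGIC_MASK    0xffffffffULL
#define EX_MAGIC_SHIFT   32
#define EX_MAGIC         0xdabbad00ULL

#define EX_SET(val, mask, shift) (((val) & (mask)) << (shift))
#define EX_GET(tok, mask, shift) (((tok) >> (shift)) & (mask))

int main(void)
{
	/* hypothetical values: type 4 (RCV_HDRQ), ctxt 3, subctxt 1, offset 0x80 */
	unsigned long long tok =
		EX_SET(EX_MAGIC, EX_MAGIC_MASK, EX_MAGIC_SHIFT) |
		EX_SET(4ULL, EX_TYPE_MASK, EX_TYPE_SHIFT) |
		EX_SET(3ULL, EX_CTXT_MASK, EX_CTXT_SHIFT) |
		EX_SET(1ULL, EX_SUBCTXT_MASK, EX_SUBCTXT_SHIFT) |
		EX_SET(0x80ULL, EX_OFFSET_MASK, EX_OFFSET_SHIFT);

	printf("token 0x%llx: magic ok=%d type=%llu ctxt=%llu subctxt=%llu off=0x%llx\n",
	       tok,
	       EX_GET(tok, EX_MAGIC_MASK, EX_MAGIC_SHIFT) == EX_MAGIC,
	       EX_GET(tok, EX_TYPE_MASK, EX_TYPE_SHIFT),
	       EX_GET(tok, EX_CTXT_MASK, EX_CTXT_SHIFT),
	       EX_GET(tok, EX_SUBCTXT_MASK, EX_SUBCTXT_SHIFT),
	       EX_GET(tok, EX_OFFSET_MASK, EX_OFFSET_SHIFT));
	return 0;
}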
173#define EXP_TID_SET(field, value) \
174 (((value) & EXP_TID_TID##field##_MASK) << \
175 EXP_TID_TID##field##_SHIFT)
176#define EXP_TID_CLEAR(tid, field) { \
177 (tid) &= ~(EXP_TID_TID##field##_MASK << \
178 EXP_TID_TID##field##_SHIFT); \
179 }
180#define EXP_TID_RESET(tid, field, value) do { \
181 EXP_TID_CLEAR(tid, field); \
182 (tid) |= EXP_TID_SET(field, value); \
183 } while (0)
184
185#define dbg(fmt, ...) \
186 pr_info(fmt, ##__VA_ARGS__)
187
188
189static inline int is_valid_mmap(u64 token)
190{
191 return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
192}
193
194static int hfi1_file_open(struct inode *inode, struct file *fp)
195{
196 /* The real work is performed later in assign_ctxt() */
197 fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
198 if (fp->private_data) /* no cpu affinity by default */
199 ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
200 return fp->private_data ? 0 : -ENOMEM;
201}
202
203static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
204 size_t count, loff_t *offset)
205{
206 const struct hfi1_cmd __user *ucmd;
207 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
208 struct hfi1_cmd cmd;
209 struct hfi1_user_info uinfo;
210 struct hfi1_tid_info tinfo;
211 ssize_t consumed = 0, copy = 0, ret = 0;
212 void *dest = NULL;
213 __u64 user_val = 0;
214 int uctxt_required = 1;
215 int must_be_root = 0;
216
217 if (count < sizeof(cmd)) {
218 ret = -EINVAL;
219 goto bail;
220 }
221
222 ucmd = (const struct hfi1_cmd __user *)data;
223 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
224 ret = -EFAULT;
225 goto bail;
226 }
227
228 consumed = sizeof(cmd);
229
230 switch (cmd.type) {
231 case HFI1_CMD_ASSIGN_CTXT:
232 uctxt_required = 0; /* assigned user context not required */
233 copy = sizeof(uinfo);
234 dest = &uinfo;
235 break;
236 case HFI1_CMD_SDMA_STATUS_UPD:
237 case HFI1_CMD_CREDIT_UPD:
238 copy = 0;
239 break;
240 case HFI1_CMD_TID_UPDATE:
241 case HFI1_CMD_TID_FREE:
242 copy = sizeof(tinfo);
243 dest = &tinfo;
244 break;
245 case HFI1_CMD_USER_INFO:
246 case HFI1_CMD_RECV_CTRL:
247 case HFI1_CMD_POLL_TYPE:
248 case HFI1_CMD_ACK_EVENT:
249 case HFI1_CMD_CTXT_INFO:
250 case HFI1_CMD_SET_PKEY:
251 case HFI1_CMD_CTXT_RESET:
252 copy = 0;
253 user_val = cmd.addr;
254 break;
255 case HFI1_CMD_EP_INFO:
256 case HFI1_CMD_EP_ERASE_CHIP:
257 case HFI1_CMD_EP_ERASE_P0:
258 case HFI1_CMD_EP_ERASE_P1:
259 case HFI1_CMD_EP_READ_P0:
260 case HFI1_CMD_EP_READ_P1:
261 case HFI1_CMD_EP_WRITE_P0:
262 case HFI1_CMD_EP_WRITE_P1:
263 uctxt_required = 0; /* assigned user context not required */
264 must_be_root = 1; /* validate user */
265 copy = 0;
266 break;
267 default:
268 ret = -EINVAL;
269 goto bail;
270 }
271
272 /* If the command comes with user data, copy it. */
273 if (copy) {
274 if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
275 ret = -EFAULT;
276 goto bail;
277 }
278 consumed += copy;
279 }
280
281 /*
282 * Make sure there is a uctxt when needed.
283 */
284 if (uctxt_required && !uctxt) {
285 ret = -EINVAL;
286 goto bail;
287 }
288
289 /* only root can do these operations */
290 if (must_be_root && !capable(CAP_SYS_ADMIN)) {
291 ret = -EPERM;
292 goto bail;
293 }
294
295 switch (cmd.type) {
296 case HFI1_CMD_ASSIGN_CTXT:
297 ret = assign_ctxt(fp, &uinfo);
298 if (ret < 0)
299 goto bail;
300 ret = setup_ctxt(fp);
301 if (ret)
302 goto bail;
303 ret = user_init(fp);
304 break;
305 case HFI1_CMD_CTXT_INFO:
306 ret = get_ctxt_info(fp, (void __user *)(unsigned long)
307 user_val, cmd.len);
308 break;
309 case HFI1_CMD_USER_INFO:
310 ret = get_base_info(fp, (void __user *)(unsigned long)
311 user_val, cmd.len);
312 break;
313 case HFI1_CMD_SDMA_STATUS_UPD:
314 break;
315 case HFI1_CMD_CREDIT_UPD:
316 if (uctxt && uctxt->sc)
317 sc_return_credits(uctxt->sc);
318 break;
319 case HFI1_CMD_TID_UPDATE:
320 ret = exp_tid_setup(fp, &tinfo);
321 if (!ret) {
322 unsigned long addr;
323 /*
324 * Copy the number of tidlist entries we used
325 * and the length of the buffer we registered.
326 * These fields are adjacent in the structure so
327 * we can copy them at the same time.
328 */
329 addr = (unsigned long)cmd.addr +
330 offsetof(struct hfi1_tid_info, tidcnt);
331 if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
332 sizeof(tinfo.tidcnt) +
333 sizeof(tinfo.length)))
334 ret = -EFAULT;
335 }
336 break;
337 case HFI1_CMD_TID_FREE:
338 ret = exp_tid_free(fp, &tinfo);
339 break;
340 case HFI1_CMD_RECV_CTRL:
341 ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val);
342 break;
343 case HFI1_CMD_POLL_TYPE:
344 uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
345 break;
346 case HFI1_CMD_ACK_EVENT:
347 ret = user_event_ack(uctxt, subctxt_fp(fp), user_val);
348 break;
349 case HFI1_CMD_SET_PKEY:
350 if (HFI1_CAP_IS_USET(PKEY_CHECK))
351 ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val);
352 else
353 ret = -EPERM;
354 break;
355 case HFI1_CMD_CTXT_RESET: {
356 struct send_context *sc;
357 struct hfi1_devdata *dd;
358
359 if (!uctxt || !uctxt->dd || !uctxt->sc) {
360 ret = -EINVAL;
361 break;
362 }
363 /*
364 * There is no protection here. User level has to
365 * guarantee that no one will be writing to the send
366 * context while it is being re-initialized.
367 * If user level breaks that guarantee, it will break
369	 * its own context and no one else's.
369 */
370 dd = uctxt->dd;
371 sc = uctxt->sc;
372 /*
373 * Wait until the interrupt handler has marked the
374 * context as halted or frozen. Report error if we time
375 * out.
376 */
377 wait_event_interruptible_timeout(
378 sc->halt_wait, (sc->flags & SCF_HALTED),
379 msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
380 if (!(sc->flags & SCF_HALTED)) {
381 ret = -ENOLCK;
382 break;
383 }
384 /*
385 * If the send context was halted due to a Freeze,
386 * wait until the device has been "unfrozen" before
387 * resetting the context.
388 */
389 if (sc->flags & SCF_FROZEN) {
390 wait_event_interruptible_timeout(
391 dd->event_queue,
392 !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
393 msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
394 if (dd->flags & HFI1_FROZEN) {
395 ret = -ENOLCK;
396 break;
397 }
398 if (dd->flags & HFI1_FORCED_FREEZE) {
399				/* Don't allow context reset if we are in a
400				 * forced freeze */
401 ret = -ENODEV;
402 break;
403 }
404 sc_disable(sc);
405 ret = sc_enable(sc);
406 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
407 uctxt->ctxt);
408 } else
409 ret = sc_restart(sc);
410 if (!ret)
411 sc_return_credits(sc);
412 break;
413 }
414 case HFI1_CMD_EP_INFO:
415 case HFI1_CMD_EP_ERASE_CHIP:
416 case HFI1_CMD_EP_ERASE_P0:
417 case HFI1_CMD_EP_ERASE_P1:
418 case HFI1_CMD_EP_READ_P0:
419 case HFI1_CMD_EP_READ_P1:
420 case HFI1_CMD_EP_WRITE_P0:
421 case HFI1_CMD_EP_WRITE_P1:
422 ret = handle_eprom_command(&cmd);
423 break;
424 }
425
426 if (ret >= 0)
427 ret = consumed;
428bail:
429 return ret;
430}
431
432static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
433{
434 struct hfi1_user_sdma_pkt_q *pq;
435 struct hfi1_user_sdma_comp_q *cq;
436 int ret = 0, done = 0, reqs = 0;
437 unsigned long dim = from->nr_segs;
438
439 if (!user_sdma_comp_fp(kiocb->ki_filp) ||
440 !user_sdma_pkt_fp(kiocb->ki_filp)) {
441 ret = -EIO;
442 goto done;
443 }
444
445 if (!iter_is_iovec(from) || !dim) {
446 ret = -EINVAL;
447 goto done;
448 }
449
450 hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
451 ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp),
452 dim);
453 pq = user_sdma_pkt_fp(kiocb->ki_filp);
454 cq = user_sdma_comp_fp(kiocb->ki_filp);
455
456 if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
457 ret = -ENOSPC;
458 goto done;
459 }
460
461 while (dim) {
462 unsigned long count = 0;
463
464 ret = hfi1_user_sdma_process_request(
465 kiocb->ki_filp, (struct iovec *)(from->iov + done),
466 dim, &count);
467 if (ret)
468 goto done;
469 dim -= count;
470 done += count;
471 reqs++;
472 }
473done:
474 return ret ? ret : reqs;
475}
476
477static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
478{
479 struct hfi1_ctxtdata *uctxt;
480 struct hfi1_devdata *dd;
481 unsigned long flags, pfn;
482 u64 token = vma->vm_pgoff << PAGE_SHIFT,
483 memaddr = 0;
484 u8 subctxt, mapio = 0, vmf = 0, type;
485 ssize_t memlen = 0;
486 int ret = 0;
487 u16 ctxt;
488
489 uctxt = ctxt_fp(fp);
490 if (!is_valid_mmap(token) || !uctxt ||
491 !(vma->vm_flags & VM_SHARED)) {
492 ret = -EINVAL;
493 goto done;
494 }
495 dd = uctxt->dd;
496 ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
497 subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
498 type = HFI1_MMAP_TOKEN_GET(TYPE, token);
499 if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) {
500 ret = -EINVAL;
501 goto done;
502 }
503
504 flags = vma->vm_flags;
505
506 switch (type) {
507 case PIO_BUFS:
508 case PIO_BUFS_SOP:
509 memaddr = ((dd->physaddr + TXE_PIO_SEND) +
510 /* chip pio base */
511 (uctxt->sc->hw_context * (1 << 16))) +
512 /* 64K PIO space / ctxt */
513 (type == PIO_BUFS_SOP ?
514 (TXE_PIO_SIZE / 2) : 0); /* sop? */
515 /*
516		 * Map only the amount allocated to the context, not the
517		 * entire PIO space available to the context.
518 */
519 memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE,
520 PAGE_SIZE);
521 flags &= ~VM_MAYREAD;
522 flags |= VM_DONTCOPY | VM_DONTEXPAND;
523 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
524 mapio = 1;
525 break;
526 case PIO_CRED:
527 if (flags & VM_WRITE) {
528 ret = -EPERM;
529 goto done;
530 }
531 /*
532 * The credit return location for this context could be on the
533 * second or third page allocated for credit returns (if number
534 * of enabled contexts > 64 and 128 respectively).
535 */
536 memaddr = dd->cr_base[uctxt->numa_id].pa +
537 (((u64)uctxt->sc->hw_free -
538 (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
539 memlen = PAGE_SIZE;
540 flags &= ~VM_MAYWRITE;
541 flags |= VM_DONTCOPY | VM_DONTEXPAND;
542 /*
543 * The driver has already allocated memory for credit
544 * returns and programmed it into the chip. Has that
545 * memory been flagged as non-cached?
546 */
547 /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
548 mapio = 1;
549 break;
550 case RCV_HDRQ:
551 memaddr = uctxt->rcvhdrq_phys;
552 memlen = uctxt->rcvhdrq_size;
553 break;
554 case RCV_EGRBUF: {
555 unsigned long addr;
556 int i;
557 /*
558		 * The RcvEgr buffers need to be handled differently
559 * as multiple non-contiguous pages need to be mapped
560 * into the user process.
561 */
562 memlen = uctxt->egrbufs.size;
563 if ((vma->vm_end - vma->vm_start) != memlen) {
564 dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
565 (vma->vm_end - vma->vm_start), memlen);
566 ret = -EINVAL;
567 goto done;
568 }
569 if (vma->vm_flags & VM_WRITE) {
570 ret = -EPERM;
571 goto done;
572 }
573 vma->vm_flags &= ~VM_MAYWRITE;
574 addr = vma->vm_start;
575 for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
576 ret = remap_pfn_range(
577 vma, addr,
578 uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
579 uctxt->egrbufs.buffers[i].len,
580 vma->vm_page_prot);
581 if (ret < 0)
582 goto done;
583 addr += uctxt->egrbufs.buffers[i].len;
584 }
585 ret = 0;
586 goto done;
587 }
588 case UREGS:
589 /*
590 * Map only the page that contains this context's user
591 * registers.
592 */
593 memaddr = (unsigned long)
594 (dd->physaddr + RXE_PER_CONTEXT_USER)
595 + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
596 /*
597 * TidFlow table is on the same page as the rest of the
598 * user registers.
599 */
600 memlen = PAGE_SIZE;
601 flags |= VM_DONTCOPY | VM_DONTEXPAND;
602 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
603 mapio = 1;
604 break;
605 case EVENTS:
606 /*
607 * Use the page where this context's flags are. User level
608		 * knows where its own bitmap is within the page.
609 */
610 memaddr = ((unsigned long)dd->events +
611 ((uctxt->ctxt - dd->first_user_ctxt) *
612 HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
613 memlen = PAGE_SIZE;
614 /*
615 * v3.7 removes VM_RESERVED but the effect is kept by
616 * using VM_IO.
617 */
618 flags |= VM_IO | VM_DONTEXPAND;
619 vmf = 1;
620 break;
621 case STATUS:
622 memaddr = kvirt_to_phys((void *)dd->status);
623 memlen = PAGE_SIZE;
624 flags |= VM_IO | VM_DONTEXPAND;
625 break;
626 case RTAIL:
627 if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
628 /*
629 * If the memory allocation failed, the context alloc
630 * also would have failed, so we would never get here
631 */
632 ret = -EINVAL;
633 goto done;
634 }
635 if (flags & VM_WRITE) {
636 ret = -EPERM;
637 goto done;
638 }
639 memaddr = uctxt->rcvhdrqtailaddr_phys;
640 memlen = PAGE_SIZE;
641 flags &= ~VM_MAYWRITE;
642 break;
643 case SUBCTXT_UREGS:
644 memaddr = (u64)uctxt->subctxt_uregbase;
645 memlen = PAGE_SIZE;
646 flags |= VM_IO | VM_DONTEXPAND;
647 vmf = 1;
648 break;
649 case SUBCTXT_RCV_HDRQ:
650 memaddr = (u64)uctxt->subctxt_rcvhdr_base;
651 memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
652 flags |= VM_IO | VM_DONTEXPAND;
653 vmf = 1;
654 break;
655 case SUBCTXT_EGRBUF:
656 memaddr = (u64)uctxt->subctxt_rcvegrbuf;
657 memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
658 flags |= VM_IO | VM_DONTEXPAND;
659 flags &= ~VM_MAYWRITE;
660 vmf = 1;
661 break;
662 case SDMA_COMP: {
663 struct hfi1_user_sdma_comp_q *cq;
664
665 if (!user_sdma_comp_fp(fp)) {
666 ret = -EFAULT;
667 goto done;
668 }
669 cq = user_sdma_comp_fp(fp);
670 memaddr = (u64)cq->comps;
671 memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
672 flags |= VM_IO | VM_DONTEXPAND;
673 vmf = 1;
674 break;
675 }
676 default:
677 ret = -EINVAL;
678 break;
679 }
680
681 if ((vma->vm_end - vma->vm_start) != memlen) {
682 hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
683 uctxt->ctxt, subctxt_fp(fp),
684 (vma->vm_end - vma->vm_start), memlen);
685 ret = -EINVAL;
686 goto done;
687 }
688
689 vma->vm_flags = flags;
690 dd_dev_info(dd,
691 "%s: %u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
692 __func__, ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
693 vma->vm_end - vma->vm_start, vma->vm_flags);
694 pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
695 if (vmf) {
696 vma->vm_pgoff = pfn;
697 vma->vm_ops = &vm_ops;
698 ret = 0;
699 } else if (mapio) {
700 ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
701 vma->vm_page_prot);
702 } else {
703 ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
704 vma->vm_page_prot);
705 }
706done:
707 return ret;
708}
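/*
 * Usage sketch (user-space side, not part of the driver): the tokens
 * built by get_base_info() below are handed back in struct
 * hfi1_base_info and are intended to be used as the mmap() offset, e.g.
 *
 *	void *uregs = mmap(NULL, reglen, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, binfo.user_regbase);
 *
 * hfi1_file_mmap() above recovers the token from vm_pgoff << PAGE_SHIFT
 * and validates the magic, type, context and sub-context before mapping.
 * ('reglen' and 'fd' are placeholders, not names from this driver.)
 */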
709
710/*
711 * Local (non-chip) user memory is not mapped right away; it is faulted
712 * in as it is accessed by the user-level code.
713 */
714static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
715{
716 struct page *page;
717
718 page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
719 if (!page)
720 return VM_FAULT_SIGBUS;
721
722 get_page(page);
723 vmf->page = page;
724
725 return 0;
726}
727
728static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
729{
730 struct hfi1_ctxtdata *uctxt;
731 unsigned pollflag;
732
733 uctxt = ctxt_fp(fp);
734 if (!uctxt)
735 pollflag = POLLERR;
736 else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
737 pollflag = poll_urgent(fp, pt);
738 else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
739 pollflag = poll_next(fp, pt);
740 else /* invalid */
741 pollflag = POLLERR;
742
743 return pollflag;
744}
745
746static int hfi1_file_close(struct inode *inode, struct file *fp)
747{
748 struct hfi1_filedata *fdata = fp->private_data;
749 struct hfi1_ctxtdata *uctxt = fdata->uctxt;
750 struct hfi1_devdata *dd;
751 unsigned long flags, *ev;
752
753 fp->private_data = NULL;
754
755 if (!uctxt)
756 goto done;
757
758 hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
759 dd = uctxt->dd;
760 mutex_lock(&hfi1_mutex);
761
762 flush_wc();
763 /* drain user sdma queue */
764 if (fdata->pq)
765 hfi1_user_sdma_free_queues(fdata);
766
767 /*
768	 * Clear any leftover, unhandled events so the next process that
769 * gets this context doesn't get confused.
770 */
771 ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
772 HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
773 *ev = 0;
774
775 if (--uctxt->cnt) {
776 uctxt->active_slaves &= ~(1 << fdata->subctxt);
777 uctxt->subpid[fdata->subctxt] = 0;
778 mutex_unlock(&hfi1_mutex);
779 goto done;
780 }
781
782 spin_lock_irqsave(&dd->uctxt_lock, flags);
783 /*
784	 * Disable the receive context and interrupt availability, and reset
785	 * all RcvCtxtCtrl bits to their default values.
786 */
787 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
788 HFI1_RCVCTRL_TIDFLOW_DIS |
789 HFI1_RCVCTRL_INTRAVAIL_DIS |
790 HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
791 HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
792 HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
793 /* Clear the context's J_KEY */
794 hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
795 /*
796 * Reset context integrity checks to default.
797 * (writes to CSRs probably belong in chip.c)
798 */
799 write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
800 hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
801 sc_disable(uctxt->sc);
802 uctxt->pid = 0;
803 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
804
805 dd->rcd[uctxt->ctxt] = NULL;
806 uctxt->rcvwait_to = 0;
807 uctxt->piowait_to = 0;
808 uctxt->rcvnowait = 0;
809 uctxt->pionowait = 0;
810 uctxt->event_flags = 0;
811
812 hfi1_clear_tids(uctxt);
813 hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
814
815 if (uctxt->tid_pg_list)
816 unlock_exp_tids(uctxt);
817
818 hfi1_stats.sps_ctxts--;
819 dd->freectxts++;
820 mutex_unlock(&hfi1_mutex);
821 hfi1_free_ctxtdata(dd, uctxt);
822done:
823 kfree(fdata);
824 return 0;
825}
826
827/*
828 * Convert kernel *virtual* addresses to physical addresses.
829 * This is used for vmalloc'ed addresses.
830 */
831static u64 kvirt_to_phys(void *addr)
832{
833 struct page *page;
834 u64 paddr = 0;
835
836 page = vmalloc_to_page(addr);
837 if (page)
838 paddr = page_to_pfn(page) << PAGE_SHIFT;
839
840 return paddr;
841}
842
843static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
844{
845 int i_minor, ret = 0;
846 unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
847
848 swmajor = uinfo->userversion >> 16;
849 if (swmajor != HFI1_USER_SWMAJOR) {
850 ret = -ENODEV;
851 goto done;
852 }
853
854 swminor = uinfo->userversion & 0xffff;
855
856 if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
857 alg = uinfo->hfi1_alg;
858
859 mutex_lock(&hfi1_mutex);
860	/* First, let's check if we need to set up a shared context. */
861 if (uinfo->subctxt_cnt)
862 ret = find_shared_ctxt(fp, uinfo);
863
864 /*
865 * We execute the following block if we couldn't find a
866 * shared context or if context sharing is not required.
867 */
868 if (!ret) {
869 i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
870 ret = get_user_context(fp, uinfo, i_minor - 1, alg);
871 }
872 mutex_unlock(&hfi1_mutex);
873done:
874 return ret;
875}
876
877static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
878 int devno, unsigned alg)
879{
880 struct hfi1_devdata *dd = NULL;
881 int ret = 0, devmax, npresent, nup, dev;
882
883 devmax = hfi1_count_units(&npresent, &nup);
884 if (!npresent) {
885 ret = -ENXIO;
886 goto done;
887 }
888 if (!nup) {
889 ret = -ENETDOWN;
890 goto done;
891 }
892 if (devno >= 0) {
893 dd = hfi1_lookup(devno);
894 if (!dd)
895 ret = -ENODEV;
896 else if (!dd->freectxts)
897 ret = -EBUSY;
898 } else {
899 struct hfi1_devdata *pdd;
900
901 if (alg == HFI1_ALG_ACROSS) {
902 unsigned free = 0U;
903
904 for (dev = 0; dev < devmax; dev++) {
905 pdd = hfi1_lookup(dev);
906 if (pdd && pdd->freectxts &&
907 pdd->freectxts > free) {
908 dd = pdd;
909 free = pdd->freectxts;
910 }
911 }
912 } else {
913 for (dev = 0; dev < devmax; dev++) {
914 pdd = hfi1_lookup(dev);
915 if (pdd && pdd->freectxts) {
916 dd = pdd;
917 break;
918 }
919 }
920 }
921 if (!dd)
922 ret = -EBUSY;
923 }
924done:
925 return ret ? ret : allocate_ctxt(fp, dd, uinfo);
926}
927
928static int find_shared_ctxt(struct file *fp,
929 const struct hfi1_user_info *uinfo)
930{
931 int devmax, ndev, i;
932 int ret = 0;
933
934 devmax = hfi1_count_units(NULL, NULL);
935
936 for (ndev = 0; ndev < devmax; ndev++) {
937 struct hfi1_devdata *dd = hfi1_lookup(ndev);
938
939 /* device portion of usable() */
940 if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
941 continue;
942 for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
943 struct hfi1_ctxtdata *uctxt = dd->rcd[i];
944
945 /* Skip ctxts which are not yet open */
946 if (!uctxt || !uctxt->cnt)
947 continue;
948 /* Skip ctxt if it doesn't match the requested one */
949 if (memcmp(uctxt->uuid, uinfo->uuid,
950 sizeof(uctxt->uuid)) ||
951 uctxt->subctxt_id != uinfo->subctxt_id ||
952 uctxt->subctxt_cnt != uinfo->subctxt_cnt)
953 continue;
954
955 /* Verify the sharing process matches the master */
956 if (uctxt->userversion != uinfo->userversion ||
957 uctxt->cnt >= uctxt->subctxt_cnt) {
958 ret = -EINVAL;
959 goto done;
960 }
961 ctxt_fp(fp) = uctxt;
962 subctxt_fp(fp) = uctxt->cnt++;
963 uctxt->subpid[subctxt_fp(fp)] = current->pid;
964 uctxt->active_slaves |= 1 << subctxt_fp(fp);
965 ret = 1;
966 goto done;
967 }
968 }
969
970done:
971 return ret;
972}
973
974static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
975 struct hfi1_user_info *uinfo)
976{
977 struct hfi1_ctxtdata *uctxt;
978 unsigned ctxt;
979 int ret;
980
981 if (dd->flags & HFI1_FROZEN) {
982 /*
983		 * Pick an error that is distinct from all other errors
984		 * that are returned so the user process knows that
985		 * it tried to allocate while the SPC was frozen. It
986		 * should be able to retry with success in a short
987		 * while.
988 */
989 return -EIO;
990 }
991
992 for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
993 if (!dd->rcd[ctxt])
994 break;
995
996 if (ctxt == dd->num_rcv_contexts)
997 return -EBUSY;
998
999 uctxt = hfi1_create_ctxtdata(dd->pport, ctxt);
1000 if (!uctxt) {
1001 dd_dev_err(dd,
1002 "Unable to allocate ctxtdata memory, failing open\n");
1003 return -ENOMEM;
1004 }
1005 /*
1006 * Allocate and enable a PIO send context.
1007 */
1008 uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
1009 uctxt->numa_id);
1010 if (!uctxt->sc)
1011 return -ENOMEM;
1012
1013 dbg("allocated send context %u(%u)\n", uctxt->sc->sw_index,
1014 uctxt->sc->hw_context);
1015 ret = sc_enable(uctxt->sc);
1016 if (ret)
1017 return ret;
1018 /*
1019	 * Set up shared context resources if the user level has requested
1020 * shared contexts and this is the 'master' process.
1021 * This has to be done here so the rest of the sub-contexts find the
1022 * proper master.
1023 */
1024 if (uinfo->subctxt_cnt && !subctxt_fp(fp)) {
1025 ret = init_subctxts(uctxt, uinfo);
1026 /*
1027 * On error, we don't need to disable and de-allocate the
1028 * send context because it will be done during file close
1029 */
1030 if (ret)
1031 return ret;
1032 }
1033 uctxt->userversion = uinfo->userversion;
1034 uctxt->pid = current->pid;
1035 uctxt->flags = HFI1_CAP_UGET(MASK);
1036 init_waitqueue_head(&uctxt->wait);
1037 strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
1038 memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
1039 uctxt->jkey = generate_jkey(current_uid());
1040 INIT_LIST_HEAD(&uctxt->sdma_queues);
1041 spin_lock_init(&uctxt->sdma_qlock);
1042 hfi1_stats.sps_ctxts++;
1043 dd->freectxts--;
1044 ctxt_fp(fp) = uctxt;
1045
1046 return 0;
1047}
1048
1049static int init_subctxts(struct hfi1_ctxtdata *uctxt,
1050 const struct hfi1_user_info *uinfo)
1051{
1052 int ret = 0;
1053 unsigned num_subctxts;
1054
1055 num_subctxts = uinfo->subctxt_cnt;
1056 if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
1057 ret = -EINVAL;
1058 goto bail;
1059 }
1060
1061 uctxt->subctxt_cnt = uinfo->subctxt_cnt;
1062 uctxt->subctxt_id = uinfo->subctxt_id;
1063 uctxt->active_slaves = 1;
1064 uctxt->redirect_seq_cnt = 1;
1065 set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
1066bail:
1067 return ret;
1068}
1069
1070static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
1071{
1072 int ret = 0;
1073 unsigned num_subctxts = uctxt->subctxt_cnt;
1074
1075 uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
1076 if (!uctxt->subctxt_uregbase) {
1077 ret = -ENOMEM;
1078 goto bail;
1079 }
1080 /* We can take the size of the RcvHdr Queue from the master */
1081 uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
1082 num_subctxts);
1083 if (!uctxt->subctxt_rcvhdr_base) {
1084 ret = -ENOMEM;
1085 goto bail_ureg;
1086 }
1087
1088 uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
1089 num_subctxts);
1090 if (!uctxt->subctxt_rcvegrbuf) {
1091 ret = -ENOMEM;
1092 goto bail_rhdr;
1093 }
1094 goto bail;
1095bail_rhdr:
1096 vfree(uctxt->subctxt_rcvhdr_base);
1097bail_ureg:
1098 vfree(uctxt->subctxt_uregbase);
1099 uctxt->subctxt_uregbase = NULL;
1100bail:
1101 return ret;
1102}
1103
1104static int user_init(struct file *fp)
1105{
1106 int ret;
1107 unsigned int rcvctrl_ops = 0;
1108 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1109
1110	/* make sure that the context has already been set up */
1111 if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) {
1112 ret = -EFAULT;
1113 goto done;
1114 }
1115
1116 /*
1117	 * Subctxts don't need to initialize anything since the master
1118 * has done it.
1119 */
1120 if (subctxt_fp(fp)) {
1121 ret = wait_event_interruptible(uctxt->wait,
1122 !test_bit(HFI1_CTXT_MASTER_UNINIT,
1123 &uctxt->event_flags));
1124 goto done;
1125 }
1126
1127 /* initialize poll variables... */
1128 uctxt->urgent = 0;
1129 uctxt->urgent_poll = 0;
1130
1131 /*
1132 * Now enable the ctxt for receive.
1133	 * For chips that are set to DMA the tail register to memory
1134	 * when it changes (and when the update bit transitions from
1135	 * 0 to 1), we turn it off and then back on.
1136 * This will (very briefly) affect any other open ctxts, but the
1137 * duration is very short, and therefore isn't an issue. We
1138 * explicitly set the in-memory tail copy to 0 beforehand, so we
1139 * don't have to wait to be sure the DMA update has happened
1140 * (chip resets head/tail to 0 on transition to enable).
1141 */
1142 if (uctxt->rcvhdrtail_kvaddr)
1143 clear_rcvhdrtail(uctxt);
1144
1145 /* Setup J_KEY before enabling the context */
1146 hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
1147
1148 rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
1149 if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
1150 rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
1151 /*
1152 * Ignore the bit in the flags for now until proper
1153	 * support for multiple packets per rcv array entry is
1154 * added.
1155 */
1156 if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
1157 rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
1158 if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
1159 rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
1160 if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
1161 rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
1162 if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
1163 rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
1164 hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
1165
1166 /* Notify any waiting slaves */
1167 if (uctxt->subctxt_cnt) {
1168 clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
1169 wake_up(&uctxt->wait);
1170 }
1171 ret = 0;
1172
1173done:
1174 return ret;
1175}
1176
1177static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
1178{
1179 struct hfi1_ctxt_info cinfo;
1180 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1181 struct hfi1_filedata *fd = fp->private_data;
1182 int ret = 0;
1183
1184 ret = hfi1_get_base_kinfo(uctxt, &cinfo);
1185 if (ret < 0)
1186 goto done;
1187 cinfo.num_active = hfi1_count_active_units();
1188 cinfo.unit = uctxt->dd->unit;
1189 cinfo.ctxt = uctxt->ctxt;
1190 cinfo.subctxt = subctxt_fp(fp);
1191 cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
1192 uctxt->dd->rcv_entries.group_size) +
1193 uctxt->expected_count;
1194 cinfo.credits = uctxt->sc->credits;
1195 cinfo.numa_node = uctxt->numa_id;
1196 cinfo.rec_cpu = fd->rec_cpu_num;
1197 cinfo.send_ctxt = uctxt->sc->hw_context;
1198
1199 cinfo.egrtids = uctxt->egrbufs.alloced;
1200 cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
1201 cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
1202 cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries;
1203 cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
1204
1205 trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo);
1206 if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
1207 ret = -EFAULT;
1208done:
1209 return ret;
1210}
1211
1212static int setup_ctxt(struct file *fp)
1213{
1214 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1215 struct hfi1_devdata *dd = uctxt->dd;
1216 int ret = 0;
1217
1218 /*
1219	 * The context should be set up only once (including allocation and
1220	 * programming of eager buffers). This is done if context sharing
1221	 * is not requested or by the master process.
1222 */
1223 if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) {
1224 ret = hfi1_init_ctxt(uctxt->sc);
1225 if (ret)
1226 goto done;
1227
1228 /* Now allocate the RcvHdr queue and eager buffers. */
1229 ret = hfi1_create_rcvhdrq(dd, uctxt);
1230 if (ret)
1231 goto done;
1232 ret = hfi1_setup_eagerbufs(uctxt);
1233 if (ret)
1234 goto done;
1235 if (uctxt->subctxt_cnt && !subctxt_fp(fp)) {
1236 ret = setup_subctxt(uctxt);
1237 if (ret)
1238 goto done;
1239 }
1240 /* Setup Expected Rcv memories */
1241 uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
1242 sizeof(struct page **));
1243 if (!uctxt->tid_pg_list) {
1244 ret = -ENOMEM;
1245 goto done;
1246 }
1247 uctxt->physshadow = vzalloc(uctxt->expected_count *
1248 sizeof(*uctxt->physshadow));
1249 if (!uctxt->physshadow) {
1250 ret = -ENOMEM;
1251 goto done;
1252 }
1253 /* allocate expected TID map and initialize the cursor */
1254 atomic_set(&uctxt->tidcursor, 0);
1255 uctxt->numtidgroups = uctxt->expected_count /
1256 dd->rcv_entries.group_size;
1257 uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
1258 !!(uctxt->numtidgroups % BITS_PER_LONG);
1259 uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
1260 sizeof(*uctxt->tidusemap),
1261 GFP_KERNEL, uctxt->numa_id);
1262 if (!uctxt->tidusemap) {
1263 ret = -ENOMEM;
1264 goto done;
1265 }
1266 /*
1267		 * If the number of groups is not a multiple of
1268		 * 64 (the number of groups in a tidusemap element), mark
1269		 * the extra ones as used. This will effectively make them
1270		 * permanently used, so they should never be assigned. Otherwise,
1271 * the code which checks how many free groups we have will
1272 * get completely confused about the state of the bits.
1273 */
1274 if (uctxt->numtidgroups % BITS_PER_LONG)
1275 uctxt->tidusemap[uctxt->tidmapcnt - 1] =
1276 ~((1ULL << (uctxt->numtidgroups %
1277 BITS_PER_LONG)) - 1);
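		/*
		 * Example: 100 expected-TID groups give tidmapcnt == 2, and
		 * bits 36..63 of tidusemap[1] are pre-set here so only the
		 * 100 real groups can ever be handed out.
		 */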
1278 trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0,
1279 uctxt->tidusemap, uctxt->tidmapcnt);
1280 }
1281 ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
1282 if (ret)
1283 goto done;
1284
1285 set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
1286done:
1287 return ret;
1288}
1289
1290static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
1291{
1292 struct hfi1_base_info binfo;
1293 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1294 struct hfi1_devdata *dd = uctxt->dd;
1295 ssize_t sz;
1296 unsigned offset;
1297 int ret = 0;
1298
1299 trace_hfi1_uctxtdata(uctxt->dd, uctxt);
1300
1301 memset(&binfo, 0, sizeof(binfo));
1302 binfo.hw_version = dd->revision;
1303 binfo.sw_version = HFI1_KERN_SWVERSION;
1304 binfo.bthqp = kdeth_qp;
1305 binfo.jkey = uctxt->jkey;
1306 /*
1307 * If more than 64 contexts are enabled the allocated credit
1308 * return will span two or three contiguous pages. Since we only
1309 * map the page containing the context's credit return address,
1310 * we need to calculate the offset in the proper page.
1311 */
1312 offset = ((u64)uctxt->sc->hw_free -
1313 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
1314 binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
1315 subctxt_fp(fp), offset);
1316 binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
1317 subctxt_fp(fp),
1318 uctxt->sc->base_addr);
1319 binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
1320 uctxt->ctxt,
1321 subctxt_fp(fp),
1322 uctxt->sc->base_addr);
1323 binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
1324 subctxt_fp(fp),
1325 uctxt->rcvhdrq);
1326 binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
1327 subctxt_fp(fp),
1328 uctxt->egrbufs.rcvtids[0].phys);
1329 binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
1330 subctxt_fp(fp), 0);
1331 /*
1332 * user regs are at
1333 * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
1334 */
1335 binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
1336 subctxt_fp(fp), 0);
1337 offset = ((((uctxt->ctxt - dd->first_user_ctxt) *
1338 HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) *
1339 sizeof(*dd->events)) & ~PAGE_MASK;
1340 binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
1341 subctxt_fp(fp),
1342 offset);
1343 binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
1344 subctxt_fp(fp),
1345 dd->status);
1346 if (HFI1_CAP_IS_USET(DMA_RTAIL))
1347 binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
1348 subctxt_fp(fp), 0);
1349 if (uctxt->subctxt_cnt) {
1350 binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
1351 uctxt->ctxt,
1352 subctxt_fp(fp), 0);
1353 binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
1354 uctxt->ctxt,
1355 subctxt_fp(fp), 0);
1356 binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
1357 uctxt->ctxt,
1358 subctxt_fp(fp), 0);
1359 }
1360 sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
1361 if (copy_to_user(ubase, &binfo, sz))
1362 ret = -EFAULT;
1363 return ret;
1364}
1365
1366static unsigned int poll_urgent(struct file *fp,
1367 struct poll_table_struct *pt)
1368{
1369 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1370 struct hfi1_devdata *dd = uctxt->dd;
1371 unsigned pollflag;
1372
1373 poll_wait(fp, &uctxt->wait, pt);
1374
1375 spin_lock_irq(&dd->uctxt_lock);
1376 if (uctxt->urgent != uctxt->urgent_poll) {
1377 pollflag = POLLIN | POLLRDNORM;
1378 uctxt->urgent_poll = uctxt->urgent;
1379 } else {
1380 pollflag = 0;
1381 set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
1382 }
1383 spin_unlock_irq(&dd->uctxt_lock);
1384
1385 return pollflag;
1386}
1387
1388static unsigned int poll_next(struct file *fp,
1389 struct poll_table_struct *pt)
1390{
1391 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1392 struct hfi1_devdata *dd = uctxt->dd;
1393 unsigned pollflag;
1394
1395 poll_wait(fp, &uctxt->wait, pt);
1396
1397 spin_lock_irq(&dd->uctxt_lock);
1398 if (hdrqempty(uctxt)) {
1399 set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
1400 hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
1401 pollflag = 0;
1402 } else
1403 pollflag = POLLIN | POLLRDNORM;
1404 spin_unlock_irq(&dd->uctxt_lock);
1405
1406 return pollflag;
1407}
1408
1409/*
1410 * Find all user contexts in use, and set the specified bit in their
1411 * event mask.
1412 * See also find_ctxt() for a similar use that is specific to send buffers.
1413 */
1414int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
1415{
1416 struct hfi1_ctxtdata *uctxt;
1417 struct hfi1_devdata *dd = ppd->dd;
1418 unsigned ctxt;
1419 int ret = 0;
1420 unsigned long flags;
1421
1422 if (!dd->events) {
1423 ret = -EINVAL;
1424 goto done;
1425 }
1426
1427 spin_lock_irqsave(&dd->uctxt_lock, flags);
1428 for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
1429 ctxt++) {
1430 uctxt = dd->rcd[ctxt];
1431 if (uctxt) {
1432 unsigned long *evs = dd->events +
1433 (uctxt->ctxt - dd->first_user_ctxt) *
1434 HFI1_MAX_SHARED_CTXTS;
1435 int i;
1436 /*
1437			 * subctxt_cnt is 0 if the context is not shared, so handle
1438			 * the base context first, then the remaining subctxts, if any
1439 */
1440 set_bit(evtbit, evs);
1441 for (i = 1; i < uctxt->subctxt_cnt; i++)
1442 set_bit(evtbit, evs + i);
1443 }
1444 }
1445 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1446done:
1447 return ret;
1448}
1449
1450/**
1451 * manage_rcvq - manage a context's receive queue
1452 * @uctxt: the context
1453 * @subctxt: the sub-context
1454 * @start_stop: action to carry out
1455 *
1456 * start_stop == 0 disables receive on the context, for use in queue
1457 * overflow conditions. start_stop == 1 re-enables the context, so the
1458 * software copy of the head register can be re-initialized.
1459 */
1460static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
1461 int start_stop)
1462{
1463 struct hfi1_devdata *dd = uctxt->dd;
1464 unsigned int rcvctrl_op;
1465
1466 if (subctxt)
1467 goto bail;
1468	/* atomically set or clear receive enable for the ctxt */
1469 if (start_stop) {
1470 /*
1471		 * On enable, force the in-memory copy of the tail register to
1472		 * 0, so that protocol code doesn't have to worry about
1473		 * whether or not the chip has yet updated the in-memory
1474		 * copy on return from the system call. The chip
1475		 * always resets its tail register back to 0 on a
1476		 * transition from disabled to enabled.
1477 */
1478 if (uctxt->rcvhdrtail_kvaddr)
1479 clear_rcvhdrtail(uctxt);
1480 rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
1481 } else
1482 rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
1483 hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
1484 /* always; new head should be equal to new tail; see above */
1485bail:
1486 return 0;
1487}
1488
1489/*
1490 * Clear the event notifier events for this context.
1491 * The user process then performs actions appropriate to the bits having
1492 * been set, if desired, and checks again in the future.
1493 */
1494static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
1495 unsigned long events)
1496{
1497 int i;
1498 struct hfi1_devdata *dd = uctxt->dd;
1499 unsigned long *evs;
1500
1501 if (!dd->events)
1502 return 0;
1503
1504 evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
1505 HFI1_MAX_SHARED_CTXTS) + subctxt;
1506
1507 for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
1508 if (!test_bit(i, &events))
1509 continue;
1510 clear_bit(i, evs);
1511 }
1512 return 0;
1513}
1514
1515#define num_user_pages(vaddr, len) \
1516 (1 + (((((unsigned long)(vaddr) + \
1517 (unsigned long)(len) - 1) & PAGE_MASK) - \
1518 ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
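/*
 * Worked example (assuming 4 KiB pages): num_user_pages(0x1ff0, 0x20)
 * spans the pages at 0x1000 and 0x2000, so it evaluates to 2; a fully
 * page-aligned 16 KiB buffer evaluates to 4.
 */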
1519
1520/**
1521 * tzcnt - count the number of trailing zeros in a 64bit value
1522 * @value: the value to be examined
1523 *
1524 * Returns the number of trailing least significant zeros in
1525 * the input value. If the value is zero, return the number of
1526 * bits in the value.
1527 */
1528static inline u8 tzcnt(u64 value)
1529{
1530 return value ? __builtin_ctzl(value) : sizeof(value) * 8;
1531}
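/*
 * e.g. tzcnt(0x8) == 3, tzcnt(1) == 0 and tzcnt(0) == 64 (the full
 * width of the 64-bit value).
 */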
1532
1533static inline unsigned num_free_groups(unsigned long map, u16 *start)
1534{
1535 unsigned free;
1536 u16 bitidx = *start;
1537
1538 if (bitidx >= BITS_PER_LONG)
1539 return 0;
1540 /* "Turn off" any bits set before our bit index */
1541 map &= ~((1ULL << bitidx) - 1);
1542 free = tzcnt(map) - bitidx;
1543 while (!free && bitidx < BITS_PER_LONG) {
1544 /* Zero out the last set bit so we look at the rest */
1545 map &= ~(1ULL << bitidx);
1546 /*
1547 * Account for the previously checked bits and advance
1548 * the bit index. We don't have to check for bitidx
1549 * getting bigger than BITS_PER_LONG here as it would
1550 * mean extra instructions that we don't need. If it
1551 * did happen, it would push free to a negative value
1552		 * which would break the loop.
1553 */
1554 free = tzcnt(map) - ++bitidx;
1555 }
1556 *start = bitidx;
1557 return free;
1558}
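/*
 * Example: with map == 0x4 (only group 2 in use) and *start == 0,
 * num_free_groups() returns 2 (groups 0 and 1 are free) and leaves
 * *start at 0. With map == 0x1 it skips the used bit, returns 63 and
 * advances *start to 1.
 */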
1559
1560static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
1561{
1562 int ret = 0;
1563 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1564 struct hfi1_devdata *dd = uctxt->dd;
1565 unsigned tid, mapped = 0, npages, ngroups, exp_groups,
1566 tidpairs = uctxt->expected_count / 2;
1567 struct page **pages;
1568 unsigned long vaddr, tidmap[uctxt->tidmapcnt];
1569 dma_addr_t *phys;
1570 u32 tidlist[tidpairs], pairidx = 0, tidcursor;
1571 u16 useidx, idx, bitidx, tidcnt = 0;
1572
1573 vaddr = tinfo->vaddr;
1574
1575 if (vaddr & ~PAGE_MASK) {
1576 ret = -EINVAL;
1577 goto bail;
1578 }
1579
1580 npages = num_user_pages(vaddr, tinfo->length);
1581 if (!npages) {
1582 ret = -EINVAL;
1583 goto bail;
1584 }
1585 if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
1586 npages * PAGE_SIZE)) {
1587 dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
1588 (void *)vaddr, npages);
1589 ret = -EFAULT;
1590 goto bail;
1591 }
1592
1593 memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
1594 memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
1595
1596 exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
1597 /* which group set do we look at first? */
1598 tidcursor = atomic_read(&uctxt->tidcursor);
1599 useidx = (tidcursor >> 16) & 0xffff;
1600 bitidx = tidcursor & 0xffff;
1601
1602 /*
1603 * Keep going until we've mapped all pages or we've exhausted all
1604 * RcvArray entries.
1605 * This iterates over the number of tidmaps + 1
1606	 * (idx <= uctxt->tidmapcnt) so the bitmap we started from is
1607	 * checked one more time for any free bits before the
1608	 * starting point bit.
1609 */
1610 for (mapped = 0, idx = 0;
1611 mapped < npages && idx <= uctxt->tidmapcnt;) {
1612 u64 i, offset = 0;
1613 unsigned free, pinned, pmapped = 0, bits_used;
1614 u16 grp;
1615
1616 /*
1617 * "Reserve" the needed group bits under lock so other
1618 * processes can't step in the middle of it. Once
1619 * reserved, we don't need the lock anymore since we
1620 * are guaranteed the groups.
1621 */
1622 spin_lock(&uctxt->exp_lock);
1623 if (uctxt->tidusemap[useidx] == -1ULL ||
1624 bitidx >= BITS_PER_LONG) {
1625 /* no free groups in the set, use the next */
1626 useidx = (useidx + 1) % uctxt->tidmapcnt;
1627 idx++;
1628 bitidx = 0;
1629 spin_unlock(&uctxt->exp_lock);
1630 continue;
1631 }
1632 ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
1633 !!((npages - mapped) % dd->rcv_entries.group_size);
1634
1635 /*
1636 * If we've gotten here, the current set of groups does have
1637 * one or more free groups.
1638 */
1639 free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
1640 if (!free) {
1641 /*
1642 * Despite the check above, free could still come back
1643 * as 0 because we don't check the entire bitmap but
1644 * we start from bitidx.
1645 */
1646 spin_unlock(&uctxt->exp_lock);
1647 continue;
1648 }
1649 bits_used = min(free, ngroups);
1650 tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
1651 uctxt->tidusemap[useidx] |= tidmap[useidx];
1652 spin_unlock(&uctxt->exp_lock);
1653
1654 /*
1655 * At this point, we know where in the map we have free bits.
1656		 * Properly offset into the various "shadow" arrays and compute
1657 * the RcvArray entry index.
1658 */
1659 offset = ((useidx * BITS_PER_LONG) + bitidx) *
1660 dd->rcv_entries.group_size;
1661 pages = uctxt->tid_pg_list + offset;
1662 phys = uctxt->physshadow + offset;
1663 tid = uctxt->expected_base + offset;
1664
1665 /* Calculate how many pages we can pin based on free bits */
1666 pinned = min((bits_used * dd->rcv_entries.group_size),
1667 (npages - mapped));
1668 /*
1669 * Now that we know how many free RcvArray entries we have,
1670 * we can pin that many user pages.
1671 */
1672 ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE),
1673 pinned, pages);
1674 if (ret) {
1675 /*
1676 * We can't continue because the pages array won't be
1677 * initialized. This should never happen,
1678 * unless perhaps the user has mpin'ed the pages
1679 * themselves.
1680 */
1681 dd_dev_info(dd,
1682 "Failed to lock addr %p, %u pages: errno %d\n",
1683 (void *) vaddr, pinned, -ret);
1684 /*
1685 * Let go of the bits that we reserved since we are not
1686 * going to use them.
1687 */
1688 spin_lock(&uctxt->exp_lock);
1689 uctxt->tidusemap[useidx] &=
1690 ~(((1ULL << bits_used) - 1) << bitidx);
1691 spin_unlock(&uctxt->exp_lock);
1692 goto done;
1693 }
1694 /*
1695 * How many groups do we need based on how many pages we have
1696 * pinned?
1697 */
1698 ngroups = (pinned / dd->rcv_entries.group_size) +
1699 !!(pinned % dd->rcv_entries.group_size);
1700 /*
1701 * Keep programming RcvArray entries for all the <ngroups> free
1702 * groups.
1703 */
1704 for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
1705 unsigned j;
1706 u32 pair_size = 0, tidsize;
1707 /*
1708 * This inner loop will program an entire group or the
1709 * array of pinned pages (which ever limit is hit
1710 * first).
1711 */
1712 for (j = 0; j < dd->rcv_entries.group_size &&
1713 pmapped < pinned; j++, pmapped++, tid++) {
1714 tidsize = PAGE_SIZE;
1715 phys[pmapped] = hfi1_map_page(dd->pcidev,
1716 pages[pmapped], 0,
1717 tidsize, PCI_DMA_FROMDEVICE);
1718 trace_hfi1_exp_rcv_set(uctxt->ctxt,
1719 subctxt_fp(fp),
1720 tid, vaddr,
1721 phys[pmapped],
1722 pages[pmapped]);
1723 /*
1724 * Each RcvArray entry is programmed with one
1725				 * page's worth of memory. This will handle
1726 * the 8K MTU as well as anything smaller
1727 * due to the fact that both entries in the
1728 * RcvTidPair are programmed with a page.
1729 * PSM currently does not handle anything
1730 * bigger than 8K MTU, so should we even worry
1731 * about 10K here?
1732 */
1733 hfi1_put_tid(dd, tid, PT_EXPECTED,
1734 phys[pmapped],
1735 ilog2(tidsize >> PAGE_SHIFT) + 1);
1736 pair_size += tidsize >> PAGE_SHIFT;
1737 EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
1738 if (!(tid % 2)) {
1739 tidlist[pairidx] |=
1740 EXP_TID_SET(IDX,
1741 (tid - uctxt->expected_base)
1742 / 2);
1743 tidlist[pairidx] |=
1744 EXP_TID_SET(CTRL, 1);
1745 tidcnt++;
1746 } else {
1747 tidlist[pairidx] |=
1748 EXP_TID_SET(CTRL, 2);
1749 pair_size = 0;
1750 pairidx++;
1751 }
1752 }
1753 /*
1754 * We've programmed the entire group (or as much of the
1755		 * group as we'll use). Now, it's time to push it out...
1756 */
1757 flush_wc();
1758 }
1759 mapped += pinned;
1760 atomic_set(&uctxt->tidcursor,
1761 (((useidx & 0xffffff) << 16) |
1762 ((bitidx + bits_used) & 0xffffff)));
1763 }
1764 trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap,
1765 uctxt->tidmapcnt);
1766
1767done:
1768 /* If we've mapped anything, copy relevant info to user */
1769 if (mapped) {
1770 if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
1771 tidlist, sizeof(tidlist[0]) * tidcnt)) {
1772 ret = -EFAULT;
1773 goto done;
1774 }
1775 /* copy TID info to user */
1776 if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
1777 tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
1778 ret = -EFAULT;
1779 }
1780bail:
1781 /*
1782	 * Calculate the mapped length. The new Exp TID protocol does not "unwind" and
1783 * report an error if it can't map the entire buffer. It just reports
1784 * the length that was mapped.
1785 */
1786 tinfo->length = mapped * PAGE_SIZE;
1787 tinfo->tidcnt = tidcnt;
1788 return ret;
1789}
1790
1791static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
1792{
1793 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
1794 struct hfi1_devdata *dd = uctxt->dd;
1795 unsigned long tidmap[uctxt->tidmapcnt];
1796 struct page **pages;
1797 dma_addr_t *phys;
1798 u16 idx, bitidx, tid;
1799 int ret = 0;
1800
1801 if (copy_from_user(&tidmap, (void __user *)(unsigned long)
1802 tinfo->tidmap,
1803 sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
1804 ret = -EFAULT;
1805 goto done;
1806 }
1807 for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
1808 unsigned long map;
1809
1810 bitidx = 0;
1811 if (!tidmap[idx])
1812 continue;
1813 map = tidmap[idx];
1814 while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
1815 int i, pcount = 0;
1816 struct page *pshadow[dd->rcv_entries.group_size];
1817 unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
1818 dd->rcv_entries.group_size;
1819
1820 pages = uctxt->tid_pg_list + offset;
1821 phys = uctxt->physshadow + offset;
1822 tid = uctxt->expected_base + offset;
1823 for (i = 0; i < dd->rcv_entries.group_size;
1824 i++, tid++) {
1825 if (pages[i]) {
1826 hfi1_put_tid(dd, tid, PT_INVALID,
1827 0, 0);
1828 trace_hfi1_exp_rcv_free(uctxt->ctxt,
1829 subctxt_fp(fp),
1830 tid, phys[i],
1831 pages[i]);
1832 pci_unmap_page(dd->pcidev, phys[i],
1833 PAGE_SIZE, PCI_DMA_FROMDEVICE);
1834 pshadow[pcount] = pages[i];
1835 pages[i] = NULL;
1836 pcount++;
1837 phys[i] = 0;
1838 }
1839 }
1840 flush_wc();
1841 hfi1_release_user_pages(pshadow, pcount);
1842 clear_bit(bitidx, &uctxt->tidusemap[idx]);
1843 map &= ~(1ULL<<bitidx);
1844 }
1845 }
1846 trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 1, uctxt->tidusemap,
1847 uctxt->tidmapcnt);
1848done:
1849 return ret;
1850}
1851
1852static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
1853{
1854 struct hfi1_devdata *dd = uctxt->dd;
1855 unsigned tid;
1856
1857 dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
1858 uctxt->ctxt);
1859 for (tid = 0; tid < uctxt->expected_count; tid++) {
1860 struct page *p = uctxt->tid_pg_list[tid];
1861 dma_addr_t phys;
1862
1863 if (!p)
1864 continue;
1865
1866 phys = uctxt->physshadow[tid];
1867 uctxt->physshadow[tid] = 0;
1868 uctxt->tid_pg_list[tid] = NULL;
1869 pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
1870 hfi1_release_user_pages(&p, 1);
1871 }
1872}
1873
1874static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
1875 u16 pkey)
1876{
1877 int ret = -ENOENT, i, intable = 0;
1878 struct hfi1_pportdata *ppd = uctxt->ppd;
1879 struct hfi1_devdata *dd = uctxt->dd;
1880
1881 if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
1882 ret = -EINVAL;
1883 goto done;
1884 }
1885
1886 for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
1887 if (pkey == ppd->pkeys[i]) {
1888 intable = 1;
1889 break;
1890 }
1891
1892 if (intable)
1893 ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
1894done:
1895 return ret;
1896}
1897
1898static int ui_open(struct inode *inode, struct file *filp)
1899{
1900 struct hfi1_devdata *dd;
1901
1902 dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
1903 filp->private_data = dd; /* for other methods */
1904 return 0;
1905}
1906
1907static int ui_release(struct inode *inode, struct file *filp)
1908{
1909 /* nothing to do */
1910 return 0;
1911}
1912
1913static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
1914{
1915 struct hfi1_devdata *dd = filp->private_data;
1916
1917 switch (whence) {
1918 case SEEK_SET:
1919 break;
1920 case SEEK_CUR:
1921 offset += filp->f_pos;
1922 break;
1923 case SEEK_END:
1924 offset = ((dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) -
1925 offset;
1926 break;
1927 default:
1928 return -EINVAL;
1929 }
1930
1931 if (offset < 0)
1932 return -EINVAL;
1933
1934 if (offset >= (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE)
1935 return -EINVAL;
1936
1937 filp->f_pos = offset;
1938
1939 return filp->f_pos;
1940}
1941
1942
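/*
 * The ui device presents a flat address space: offsets below
 * (kregend - kregbase) access the chip CSRs through the BAR, and the
 * next DC8051_DATA_MEM_SIZE bytes are routed to the 8051 data memory
 * (see ui_read() below). Writes are limited to the CSR range.
 */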
1943/* NOTE: assumes unsigned long is 8 bytes */
1944static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
1945 loff_t *f_pos)
1946{
1947 struct hfi1_devdata *dd = filp->private_data;
1948 void __iomem *base = dd->kregbase;
1949 unsigned long total, csr_off,
1950 barlen = (dd->kregend - dd->kregbase);
1951 u64 data;
1952
1953 /* only read 8 byte quantities */
1954 if ((count % 8) != 0)
1955 return -EINVAL;
1956 /* offset must be 8-byte aligned */
1957 if ((*f_pos % 8) != 0)
1958 return -EINVAL;
1959 /* destination buffer must be 8-byte aligned */
1960 if ((unsigned long)buf % 8 != 0)
1961 return -EINVAL;
1962 /* must be in range */
1963 if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
1964 return -EINVAL;
1965 /* only set the base if we are not starting past the BAR */
1966 if (*f_pos < barlen)
1967 base += *f_pos;
1968 csr_off = *f_pos;
1969 for (total = 0; total < count; total += 8, csr_off += 8) {
1970 /* accessing LCB CSRs requires more checks */
1971 if (is_lcb_offset(csr_off)) {
1972 if (read_lcb_csr(dd, csr_off, (u64 *)&data))
1973 break; /* failed */
1974 }
1975 /*
1976 * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
1977 * false parity error. Avoid the whole issue by not reading
1978 * them. These registers are defined as having a read value
1979 * of 0.
1980 */
1981 else if (csr_off == ASIC_GPIO_CLEAR
1982 || csr_off == ASIC_GPIO_FORCE
1983 || csr_off == ASIC_QSFP1_CLEAR
1984 || csr_off == ASIC_QSFP1_FORCE
1985 || csr_off == ASIC_QSFP2_CLEAR
1986 || csr_off == ASIC_QSFP2_FORCE)
1987 data = 0;
1988 else if (csr_off >= barlen) {
1989 /*
1990 * read_8051_data can read more than just 8 bytes at
1991 * a time. However, folding this into the loop and
1992 * handling the reads in 8 byte increments allows us
1993 * to smoothly transition from chip memory to 8051
1994 * memory.
1995 */
1996 if (read_8051_data(dd,
1997 (u32)(csr_off - barlen),
1998 sizeof(data), &data))
1999 break; /* failed */
2000 } else
2001 data = readq(base + total);
2002 if (put_user(data, (unsigned long __user *)(buf + total)))
2003 break;
2004 }
2005 *f_pos += total;
2006 return total;
2007}
2008
2009/* NOTE: assumes unsigned long is 8 bytes */
2010static ssize_t ui_write(struct file *filp, const char __user *buf,
2011 size_t count, loff_t *f_pos)
2012{
2013 struct hfi1_devdata *dd = filp->private_data;
2014 void __iomem *base;
2015 unsigned long total, data, csr_off;
2016 int in_lcb;
2017
2018 /* only write 8 byte quantities */
2019 if ((count % 8) != 0)
2020 return -EINVAL;
2021 /* offset must be 8-byte aligned */
2022 if ((*f_pos % 8) != 0)
2023 return -EINVAL;
2024 /* source buffer must be 8-byte aligned */
2025 if ((unsigned long)buf % 8 != 0)
2026 return -EINVAL;
2027 /* must be in range */
2028 if (*f_pos + count > dd->kregend - dd->kregbase)
2029 return -EINVAL;
2030
2031 base = (void __iomem *)dd->kregbase + *f_pos;
2032 csr_off = *f_pos;
2033 in_lcb = 0;
2034 for (total = 0; total < count; total += 8, csr_off += 8) {
2035 if (get_user(data, (unsigned long __user *)(buf + total)))
2036 break;
2037 /* accessing LCB CSRs requires a special procedure */
2038 if (is_lcb_offset(csr_off)) {
2039 if (!in_lcb) {
2040 int ret = acquire_lcb_access(dd, 1);
2041
2042 if (ret)
2043 break;
2044 in_lcb = 1;
2045 }
2046 } else {
2047 if (in_lcb) {
2048 release_lcb_access(dd, 1);
2049 in_lcb = 0;
2050 }
2051 }
2052 writeq(data, base + total);
2053 }
2054 if (in_lcb)
2055 release_lcb_access(dd, 1);
2056 *f_pos += total;
2057 return total;
2058}
2059
2060static const struct file_operations ui_file_ops = {
2061 .owner = THIS_MODULE,
2062 .llseek = ui_lseek,
2063 .read = ui_read,
2064 .write = ui_write,
2065 .open = ui_open,
2066 .release = ui_release,
2067};
2068#define UI_OFFSET 192 /* device minor offset for UI devices */
2069static int create_ui = 1;
2070
2071static struct cdev wildcard_cdev;
2072static struct device *wildcard_device;
2073
2074static atomic_t user_count = ATOMIC_INIT(0);
2075
2076static void user_remove(struct hfi1_devdata *dd)
2077{
2078 if (atomic_dec_return(&user_count) == 0)
2079 hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
2080
2081 hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
2082 hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
2083}
2084
2085static int user_add(struct hfi1_devdata *dd)
2086{
2087 char name[10];
2088 int ret;
2089
2090 if (atomic_inc_return(&user_count) == 1) {
2091 ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
2092 &wildcard_cdev, &wildcard_device);
2093 if (ret)
2094 goto done;
2095 }
2096
2097 snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
2098 ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
2099 &dd->user_cdev, &dd->user_device);
2100 if (ret)
2101 goto done;
2102
2103 if (create_ui) {
2104 snprintf(name, sizeof(name),
2105 "%s_ui%d", class_name(), dd->unit);
2106 ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
2107 &dd->ui_cdev, &dd->ui_device);
2108 if (ret)
2109 goto done;
2110 }
2111
2112 return 0;
2113done:
2114 user_remove(dd);
2115 return ret;
2116}
2117
2118/*
2119 * Create per-unit files in /dev
2120 */
2121int hfi1_device_create(struct hfi1_devdata *dd)
2122{
2123 int r, ret;
2124
2125 r = user_add(dd);
2126 ret = hfi1_diag_add(dd);
2127 if (r && !ret)
2128 ret = r;
2129 return ret;
2130}
2131
2132/*
2133 * Remove per-unit files in /dev
2134 * void, core kernel returns no errors for this stuff
2135 */
2136void hfi1_device_remove(struct hfi1_devdata *dd)
2137{
2138 user_remove(dd);
2139 hfi1_diag_remove(dd);
2140}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
new file mode 100644
index 000000000000..5c2f2ed8f224
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/firmware.c
@@ -0,0 +1,1620 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/firmware.h>
52#include <linux/mutex.h>
53#include <linux/module.h>
54#include <linux/delay.h>
55#include <linux/crc32.h>
56
57#include "hfi.h"
58#include "trace.h"
59
60/*
61 * Make it easy to toggle the firmware file names, and whether each gets
62 * loaded, by editing the following. This may be something we do while in
63 * development, but not necessarily something a user would ever need to use.
64 */
65#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
66#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
67#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
68#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
69#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
70#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
71
72static uint fw_8051_load = 1;
73static uint fw_fabric_serdes_load = 1;
74static uint fw_pcie_serdes_load = 1;
75static uint fw_sbus_load = 1;
76static uint platform_config_load = 1;
77
78/* Firmware file names get set in hfi1_firmware_init() based on the above */
79static char *fw_8051_name;
80static char *fw_fabric_serdes_name;
81static char *fw_sbus_name;
82static char *fw_pcie_serdes_name;
83static char *platform_config_name;
84
85#define SBUS_MAX_POLL_COUNT 100
86#define SBUS_COUNTER(reg, name) \
87 (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
88 ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
89
90/*
91 * Firmware security header.
92 */
93struct css_header {
94 u32 module_type;
95 u32 header_len;
96 u32 header_version;
97 u32 module_id;
98 u32 module_vendor;
99 u32 date; /* BCD yyyymmdd */
100 u32 size; /* in DWORDs */
101 u32 key_size; /* in DWORDs */
102 u32 modulus_size; /* in DWORDs */
103 u32 exponent_size; /* in DWORDs */
104 u32 reserved[22];
105};
106/* expected field values */
107#define CSS_MODULE_TYPE 0x00000006
108#define CSS_HEADER_LEN 0x000000a1
109#define CSS_HEADER_VERSION 0x00010000
110#define CSS_MODULE_VENDOR 0x00008086
111
112#define KEY_SIZE 256
113#define MU_SIZE 8
114#define EXPONENT_SIZE 4
115
116/* the file itself */
117struct firmware_file {
118 struct css_header css_header;
119 u8 modulus[KEY_SIZE];
120 u8 exponent[EXPONENT_SIZE];
121 u8 signature[KEY_SIZE];
122 u8 firmware[];
123};
124
125struct augmented_firmware_file {
126 struct css_header css_header;
127 u8 modulus[KEY_SIZE];
128 u8 exponent[EXPONENT_SIZE];
129 u8 signature[KEY_SIZE];
130 u8 r2[KEY_SIZE];
131 u8 mu[MU_SIZE];
132 u8 firmware[];
133};
134
135/* augmented file size difference */
136#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
137 sizeof(struct firmware_file))
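A quick consistency check on these sizes (assuming no structure padding, which holds here since every member is a naturally aligned u32 or u8 array):

/*
 * sizeof(struct css_header)    = 32 u32s             = 128 bytes
 * sizeof(struct firmware_file) = 128 + 256 + 4 + 256 = 644 bytes
 *                              = 161 DWORDs          = 0xa1 (CSS_HEADER_LEN)
 * AUGMENT_SIZE                 = KEY_SIZE + MU_SIZE  = 264 bytes
 *                                (the r2[] and mu[] blocks inserted ahead of
 *                                 the firmware[] payload)
 */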
138
139struct firmware_details {
140 /* Linux core piece */
141 const struct firmware *fw;
142
143 struct css_header *css_header;
144 u8 *firmware_ptr; /* pointer to binary data */
145 u32 firmware_len; /* length in bytes */
146 u8 *modulus; /* pointer to the modulus */
147 u8 *exponent; /* pointer to the exponent */
148 u8 *signature; /* pointer to the signature */
149 u8 *r2; /* pointer to r2 */
150 u8 *mu; /* pointer to mu */
151 struct augmented_firmware_file dummy_header;
152};
153
154/*
155 * The mutex protects fw_state, fw_err, and all of the firmware_details
156 * variables.
157 */
158static DEFINE_MUTEX(fw_mutex);
159enum fw_state {
160 FW_EMPTY,
161 FW_ACQUIRED,
162 FW_ERR
163};
164static enum fw_state fw_state = FW_EMPTY;
165static int fw_err;
166static struct firmware_details fw_8051;
167static struct firmware_details fw_fabric;
168static struct firmware_details fw_pcie;
169static struct firmware_details fw_sbus;
170static const struct firmware *platform_config;
171
172/* flags for turn_off_spicos() */
173#define SPICO_SBUS 0x1
174#define SPICO_FABRIC 0x2
175#define ENABLE_SPICO_SMASK 0x1
176
177/* security block commands */
178#define RSA_CMD_INIT 0x1
179#define RSA_CMD_START 0x2
180
181/* security block status */
182#define RSA_STATUS_IDLE 0x0
183#define RSA_STATUS_ACTIVE 0x1
184#define RSA_STATUS_DONE 0x2
185#define RSA_STATUS_FAILED 0x3
186
187/* RSA engine timeout, in ms */
188#define RSA_ENGINE_TIMEOUT 100 /* ms */
189
190/* hardware mutex timeout, in ms */
191#define HM_TIMEOUT 4000 /* 4 s */
192
193/* 8051 memory access timeout, in us */
194#define DC8051_ACCESS_TIMEOUT 100 /* us */
195
196/* the number of fabric SerDes on the SBus */
197#define NUM_FABRIC_SERDES 4
198
199/* SBus fabric SerDes addresses, one set per HFI */
200static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
201 { 0x01, 0x02, 0x03, 0x04 },
202 { 0x28, 0x29, 0x2a, 0x2b }
203};
204
205/* SBus PCIe SerDes addresses, one set per HFI */
206static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
207 { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
208 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
209 { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
210 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
211};
212
213/* SBus PCIe PCS addresses, one set per HFI */
214const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
215 { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
216 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
217 { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
218 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
219};
220
221/* SBus fabric SerDes broadcast addresses, one per HFI */
222static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
223static const u8 all_fabric_serdes_broadcast = 0xe1;
224
225/* SBus PCIe SerDes broadcast addresses, one per HFI */
226const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
227static const u8 all_pcie_serdes_broadcast = 0xe0;
228
229/* forwards */
230static void dispose_one_firmware(struct firmware_details *fdet);
231
232/*
233 * Read a single 64-bit value from 8051 data memory.
234 *
235 * Expects:
236 * o caller to have already set up data read, no auto increment
237 * o caller to turn off read enable when finished
238 *
239 * The address argument is a byte offset. Bits 0:2 in the address are
240 * ignored - i.e. the hardware will always do aligned 8-byte reads as if
241 * the lower bits are zero.
242 *
243 * Return 0 on success, -ENXIO on a read error (timeout).
244 */
245static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
246{
247 u64 reg;
248 int count;
249
250 /* start the read at the given address */
251 reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
252 << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
253 | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
254 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
255
256 /* wait until ACCESS_COMPLETED is set */
257 count = 0;
258 while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
259 & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
260 == 0) {
261 count++;
262 if (count > DC8051_ACCESS_TIMEOUT) {
263 dd_dev_err(dd, "timeout reading 8051 data\n");
264 return -ENXIO;
265 }
266 ndelay(10);
267 }
268
269 /* gather the data */
270 *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
271
272 return 0;
273}
274
275/*
276 * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks.
277 * Return 0 on success, -errno on error.
278 */
279int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
280{
281 unsigned long flags;
282 u32 done;
283 int ret = 0;
284
285 spin_lock_irqsave(&dd->dc8051_memlock, flags);
286
287 /* data read set-up, no auto-increment */
288 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
289
290 for (done = 0; done < len; addr += 8, done += 8, result++) {
291 ret = __read_8051_data(dd, addr, result);
292 if (ret)
293 break;
294 }
295
296 /* turn off read enable */
297 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
298
299 spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
300
301 return ret;
302}
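A minimal caller sketch for the routine above, assuming a hypothetical byte offset LINK_CONFIG_OFFSET into 8051 data memory (the real offsets are defined elsewhere in the driver); dd is the usual struct hfi1_devdata pointer:

	u64 val;
	int ret;

	/* LINK_CONFIG_OFFSET is a placeholder byte offset, not a real define */
	/* read one aligned 8-byte word from 8051 data RAM */
	ret = read_8051_data(dd, LINK_CONFIG_OFFSET, 8, &val);
	if (ret)
		dd_dev_err(dd, "8051 data read failed: %d\n", ret);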
303
304/*
305 * Write data or code to the 8051 code or data RAM.
306 */
307static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
308 const u8 *data, u32 len)
309{
310 u64 reg;
311 u32 offset;
312 int aligned, count;
313
314 /* check alignment */
315 aligned = ((unsigned long)data & 0x7) == 0;
316
317 /* write set-up */
318 reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
319 | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
320 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
321
322 reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
323 << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
324 | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
325 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
326
327 /* write */
328 for (offset = 0; offset < len; offset += 8) {
329 int bytes = len - offset;
330
331 if (bytes < 8) {
332 reg = 0;
333 memcpy(&reg, &data[offset], bytes);
334 } else if (aligned) {
335 reg = *(u64 *)&data[offset];
336 } else {
337 memcpy(&reg, &data[offset], 8);
338 }
339 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
340
341 /* wait until ACCESS_COMPLETED is set */
342 count = 0;
343 while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
344 & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
345 == 0) {
346 count++;
347 if (count > DC8051_ACCESS_TIMEOUT) {
348 dd_dev_err(dd, "timeout writing 8051 data\n");
349 return -ENXIO;
350 }
351 udelay(1);
352 }
353 }
354
355 /* turn off write access, auto increment (also sets to data access) */
356 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
357 write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
358
359 return 0;
360}
361
362/* return 0 if values match, non-zero and complain otherwise */
363static int invalid_header(struct hfi1_devdata *dd, const char *what,
364 u32 actual, u32 expected)
365{
366 if (actual == expected)
367 return 0;
368
369 dd_dev_err(dd,
370 "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
371 what, expected, actual);
372 return 1;
373}
374
375/*
376 * Verify that the static fields in the CSS header match.
377 */
378static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
379{
380 /* verify CSS header fields (most sizes are in DW, so add /4) */
381 if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE)
382 || invalid_header(dd, "header_len", css->header_len,
383 (sizeof(struct firmware_file)/4))
384 || invalid_header(dd, "header_version",
385 css->header_version, CSS_HEADER_VERSION)
386 || invalid_header(dd, "module_vendor",
387 css->module_vendor, CSS_MODULE_VENDOR)
388 || invalid_header(dd, "key_size",
389 css->key_size, KEY_SIZE/4)
390 || invalid_header(dd, "modulus_size",
391 css->modulus_size, KEY_SIZE/4)
392 || invalid_header(dd, "exponent_size",
393 css->exponent_size, EXPONENT_SIZE/4)) {
394 return -EINVAL;
395 }
396 return 0;
397}
398
399/*
400 * Make sure there are at least some bytes after the prefix.
401 */
402static int payload_check(struct hfi1_devdata *dd, const char *name,
403 long file_size, long prefix_size)
404{
405 /* make sure we have some payload */
406 if (prefix_size >= file_size) {
407 dd_dev_err(dd,
408 "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
409 name, file_size, prefix_size);
410 return -EINVAL;
411 }
412
413 return 0;
414}
415
416/*
417 * Request the firmware from the system. Extract the pieces and fill in
418 * fdet. If successful, the caller will need to call dispose_one_firmware().
419 * Returns 0 on success, -ERRNO on error.
420 */
421static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
422 struct firmware_details *fdet)
423{
424 struct css_header *css;
425 int ret;
426
427 memset(fdet, 0, sizeof(*fdet));
428
429 ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
430 if (ret) {
431 dd_dev_err(dd, "cannot load firmware \"%s\", err %d\n",
432 name, ret);
433 return ret;
434 }
435
436 /* verify the firmware */
437 if (fdet->fw->size < sizeof(struct css_header)) {
438 dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
439 ret = -EINVAL;
440 goto done;
441 }
442 css = (struct css_header *)fdet->fw->data;
443
444 hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
445 hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
446 hfi1_cdbg(FIRMWARE, "CSS structure:");
447 hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type);
448 hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)",
449 css->header_len, 4 * css->header_len);
450 hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version);
451 hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id);
452 hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor);
453 hfi1_cdbg(FIRMWARE, " date 0x%x", css->date);
454 hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)",
455 css->size, 4 * css->size);
456 hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)",
457 css->key_size, 4 * css->key_size);
458 hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)",
459 css->modulus_size, 4 * css->modulus_size);
460 hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)",
461 css->exponent_size, 4 * css->exponent_size);
462 hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
463 fdet->fw->size - sizeof(struct firmware_file));
464
465 /*
466 * If the file does not have a valid CSS header, fail.
467 * Otherwise, check the CSS size field for an expected size.
468 * The augmented file has r2 and mu inserted after the header
469 * was generated, so there will be a known difference between
470 * the CSS header size and the actual file size. Use this
471 * difference to identify an augmented file.
472 *
473 * Note: css->size is in DWORDs, multiply by 4 to get bytes.
474 */
475 ret = verify_css_header(dd, css);
476 if (ret) {
477 dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
478 } else if ((css->size*4) == fdet->fw->size) {
479 /* non-augmented firmware file */
480 struct firmware_file *ff = (struct firmware_file *)
481 fdet->fw->data;
482
483 /* make sure there are bytes in the payload */
484 ret = payload_check(dd, name, fdet->fw->size,
485 sizeof(struct firmware_file));
486 if (ret == 0) {
487 fdet->css_header = css;
488 fdet->modulus = ff->modulus;
489 fdet->exponent = ff->exponent;
490 fdet->signature = ff->signature;
491 fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
492 fdet->mu = fdet->dummy_header.mu; /* use dummy space */
493 fdet->firmware_ptr = ff->firmware;
494 fdet->firmware_len = fdet->fw->size -
495 sizeof(struct firmware_file);
496 /*
497 * Header does not include r2 and mu - generate here.
498 * For now, fail.
499 */
500 dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
501 ret = -EINVAL;
502 }
503 } else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) {
504 /* augmented firmware file */
505 struct augmented_firmware_file *aff =
506 (struct augmented_firmware_file *)fdet->fw->data;
507
508 /* make sure there are bytes in the payload */
509 ret = payload_check(dd, name, fdet->fw->size,
510 sizeof(struct augmented_firmware_file));
511 if (ret == 0) {
512 fdet->css_header = css;
513 fdet->modulus = aff->modulus;
514 fdet->exponent = aff->exponent;
515 fdet->signature = aff->signature;
516 fdet->r2 = aff->r2;
517 fdet->mu = aff->mu;
518 fdet->firmware_ptr = aff->firmware;
519 fdet->firmware_len = fdet->fw->size -
520 sizeof(struct augmented_firmware_file);
521 }
522 } else {
523 /* css->size check failed */
524 dd_dev_err(dd,
525 "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
526 fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4,
527 css->size);
528
529 ret = -EINVAL;
530 }
531
532done:
533 /* if returning an error, clean up after ourselves */
534 if (ret)
535 dispose_one_firmware(fdet);
536 return ret;
537}
538
539static void dispose_one_firmware(struct firmware_details *fdet)
540{
541 release_firmware(fdet->fw);
542 fdet->fw = NULL;
543}
544
545/*
546 * Called by all HFIs when loading their firmware - i.e. device probe time.
547 * The first one will do the actual firmware load. Use a mutex to resolve
548 * any possible race condition.
549 *
550 * The call to this routine cannot be moved to driver load because the kernel
551 * call request_firmware() requires a device which is only available after
552 * the first device probe.
553 */
554static int obtain_firmware(struct hfi1_devdata *dd)
555{
556 int err = 0;
557
558 mutex_lock(&fw_mutex);
559 if (fw_state == FW_ACQUIRED) {
560 goto done; /* already acquired */
561 } else if (fw_state == FW_ERR) {
562 err = fw_err;
563 goto done; /* already tried and failed */
564 }
565
566 if (fw_8051_load) {
567 err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
568 if (err)
569 goto done;
570 }
571
572 if (fw_fabric_serdes_load) {
573 err = obtain_one_firmware(dd, fw_fabric_serdes_name,
574 &fw_fabric);
575 if (err)
576 goto done;
577 }
578
579 if (fw_sbus_load) {
580 err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
581 if (err)
582 goto done;
583 }
584
585 if (fw_pcie_serdes_load) {
586 err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
587 if (err)
588 goto done;
589 }
590
591 if (platform_config_load) {
592 platform_config = NULL;
593 err = request_firmware(&platform_config, platform_config_name,
594 &dd->pcidev->dev);
595 if (err) {
596 err = 0;
597 platform_config = NULL;
598 }
599 }
600
601 /* success */
602 fw_state = FW_ACQUIRED;
603
604done:
605 if (err) {
606 fw_err = err;
607 fw_state = FW_ERR;
608 }
609 mutex_unlock(&fw_mutex);
610
611 return err;
612}
613
614/*
615 * Called when the driver unloads. The timing is asymmetric with its
616 * counterpart, obtain_firmware(). If called at device remove time,
617 * then it is conceivable that another device could probe while the
618 * firmware is being disposed. The mutexes can be moved to do that
619 * safely, but then the firmware would be requested from the OS multiple
620 * times.
621 *
622 * No mutex is needed as the driver is unloading and there cannot be any
623 * other callers.
624 */
625void dispose_firmware(void)
626{
627 dispose_one_firmware(&fw_8051);
628 dispose_one_firmware(&fw_fabric);
629 dispose_one_firmware(&fw_pcie);
630 dispose_one_firmware(&fw_sbus);
631
632 release_firmware(platform_config);
633 platform_config = NULL;
634
635 /* retain the error state, otherwise revert to empty */
636 if (fw_state != FW_ERR)
637 fw_state = FW_EMPTY;
638}
639
640/*
641 * Write a block of data to a given array CSR. All calls will be in
642 * multiples of 8 bytes.
643 */
644static void write_rsa_data(struct hfi1_devdata *dd, int what,
645 const u8 *data, int nbytes)
646{
647 int qw_size = nbytes/8;
648 int i;
649
650 if (((unsigned long)data & 0x7) == 0) {
651 /* aligned */
652 u64 *ptr = (u64 *)data;
653
654 for (i = 0; i < qw_size; i++, ptr++)
655 write_csr(dd, what + (8*i), *ptr);
656 } else {
657 /* not aligned */
658 for (i = 0; i < qw_size; i++, data += 8) {
659 u64 value;
660
661 memcpy(&value, data, 8);
662 write_csr(dd, what + (8*i), value);
663 }
664 }
665}
666
667/*
668 * Write a block of data to a given CSR as a stream of writes. All calls will
669 * be in multiples of 8 bytes.
670 */
671static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
672 const u8 *data, int nbytes)
673{
674 u64 *ptr = (u64 *)data;
675 int qw_size = nbytes/8;
676
677 for (; qw_size > 0; qw_size--, ptr++)
678 write_csr(dd, what, *ptr);
679}
680
681/*
682 * Download the signature and start the RSA mechanism. Wait for
683 * RSA_ENGINE_TIMEOUT before giving up.
684 */
685static int run_rsa(struct hfi1_devdata *dd, const char *who,
686 const u8 *signature)
687{
688 unsigned long timeout;
689 u64 reg;
690 u32 status;
691 int ret = 0;
692
693 /* write the signature */
694 write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
695
696 /* initialize RSA */
697 write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
698
699 /*
700 * Make sure the engine is idle and insert a delay between the two
701 * writes to MISC_CFG_RSA_CMD.
702 */
703 status = (read_csr(dd, MISC_CFG_FW_CTRL)
704 & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
705 >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
706 if (status != RSA_STATUS_IDLE) {
707 dd_dev_err(dd, "%s security engine not idle - giving up\n",
708 who);
709 return -EBUSY;
710 }
711
712 /* start RSA */
713 write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
714
715 /*
716 * Look for the result.
717 *
718 * The RSA engine is hooked up to two MISC errors. The driver
719 * masks these errors as they do not respond to the standard
720 * error "clear down" mechanism. Look for these errors here and
721 * clear them when possible. This routine will exit with the
722 * errors of the current run still set.
723 *
724 * MISC_FW_AUTH_FAILED_ERR
725 * Firmware authorization failed. This can be cleared by
726 * re-initializing the RSA engine, then clearing the status bit.
727 * Do not re-init the RSA engine immediately after a successful
728 * run - this will reset the current authorization.
729 *
730 * MISC_KEY_MISMATCH_ERR
731 * Key does not match. The only way to clear this is to load
732 * a matching key then clear the status bit. If this error
733 * is raised, it will persist outside of this routine until a
734 * matching key is loaded.
735 */
736 timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
737 while (1) {
738 status = (read_csr(dd, MISC_CFG_FW_CTRL)
739 & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
740 >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
741
742 if (status == RSA_STATUS_IDLE) {
743 /* should not happen */
744 dd_dev_err(dd, "%s firmware security bad idle state\n",
745 who);
746 ret = -EINVAL;
747 break;
748 } else if (status == RSA_STATUS_DONE) {
749 /* finished successfully */
750 break;
751 } else if (status == RSA_STATUS_FAILED) {
752 /* finished unsuccessfully */
753 ret = -EINVAL;
754 break;
755 }
756 /* else still active */
757
758 if (time_after(jiffies, timeout)) {
759 /*
760 * Timed out while active. We can't reset the engine
761 * if it is stuck active, but run through the
762 * error code to see what error bits are set.
763 */
764 dd_dev_err(dd, "%s firmware security time out\n", who);
765 ret = -ETIMEDOUT;
766 break;
767 }
768
769 msleep(20);
770 }
771
772 /*
773 * Arrive here on success or failure. Clear all RSA engine
774 * errors. Errors from the current run will stick - the RSA logic is
775 * still driving them high. Errors from previous runs will clear - the
776 * RSA logic is no longer driving them high.
777 */
778 write_csr(dd, MISC_ERR_CLEAR,
779 MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK
780 | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
781 /*
782 * All that is left are the current errors. Print failure details,
783 * if any.
784 */
785 reg = read_csr(dd, MISC_ERR_STATUS);
786 if (ret) {
787 if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
788 dd_dev_err(dd, "%s firmware authorization failed\n",
789 who);
790 if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
791 dd_dev_err(dd, "%s firmware key mismatch\n", who);
792 }
793
794 return ret;
795}
796
797static void load_security_variables(struct hfi1_devdata *dd,
798 struct firmware_details *fdet)
799{
800 /* Security variables a. Write the modulus */
801 write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
802 /* Security variables b. Write the r2 */
803 write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
804 /* Security variables c. Write the mu */
805 write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
806 /* Security variables d. Write the header */
807 write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
808 (u8 *)fdet->css_header, sizeof(struct css_header));
809}
810
811/* return the 8051 firmware state */
812static inline u32 get_firmware_state(struct hfi1_devdata *dd)
813{
814 u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
815
816 return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
817 & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
818}
819
820/*
821 * Wait until the firmware is up and ready to take host requests.
822 * Return 0 on success, -ETIMEDOUT on timeout.
823 */
824int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
825{
826 unsigned long timeout;
827
828 /* in the simulator, the fake 8051 is always ready */
829 if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
830 return 0;
831
832 timeout = msecs_to_jiffies(mstimeout) + jiffies;
833 while (1) {
834 if (get_firmware_state(dd) == 0xa0) /* ready */
835 return 0;
836 if (time_after(jiffies, timeout)) /* timed out */
837 return -ETIMEDOUT;
838 usleep_range(1950, 2050); /* sleep 2ms-ish */
839 }
840}
841
842/*
843 * Load the 8051 firmware.
844 */
845static int load_8051_firmware(struct hfi1_devdata *dd,
846 struct firmware_details *fdet)
847{
848 u64 reg;
849 int ret;
850 u8 ver_a, ver_b;
851
852 /*
853 * DC Reset sequence
854 * Load DC 8051 firmware
855 */
856 /*
857 * DC reset step 1: Reset DC8051
858 */
859 reg = DC_DC8051_CFG_RST_M8051W_SMASK
860 | DC_DC8051_CFG_RST_CRAM_SMASK
861 | DC_DC8051_CFG_RST_DRAM_SMASK
862 | DC_DC8051_CFG_RST_IRAM_SMASK
863 | DC_DC8051_CFG_RST_SFR_SMASK;
864 write_csr(dd, DC_DC8051_CFG_RST, reg);
865
866 /*
867 * DC reset step 2 (optional): Load 8051 data memory with link
868 * configuration
869 */
870
871 /*
872 * DC reset step 3: Load DC8051 firmware
873 */
874 /* release all but the core reset */
875 reg = DC_DC8051_CFG_RST_M8051W_SMASK;
876 write_csr(dd, DC_DC8051_CFG_RST, reg);
877
878 /* Firmware load step 1 */
879 load_security_variables(dd, fdet);
880
881 /*
882 * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
883 */
884 write_csr(dd, MISC_CFG_FW_CTRL, 0);
885
886 /* Firmware load steps 3-5 */
887 ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
888 fdet->firmware_len);
889 if (ret)
890 return ret;
891
892 /*
893 * DC reset step 4. Host starts the DC8051 firmware
894 */
895 /*
896 * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED
897 */
898 write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
899
900 /* Firmware load steps 7-10 */
901 ret = run_rsa(dd, "8051", fdet->signature);
902 if (ret)
903 return ret;
904
905 /* clear all reset bits, releasing the 8051 */
906 write_csr(dd, DC_DC8051_CFG_RST, 0ull);
907
908 /*
909 * DC reset step 5. Wait for firmware to be ready to accept host
910 * requests.
911 */
912 ret = wait_fm_ready(dd, TIMEOUT_8051_START);
913 if (ret) { /* timed out */
914 dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
915 get_firmware_state(dd));
916 return -ETIMEDOUT;
917 }
918
919 read_misc_status(dd, &ver_a, &ver_b);
920 dd_dev_info(dd, "8051 firmware version %d.%d\n",
921 (int)ver_b, (int)ver_a);
922 dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
923
924 return 0;
925}
926
927/* SBus Master broadcast address */
928#define SBUS_MASTER_BROADCAST 0xfd
929
930/*
931 * Write the SBus request register
932 *
933 * No need for masking - the arguments are sized exactly.
934 */
935void sbus_request(struct hfi1_devdata *dd,
936 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
937{
938 write_csr(dd, ASIC_CFG_SBUS_REQUEST,
939 ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT)
940 | ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT)
941 | ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT)
942 | ((u64)receiver_addr
943 << ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
944}
945
946/*
947 * Turn off the SBus and fabric serdes spicos.
948 *
949 * + Must be called with SBus fast mode turned on.
950 * + Must be called after fabric serdes broadcast is set up.
951 * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
952 * when using MISC_CFG_FW_CTRL.
953 */
954static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
955{
956 /* only needed on A0 */
957 if (!is_a0(dd))
958 return;
959
960 dd_dev_info(dd, "Turning off spicos:%s%s\n",
961 flags & SPICO_SBUS ? " SBus" : "",
962 flags & SPICO_FABRIC ? " fabric" : "");
963
964 write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
965 /* disable SBus spico */
966 if (flags & SPICO_SBUS)
967 sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
968 WRITE_SBUS_RECEIVER, 0x00000040);
969
970 /* disable the fabric serdes spicos */
971 if (flags & SPICO_FABRIC)
972 sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
973 0x07, WRITE_SBUS_RECEIVER, 0x00000000);
974 write_csr(dd, MISC_CFG_FW_CTRL, 0);
975}
976
977/*
978 * Reset all of the fabric serdes for our HFI.
979 */
980void fabric_serdes_reset(struct hfi1_devdata *dd)
981{
982 u8 ra;
983
984 if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */
985 return;
986
987 ra = fabric_serdes_broadcast[dd->hfi1_id];
988
989 acquire_hw_mutex(dd);
990 set_sbus_fast_mode(dd);
991 /* place SerDes in reset and disable SPICO */
992 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
993 /* wait 100 refclk cycles @ 156.25MHz => 640ns */
994 udelay(1);
995 /* remove SerDes reset */
996 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
997 /* turn SPICO enable on */
998 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
999 clear_sbus_fast_mode(dd);
1000 release_hw_mutex(dd);
1001}
1002
1003/* Access to the SBus in this routine should probably be serialized */
1004int sbus_request_slow(struct hfi1_devdata *dd,
1005 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
1006{
1007 u64 reg, count = 0;
1008
1009 sbus_request(dd, receiver_addr, data_addr, command, data_in);
1010 write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
1011 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
1012 /* Wait for both DONE and RCV_DATA_VALID to go high */
1013 reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
1014 while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
1015 (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
1016 if (count++ >= SBUS_MAX_POLL_COUNT) {
1017 u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
1018 /*
1019 * If the loop has timed out, we are OK if DONE bit
1020 * is set and RCV_DATA_VALID and EXECUTE counters
1021 * are the same. If not, we cannot proceed.
1022 */
1023 if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
1024 (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
1025 SBUS_COUNTER(counts, EXECUTE)))
1026 break;
1027 return -ETIMEDOUT;
1028 }
1029 udelay(1);
1030 reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
1031 }
1032 count = 0;
1033 write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
1034 /* Wait for DONE to clear after EXECUTE is cleared */
1035 reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
1036 while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
1037 if (count++ >= SBUS_MAX_POLL_COUNT)
1038 return -ETIME;
1039 udelay(1);
1040 reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
1041 }
1042 return 0;
1043}
1044
1045static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
1046 struct firmware_details *fdet)
1047{
1048 int i, err;
1049 const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
1050
1051 dd_dev_info(dd, "Downloading fabric firmware\n");
1052
1053 /* step 1: load security variables */
1054 load_security_variables(dd, fdet);
1055 /* step 2: place SerDes in reset and disable SPICO */
1056 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
1057 /* wait 100 refclk cycles @ 156.25MHz => 640ns */
1058 udelay(1);
1059 /* step 3: remove SerDes reset */
1060 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
1061 /* step 4: assert IMEM override */
1062 sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
1063 /* step 5: download SerDes machine code */
1064 for (i = 0; i < fdet->firmware_len; i += 4) {
1065 sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
1066 *(u32 *)&fdet->firmware_ptr[i]);
1067 }
1068 /* step 6: IMEM override off */
1069 sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
1070 /* step 7: turn ECC on */
1071 sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
1072
1073 /* steps 8-11: run the RSA engine */
1074 err = run_rsa(dd, "fabric serdes", fdet->signature);
1075 if (err)
1076 return err;
1077
1078 /* step 12: turn SPICO enable on */
1079 sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
1080 /* step 13: enable core hardware interrupts */
1081 sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
1082
1083 return 0;
1084}
1085
1086static int load_sbus_firmware(struct hfi1_devdata *dd,
1087 struct firmware_details *fdet)
1088{
1089 int i, err;
1090 const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
1091
1092 dd_dev_info(dd, "Downloading SBus firmware\n");
1093
1094 /* step 1: load security variables */
1095 load_security_variables(dd, fdet);
1096 /* step 2: place SPICO into reset and enable off */
1097 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
1098 /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
1099 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
1100 /* step 4: set starting IMEM address for burst download */
1101 sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
1102 /* step 5: download the SBus Master machine code */
1103 for (i = 0; i < fdet->firmware_len; i += 4) {
1104 sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
1105 *(u32 *)&fdet->firmware_ptr[i]);
1106 }
1107 /* step 6: set IMEM_CNTL_EN off */
1108 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
1109 /* step 7: turn ECC on */
1110 sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
1111
1112 /* steps 8-11: run the RSA engine */
1113 err = run_rsa(dd, "SBus", fdet->signature);
1114 if (err)
1115 return err;
1116
1117 /* step 12: set SPICO_ENABLE on */
1118 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
1119
1120 return 0;
1121}
1122
1123static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
1124 struct firmware_details *fdet)
1125{
1126 int i;
1127 const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
1128
1129 dd_dev_info(dd, "Downloading PCIe firmware\n");
1130
1131 /* step 1: load security variables */
1132 load_security_variables(dd, fdet);
1133 /* step 2: assert single step (halts the SBus Master spico) */
1134 sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
1135 /* step 3: enable XDMEM access */
1136 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
1137 /* step 4: load firmware into SBus Master XDMEM */
1138 /* NOTE: the dmem address, write_en, and wdata are all pre-packed;
1139 we only need to pick up the bytes and write them */
1140 for (i = 0; i < fdet->firmware_len; i += 4) {
1141 sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
1142 *(u32 *)&fdet->firmware_ptr[i]);
1143 }
1144 /* step 5: disable XDMEM access */
1145 sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
1146 /* step 6: allow SBus Spico to run */
1147 sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
1148
1149 /* steps 7-11: run RSA, if it succeeds, firmware is available to
1150 be swapped */
1151 return run_rsa(dd, "PCIe serdes", fdet->signature);
1152}
1153
1154/*
1155 * Set the given broadcast values on the given list of devices.
1156 */
1157static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
1158 const u8 *addrs, int count)
1159{
1160 while (--count >= 0) {
1161 /*
1162 * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
1163 * defaults for everything else. Do not read-modify-write,
1164 * per instruction from the manufacturer.
1165 *
1166 * Register 0xfd:
1167 * bits what
1168 * ----- ---------------------------------
1169 * 0 IGNORE_BROADCAST (default 0)
1170 * 11:4 BROADCAST_GROUP_1 (default 0xff)
1171 * 23:16 BROADCAST_GROUP_2 (default 0xff)
1172 */
1173 sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
1174 (u32)bg1 << 4 | (u32)bg2 << 16);
1175 }
1176}
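Worked example: the fabric SerDes call in load_firmware() below passes bg1 = all_fabric_serdes_broadcast (0xe1) and bg2 = fabric_serdes_broadcast[0] (0xe4) for HFI 0, so the value written to register 0xfd of each listed SerDes works out to:

/*
 * ((u32)0xe1 << 4) | ((u32)0xe4 << 16) = 0x00e40e10
 *   bit  0     IGNORE_BROADCAST  = 0
 *   bits 11:4  BROADCAST_GROUP_1 = 0xe1
 *   bits 23:16 BROADCAST_GROUP_2 = 0xe4
 */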
1177
1178int acquire_hw_mutex(struct hfi1_devdata *dd)
1179{
1180 unsigned long timeout;
1181 int try = 0;
1182 u8 mask = 1 << dd->hfi1_id;
1183 u8 user;
1184
1185retry:
1186 timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
1187 while (1) {
1188 write_csr(dd, ASIC_CFG_MUTEX, mask);
1189 user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
1190 if (user == mask)
1191 return 0; /* success */
1192 if (time_after(jiffies, timeout))
1193 break; /* timed out */
1194 msleep(20);
1195 }
1196
1197 /* timed out */
1198 dd_dev_err(dd,
1199 "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
1200 (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
1201
1202 if (try == 0) {
1203 /* break mutex and retry */
1204 write_csr(dd, ASIC_CFG_MUTEX, 0);
1205 try++;
1206 goto retry;
1207 }
1208
1209 return -EBUSY;
1210}
1211
1212void release_hw_mutex(struct hfi1_devdata *dd)
1213{
1214 write_csr(dd, ASIC_CFG_MUTEX, 0);
1215}
1216
1217void set_sbus_fast_mode(struct hfi1_devdata *dd)
1218{
1219 write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
1220 ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
1221}
1222
1223void clear_sbus_fast_mode(struct hfi1_devdata *dd)
1224{
1225 u64 reg, count = 0;
1226
1227 reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
1228 while (SBUS_COUNTER(reg, EXECUTE) !=
1229 SBUS_COUNTER(reg, RCV_DATA_VALID)) {
1230 if (count++ >= SBUS_MAX_POLL_COUNT)
1231 break;
1232 udelay(1);
1233 reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
1234 }
1235 write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
1236}
1237
1238int load_firmware(struct hfi1_devdata *dd)
1239{
1240 int ret;
1241
1242 if (fw_sbus_load || fw_fabric_serdes_load) {
1243 ret = acquire_hw_mutex(dd);
1244 if (ret)
1245 return ret;
1246
1247 set_sbus_fast_mode(dd);
1248
1249 /*
1250 * The SBus contains part of the fabric firmware and so must
1251 * also be downloaded.
1252 */
1253 if (fw_sbus_load) {
1254 turn_off_spicos(dd, SPICO_SBUS);
1255 ret = load_sbus_firmware(dd, &fw_sbus);
1256 if (ret)
1257 goto clear;
1258 }
1259
1260 if (fw_fabric_serdes_load) {
1261 set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
1262 fabric_serdes_broadcast[dd->hfi1_id],
1263 fabric_serdes_addrs[dd->hfi1_id],
1264 NUM_FABRIC_SERDES);
1265 turn_off_spicos(dd, SPICO_FABRIC);
1266 ret = load_fabric_serdes_firmware(dd, &fw_fabric);
1267 }
1268
1269clear:
1270 clear_sbus_fast_mode(dd);
1271 release_hw_mutex(dd);
1272 if (ret)
1273 return ret;
1274 }
1275
1276 if (fw_8051_load) {
1277 ret = load_8051_firmware(dd, &fw_8051);
1278 if (ret)
1279 return ret;
1280 }
1281
1282 return 0;
1283}
1284
1285int hfi1_firmware_init(struct hfi1_devdata *dd)
1286{
1287 /* only RTL can use these */
1288 if (dd->icode != ICODE_RTL_SILICON) {
1289 fw_fabric_serdes_load = 0;
1290 fw_pcie_serdes_load = 0;
1291 fw_sbus_load = 0;
1292 }
1293
1294 /* no 8051 or QSFP on simulator */
1295 if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
1296 fw_8051_load = 0;
1297 platform_config_load = 0;
1298 }
1299
1300 if (!fw_8051_name) {
1301 if (dd->icode == ICODE_RTL_SILICON)
1302 fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
1303 else
1304 fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
1305 }
1306 if (!fw_fabric_serdes_name)
1307 fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
1308 if (!fw_sbus_name)
1309 fw_sbus_name = DEFAULT_FW_SBUS_NAME;
1310 if (!fw_pcie_serdes_name)
1311 fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
1312 if (!platform_config_name)
1313 platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
1314
1315 return obtain_firmware(dd);
1316}
1317
1318int parse_platform_config(struct hfi1_devdata *dd)
1319{
1320 struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
1321 u32 *ptr = NULL;
1322 u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0;
1323 u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
1324
1325 if (platform_config == NULL) {
1326 dd_dev_info(dd, "%s: Missing config file\n", __func__);
1327 goto bail;
1328 }
1329 ptr = (u32 *)platform_config->data;
1330
1331 magic_num = *ptr;
1332 ptr++;
1333 if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
1334 dd_dev_info(dd, "%s: Bad config file\n", __func__);
1335 goto bail;
1336 }
1337
1338 while (ptr < (u32 *)(platform_config->data + platform_config->size)) {
1339 header1 = *ptr;
1340 header2 = *(ptr + 1);
1341 if (header1 != ~header2) {
1342 dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
1343 __func__, (ptr - (u32 *)platform_config->data));
1344 goto bail;
1345 }
1346
1347 record_idx = *ptr &
1348 ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
1349
1350 table_length_dwords = (*ptr >>
1351 PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
1352 ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
1353
1354 table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
1355 ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
1356
1357 /* Done with this set of headers */
1358 ptr += 2;
1359
1360 if (record_idx) {
1361 /* data table */
1362 switch (table_type) {
1363 case PLATFORM_CONFIG_SYSTEM_TABLE:
1364 pcfgcache->config_tables[table_type].num_table =
1365 1;
1366 break;
1367 case PLATFORM_CONFIG_PORT_TABLE:
1368 pcfgcache->config_tables[table_type].num_table =
1369 2;
1370 break;
1371 case PLATFORM_CONFIG_RX_PRESET_TABLE:
1372 /* fall through */
1373 case PLATFORM_CONFIG_TX_PRESET_TABLE:
1374 /* fall through */
1375 case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
1376 /* fall through */
1377 case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
1378 pcfgcache->config_tables[table_type].num_table =
1379 table_length_dwords;
1380 break;
1381 default:
1382 dd_dev_info(dd,
1383 "%s: Unknown data table %d, offset %ld\n",
1384 __func__, table_type,
1385 (ptr - (u32 *)platform_config->data));
1386 goto bail; /* We don't trust this file now */
1387 }
1388 pcfgcache->config_tables[table_type].table = ptr;
1389 } else {
1390 /* metadata table */
1391 switch (table_type) {
1392 case PLATFORM_CONFIG_SYSTEM_TABLE:
1393 /* fall through */
1394 case PLATFORM_CONFIG_PORT_TABLE:
1395 /* fall through */
1396 case PLATFORM_CONFIG_RX_PRESET_TABLE:
1397 /* fall through */
1398 case PLATFORM_CONFIG_TX_PRESET_TABLE:
1399 /* fall through */
1400 case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
1401 /* fall through */
1402 case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
1403 break;
1404 default:
1405 dd_dev_info(dd,
1406 "%s: Unknown metadata table %d, offset %ld\n",
1407 __func__, table_type,
1408 (ptr - (u32 *)platform_config->data));
1409 goto bail; /* We don't trust this file now */
1410 }
1411 pcfgcache->config_tables[table_type].table_metadata =
1412 ptr;
1413 }
1414
1415 /* Calculate and check table crc */
1416 crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
1417 (table_length_dwords * 4));
1418 crc ^= ~(u32)0;
1419
1420 /* Jump the table */
1421 ptr += table_length_dwords;
1422 if (crc != *ptr) {
1423 dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
1424 __func__, (ptr - (u32 *)platform_config->data));
1425 goto bail;
1426 }
1427 /* Jump the CRC DWORD */
1428 ptr++;
1429 }
1430
1431 pcfgcache->cache_valid = 1;
1432 return 0;
1433bail:
1434 memset(pcfgcache, 0, sizeof(struct platform_config_cache));
1435 return -EINVAL;
1436}
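Pieced together from the checks above, the layout this parser expects from the platform configuration file is roughly:

/*
 * DWORD 0:  PLATFORM_CONFIG_MAGIC_NUM
 * then, repeated until the end of the file:
 *   DWORD:  header1 - record_idx, table_length_dwords and table_type
 *           packed as bit fields
 *   DWORD:  header2 == ~header1 (validation check)
 *   DWORDs: table payload, table_length_dwords long
 *           (record_idx != 0 -> data table, record_idx == 0 -> metadata)
 *   DWORD:  CRC of the payload (crc32_le seeded with ~0, final XOR ~0)
 */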
1437
1438static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
1439 int field, u32 *field_len_bits, u32 *field_start_bits)
1440{
1441 struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
1442 u32 *src_ptr = NULL;
1443
1444 if (!pcfgcache->cache_valid)
1445 return -EINVAL;
1446
1447 switch (table) {
1448 case PLATFORM_CONFIG_SYSTEM_TABLE:
1449 /* fall through */
1450 case PLATFORM_CONFIG_PORT_TABLE:
1451 /* fall through */
1452 case PLATFORM_CONFIG_RX_PRESET_TABLE:
1453 /* fall through */
1454 case PLATFORM_CONFIG_TX_PRESET_TABLE:
1455 /* fall through */
1456 case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
1457 /* fall through */
1458 case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
1459 if (field && field < platform_config_table_limits[table])
1460 src_ptr =
1461 pcfgcache->config_tables[table].table_metadata + field;
1462 break;
1463 default:
1464 dd_dev_info(dd, "%s: Unknown table\n", __func__);
1465 break;
1466 }
1467
1468 if (!src_ptr)
1469 return -EINVAL;
1470
1471 if (field_start_bits)
1472 *field_start_bits = *src_ptr &
1473 ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
1474
1475 if (field_len_bits)
1476 *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
1477 & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
1478
1479 return 0;
1480}
1481
1482/* This is the central interface to getting data out of the platform config
1483 * file. It depends on parse_platform_config() having populated the
1484 * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
1485 * validate the sanity of the cache.
1486 *
1487 * The non-obvious parameters:
1488 * @table_index: Acts as a look up key into which instance of the tables the
1489 * relevant field is fetched from.
1490 *
1491 * This applies to the data tables that have multiple instances. The port table
1492 * is an exception to this rule as each HFI only has one port and thus the
1493 * relevant table can be distinguished by hfi_id.
1494 *
1495 * @data: pointer to memory that will be populated with the field requested.
1496 * @len: length of memory pointed by @data in bytes.
1497 */
1498int get_platform_config_field(struct hfi1_devdata *dd,
1499 enum platform_config_table_type_encoding table_type,
1500 int table_index, int field_index, u32 *data, u32 len)
1501{
1502 int ret = 0, wlen = 0, seek = 0;
1503 u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
1504 struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
1505
1506 if (data)
1507 memset(data, 0, len);
1508 else
1509 return -EINVAL;
1510
1511 ret = get_platform_fw_field_metadata(dd, table_type, field_index,
1512 &field_len_bits, &field_start_bits);
1513 if (ret)
1514 return -EINVAL;
1515
1516 /* Convert length to bits */
1517 len *= 8;
1518
1519 /* Our metadata function checked cache_valid and field_index for us */
1520 switch (table_type) {
1521 case PLATFORM_CONFIG_SYSTEM_TABLE:
1522 src_ptr = pcfgcache->config_tables[table_type].table;
1523
1524 if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
1525 if (len < field_len_bits)
1526 return -EINVAL;
1527
1528 seek = field_start_bits/8;
1529 wlen = field_len_bits/8;
1530
1531 src_ptr = (u32 *)((u8 *)src_ptr + seek);
1532
1533 /* We expect the field to be byte aligned and a whole number
1534 * of bytes long if we are here */
1535 memcpy(data, src_ptr, wlen);
1536 return 0;
1537 }
1538 break;
1539 case PLATFORM_CONFIG_PORT_TABLE:
1540 /* Port table is 4 DWORDS in META_VERSION 0 */
1541 src_ptr = dd->hfi1_id ?
1542 pcfgcache->config_tables[table_type].table + 4 :
1543 pcfgcache->config_tables[table_type].table;
1544 break;
1545 case PLATFORM_CONFIG_RX_PRESET_TABLE:
1546 /* fall through */
1547 case PLATFORM_CONFIG_TX_PRESET_TABLE:
1548 /* fall through */
1549 case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
1550 /* fall through */
1551 case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
1552 src_ptr = pcfgcache->config_tables[table_type].table;
1553
1554 if (table_index <
1555 pcfgcache->config_tables[table_type].num_table)
1556 src_ptr += table_index;
1557 else
1558 src_ptr = NULL;
1559 break;
1560 default:
1561 dd_dev_info(dd, "%s: Unknown table\n", __func__);
1562 break;
1563 }
1564
1565 if (!src_ptr || len < field_len_bits)
1566 return -EINVAL;
1567
1568 src_ptr += (field_start_bits/32);
1569 *data = (*src_ptr >> (field_start_bits % 32)) &
1570 ((1 << field_len_bits) - 1);
1571
1572 return 0;
1573}
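A minimal caller sketch, assuming a per-port field index such as PORT_TABLE_PORT_TYPE from platform_config.h (any valid field index for the chosen table is used the same way); for the port table the table_index argument is ignored, so 0 is passed:

	u32 port_type = 0;
	int ret;

	ret = get_platform_config_field(dd, PLATFORM_CONFIG_PORT_TABLE, 0,
					PORT_TABLE_PORT_TYPE, &port_type,
					sizeof(port_type));
	if (ret)
		dd_dev_info(dd, "port type not found in platform config\n");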
1574
1575/*
1576 * Download the firmware needed for the Gen3 PCIe SerDes. An update
1577 * to the SBus firmware is needed before updating the PCIe firmware.
1578 *
1579 * Note: caller must be holding the HW mutex.
1580 */
1581int load_pcie_firmware(struct hfi1_devdata *dd)
1582{
1583 int ret = 0;
1584
1585 /* both firmware loads below use the SBus */
1586 set_sbus_fast_mode(dd);
1587
1588 if (fw_sbus_load) {
1589 turn_off_spicos(dd, SPICO_SBUS);
1590 ret = load_sbus_firmware(dd, &fw_sbus);
1591 if (ret)
1592 goto done;
1593 }
1594
1595 if (fw_pcie_serdes_load) {
1596 dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
1597 set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
1598 pcie_serdes_broadcast[dd->hfi1_id],
1599 pcie_serdes_addrs[dd->hfi1_id],
1600 NUM_PCIE_SERDES);
1601 ret = load_pcie_serdes_firmware(dd, &fw_pcie);
1602 if (ret)
1603 goto done;
1604 }
1605
1606done:
1607 clear_sbus_fast_mode(dd);
1608
1609 return ret;
1610}
1611
1612/*
1613 * Read the GUID from the hardware, store it in dd.
1614 */
1615void read_guid(struct hfi1_devdata *dd)
1616{
1617 dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
1618 dd_dev_info(dd, "GUID %llx",
1619 (unsigned long long)dd->base_guid);
1620}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
new file mode 100644
index 000000000000..8ca171bf3e36
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -0,0 +1,1821 @@
1#ifndef _HFI1_KERNEL_H
2#define _HFI1_KERNEL_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53#include <linux/interrupt.h>
54#include <linux/pci.h>
55#include <linux/dma-mapping.h>
56#include <linux/mutex.h>
57#include <linux/list.h>
58#include <linux/scatterlist.h>
59#include <linux/slab.h>
60#include <linux/io.h>
61#include <linux/fs.h>
62#include <linux/completion.h>
63#include <linux/kref.h>
64#include <linux/sched.h>
65#include <linux/cdev.h>
66#include <linux/delay.h>
67#include <linux/kthread.h>
68
69#include "chip_registers.h"
70#include "common.h"
71#include "verbs.h"
72#include "pio.h"
73#include "chip.h"
74#include "mad.h"
75#include "qsfp.h"
76#include "platform_config.h"
77
78/* bumped 1 from s/w major version of TrueScale */
79#define HFI1_CHIP_VERS_MAJ 3U
80
81/* don't care about this except printing */
82#define HFI1_CHIP_VERS_MIN 0U
83
84/* The Organization Unique Identifier (Mfg code), and its position in GUID */
85#define HFI1_OUI 0x001175
86#define HFI1_OUI_LSB 40
87
88#define DROP_PACKET_OFF 0
89#define DROP_PACKET_ON 1
90
91extern unsigned long hfi1_cap_mask;
92#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
93#define HFI1_CAP_UGET_MASK(mask, cap) \
94 (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
95#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
96#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
97#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
98#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
99#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
100 HFI1_CAP_MISC_MASK)
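A minimal usage sketch, assuming a capability bit named HFI1_CAP_SDMA is defined with the rest of the capability flags (the bit definitions themselves live in common.h):

	/* SDMA stands in for any HFI1_CAP_* flag; the macro pastes the prefix */
	if (HFI1_CAP_IS_KSET(SDMA))
		dd_dev_info(dd, "SDMA capability enabled\n");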
101
102/*
103 * Per-driver stats: either neither device- nor port-specific, or
104 * summed over all of the devices and ports.
105 * They are described by name via the ipathfs filesystem, so the layout
106 * and number of elements can change without breaking compatibility.
107 * If members are added or deleted, hfi1_statnames[] in debugfs.c must
108 * change to match.
109 */
110struct hfi1_ib_stats {
111 __u64 sps_ints; /* number of interrupts handled */
112 __u64 sps_errints; /* number of error interrupts */
113 __u64 sps_txerrs; /* tx-related packet errors */
114 __u64 sps_rcverrs; /* non-crc rcv packet errors */
115 __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
116 __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
117 __u64 sps_ctxts; /* number of contexts currently open */
118 __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
119 __u64 sps_buffull;
120 __u64 sps_hdrfull;
121};
122
123extern struct hfi1_ib_stats hfi1_stats;
124extern const struct pci_error_handlers hfi1_pci_err_handler;
125
126/*
127 * First-cut criterion for "device is active" is
128 * two thousand dwords combined Tx, Rx traffic per
129 * 5-second interval. SMA packets are 64 dwords,
130 * and occur "a few per second", presumably each way.
131 */
132#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
133
134/*
135 * Below contains all data related to a single context (formerly called port).
136 */
137
138#ifdef CONFIG_DEBUG_FS
139struct hfi1_opcode_stats_perctx;
140#endif
141
142/*
143 * struct ps_state keeps state associated with RX queue "prescanning"
144 * (prescanning for FECNs, and BECNs), if prescanning is in use.
145 */
146struct ps_state {
147 u32 ps_head;
148 int initialized;
149};
150
151struct ctxt_eager_bufs {
152 ssize_t size; /* total size of eager buffers */
153 u32 count; /* size of buffers array */
154 u32 numbufs; /* number of buffers allocated */
155 u32 alloced; /* number of rcvarray entries used */
156 u32 rcvtid_size; /* size of each eager rcv tid */
157 u32 threshold; /* head update threshold */
158 struct eager_buffer {
159 void *addr;
160 dma_addr_t phys;
161 ssize_t len;
162 } *buffers;
163 struct {
164 void *addr;
165 dma_addr_t phys;
166 } *rcvtids;
167};
168
169struct hfi1_ctxtdata {
170 /* shadow the ctxt's RcvCtrl register */
171 u64 rcvctrl;
172 /* rcvhdrq base, needs mmap before useful */
173 void *rcvhdrq;
174 /* kernel virtual address where hdrqtail is updated */
175 volatile __le64 *rcvhdrtail_kvaddr;
176 /*
177 * Shared page for kernel to signal user processes that send buffers
178 * need disarming. The process should call HFI1_CMD_DISARM_BUFS
179 * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
180 */
181 unsigned long *user_event_mask;
182 /* when waiting for rcv or pioavail */
183 wait_queue_head_t wait;
184 /* rcvhdrq size (for freeing) */
185 size_t rcvhdrq_size;
186 /* number of rcvhdrq entries */
187 u16 rcvhdrq_cnt;
188 /* size of each of the rcvhdrq entries */
189 u16 rcvhdrqentsize;
190 /* mmap of hdrq, must fit in 44 bits */
191 dma_addr_t rcvhdrq_phys;
192 dma_addr_t rcvhdrqtailaddr_phys;
193 struct ctxt_eager_bufs egrbufs;
194 /* this receive context's assigned PIO ACK send context */
195 struct send_context *sc;
196
197 /* dynamic receive available interrupt timeout */
198 u32 rcvavail_timeout;
199 /*
200 * number of opens (including slave sub-contexts) on this instance
201 * (ignoring forks, dup, etc. for now)
202 */
203 int cnt;
204 /*
205 * how much space to leave at start of eager TID entries for
206 * protocol use, on each TID
207 */
208 /* instead of calculating it */
209 unsigned ctxt;
210 /* non-zero if ctxt is being shared. */
211 u16 subctxt_cnt;
212 /* non-zero if ctxt is being shared. */
213 u16 subctxt_id;
214 u8 uuid[16];
215 /* job key */
216 u16 jkey;
217 /* number of RcvArray groups for this context. */
218 u32 rcv_array_groups;
219 /* index of first eager TID entry. */
220 u32 eager_base;
221 /* number of expected TID entries */
222 u32 expected_count;
223 /* index of first expected TID entry. */
224 u32 expected_base;
225 /* cursor into the exp group sets */
226 atomic_t tidcursor;
227 /* number of exp TID groups assigned to the ctxt */
228 u16 numtidgroups;
229 /* size of exp TID group fields in tidusemap */
230 u16 tidmapcnt;
231 /* exp TID group usage bitfield array */
232 unsigned long *tidusemap;
233 /* pinned pages for exp sends, allocated at open */
234 struct page **tid_pg_list;
235 /* dma handles for exp tid pages */
236 dma_addr_t *physshadow;
237 /* lock protecting all Expected TID data */
238 spinlock_t exp_lock;
239 /* number of pio bufs for this ctxt (all procs, if shared) */
240 u32 piocnt;
241 /* first pio buffer for this ctxt */
242 u32 pio_base;
243 /* chip offset of PIO buffers for this ctxt */
244 u32 piobufs;
245 /* per-context configuration flags */
246 u16 flags;
247 /* per-context event flags for fileops/intr communication */
248 unsigned long event_flags;
249 /* WAIT_RCV that timed out, no interrupt */
250 u32 rcvwait_to;
251 /* WAIT_PIO that timed out, no interrupt */
252 u32 piowait_to;
253 /* WAIT_RCV already happened, no wait */
254 u32 rcvnowait;
255 /* WAIT_PIO already happened, no wait */
256 u32 pionowait;
257 /* total number of polled urgent packets */
258 u32 urgent;
259 /* saved total number of polled urgent packets for poll edge trigger */
260 u32 urgent_poll;
261 /* pid of process using this ctxt */
262 pid_t pid;
263 pid_t subpid[HFI1_MAX_SHARED_CTXTS];
264 /* same size as task_struct .comm[], command that opened context */
265 char comm[16];
266 /* so file ops can get at unit */
267 struct hfi1_devdata *dd;
268 /* so functions that need physical port can get it easily */
269 struct hfi1_pportdata *ppd;
270 /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
271 void *subctxt_uregbase;
272 /* An array of pages for the eager receive buffers * N */
273 void *subctxt_rcvegrbuf;
274 /* An array of pages for the eager header queue entries * N */
275 void *subctxt_rcvhdr_base;
276 /* The version of the library which opened this ctxt */
277 u32 userversion;
278 /* Bitmask of active slaves */
279 u32 active_slaves;
280 /* Type of packets or conditions we want to poll for */
281 u16 poll_type;
282 /* receive packet sequence counter */
283 u8 seq_cnt;
284 u8 redirect_seq_cnt;
285 /* ctxt rcvhdrq head offset */
286 u32 head;
287 u32 pkt_count;
288 /* QPs waiting for context processing */
289 struct list_head qp_wait_list;
290 /* interrupt handling */
291 u64 imask; /* clear interrupt mask */
292 int ireg; /* clear interrupt register */
293 unsigned numa_id; /* numa node of this context */
294 /* verbs stats per CTX */
295 struct hfi1_opcode_stats_perctx *opstats;
296 /*
297 * This is the kernel thread that will keep making
298 * progress on the user sdma requests behind the scenes.
299 * There is one per context (shared contexts use the master's).
300 */
301 struct task_struct *progress;
302 struct list_head sdma_queues;
303 spinlock_t sdma_qlock;
304
305#ifdef CONFIG_PRESCAN_RXQ
306 struct ps_state ps_state;
307#endif /* CONFIG_PRESCAN_RXQ */
308
309 /*
310 * The interrupt handler for a particular receive context can vary
311	 * throughout its lifetime. This is not a lock-protected data member,
312	 * so it must be updated atomically and the previous and new values
313	 * must always be valid. Worst case is we process an extra interrupt
314	 * and up to 64 packets with the wrong interrupt handler.
315 */
316 void (*do_interrupt)(struct hfi1_ctxtdata *rcd);
317};
318
319/*
320 * Represents a single packet at a high level. Put commonly computed things in
321 * here so we do not have to keep doing them over and over. The rule of thumb is
322 * if something is used one time to derive some value, store that something in
323 * here. If it is used multiple times, then store the result of that derivation
324 * in here.
325 */
326struct hfi1_packet {
327 void *ebuf;
328 void *hdr;
329 struct hfi1_ctxtdata *rcd;
330 __le32 *rhf_addr;
331 struct hfi1_qp *qp;
332 struct hfi1_other_headers *ohdr;
333 u64 rhf;
334 u32 maxcnt;
335 u32 rhqoff;
336 u32 hdrqtail;
337 int numpkt;
338 u16 tlen;
339 u16 hlen;
340 s16 etail;
341 u16 rsize;
342 u8 updegr;
343 u8 rcv_flags;
344 u8 etype;
345};
346
347static inline bool has_sc4_bit(struct hfi1_packet *p)
348{
349 return !!rhf_dc_info(p->rhf);
350}
351
352/*
353 * Private data for snoop/capture support.
354 */
355struct hfi1_snoop_data {
356 int mode_flag;
357 struct cdev cdev;
358 struct device *class_dev;
359 spinlock_t snoop_lock;
360 struct list_head queue;
361 wait_queue_head_t waitq;
362 void *filter_value;
363 int (*filter_callback)(void *hdr, void *data, void *value);
364 u64 dcc_cfg; /* saved value of DCC Cfg register */
365};
366
367/* snoop mode_flag values */
368#define HFI1_PORT_SNOOP_MODE 1U
369#define HFI1_PORT_CAPTURE_MODE 2U
370
371struct hfi1_sge_state;
372
373/*
374 * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
375 * Mostly for MADs that set or query link parameters, also ipath
376 * config interfaces
377 */
378#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
379#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
380#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
381#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
382#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
383#define HFI1_IB_CFG_SPD 5 /* current Link spd */
384#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
385#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
386#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
387#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
388#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
389#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
390#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
391#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
392#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
393#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
394#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
395#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
396#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
397#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
398#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
399
400/*
401 * HFI or Host Link States
402 *
403 * These describe the states the driver thinks the logical and physical
404 * states are in. Used as an argument to set_link_state(). Implemented
405 * as bits for easy multi-state checking. The actual state can only be
406 * one.
407 */
408#define __HLS_UP_INIT_BP 0
409#define __HLS_UP_ARMED_BP 1
410#define __HLS_UP_ACTIVE_BP 2
411#define __HLS_DN_DOWNDEF_BP 3 /* link down default */
412#define __HLS_DN_POLL_BP 4
413#define __HLS_DN_DISABLE_BP 5
414#define __HLS_DN_OFFLINE_BP 6
415#define __HLS_VERIFY_CAP_BP 7
416#define __HLS_GOING_UP_BP 8
417#define __HLS_GOING_OFFLINE_BP 9
418#define __HLS_LINK_COOLDOWN_BP 10
419
420#define HLS_UP_INIT (1 << __HLS_UP_INIT_BP)
421#define HLS_UP_ARMED (1 << __HLS_UP_ARMED_BP)
422#define HLS_UP_ACTIVE (1 << __HLS_UP_ACTIVE_BP)
423#define HLS_DN_DOWNDEF (1 << __HLS_DN_DOWNDEF_BP) /* link down default */
424#define HLS_DN_POLL (1 << __HLS_DN_POLL_BP)
425#define HLS_DN_DISABLE (1 << __HLS_DN_DISABLE_BP)
426#define HLS_DN_OFFLINE (1 << __HLS_DN_OFFLINE_BP)
427#define HLS_VERIFY_CAP (1 << __HLS_VERIFY_CAP_BP)
428#define HLS_GOING_UP (1 << __HLS_GOING_UP_BP)
429#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP)
430#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP)
431
432#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
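/*
 * Editor's note (illustrative sketch, not part of the driver): because
 * each state above is a distinct bit, several states can be tested with
 * a single mask operation against ppd->host_link_state, e.g.
 *
 *	if (ppd->host_link_state & HLS_UP)
 *		handle_any_up_state(ppd);	(hypothetical helper)
 */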
433
434/* use this MTU size if none other is given */
435#define HFI1_DEFAULT_ACTIVE_MTU 8192
436/* use this MTU size as the default maximum */
437#define HFI1_DEFAULT_MAX_MTU 8192
438/* default partition key */
439#define DEFAULT_PKEY 0xffff
440
441/*
442 * Possible fabric manager config parameters for fm_{get,set}_table()
443 */
444#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */
445#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */
446#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */
447#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */
448#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */
449#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */
450
451/*
452 * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
453 * these are bits so they can be combined, e.g.
454 * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
455 */
456#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
457#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
458#define HFI1_RCVCTRL_CTXT_ENB 0x04
459#define HFI1_RCVCTRL_CTXT_DIS 0x08
460#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
461#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
462#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */
463#define HFI1_RCVCTRL_PKEY_DIS 0x80
464#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
465#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
466#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
467#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
468#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
469#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
470#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
471#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
472
473/* partition enforcement flags */
474#define HFI1_PART_ENFORCE_IN 0x1
475#define HFI1_PART_ENFORCE_OUT 0x2
476
477/* how often we check for synthetic counter wrap around */
478#define SYNTH_CNT_TIME 2
479
480/* Counter flags */
481#define CNTR_NORMAL 0x0 /* Normal counters, just read register */
482#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */
483#define CNTR_DISABLED 0x2 /* Disable this counter */
484#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */
485#define CNTR_VL 0x8 /* Per VL counter */
486#define CNTR_INVALID_VL -1 /* Specifies invalid VL */
487#define CNTR_MODE_W 0x0
488#define CNTR_MODE_R 0x1
489
490/* VLs Supported/Operational */
491#define HFI1_MIN_VLS_SUPPORTED 1
492#define HFI1_MAX_VLS_SUPPORTED 8
493
494static inline void incr_cntr64(u64 *cntr)
495{
496 if (*cntr < (u64)-1LL)
497 (*cntr)++;
498}
499
500static inline void incr_cntr32(u32 *cntr)
501{
502 if (*cntr < (u32)-1LL)
503 (*cntr)++;
504}
505
506#define MAX_NAME_SIZE 64
507struct hfi1_msix_entry {
508 struct msix_entry msix;
509 void *arg;
510 char name[MAX_NAME_SIZE];
511 cpumask_var_t mask;
512};
513
514/* per-SL CCA information */
515struct cca_timer {
516 struct hrtimer hrtimer;
517 struct hfi1_pportdata *ppd; /* read-only */
518 int sl; /* read-only */
519 u16 ccti; /* read/write - current value of CCTI */
520};
521
522struct link_down_reason {
523 /*
524 * SMA-facing value. Should be set from .latest when
525 * HLS_UP_* -> HLS_DN_* transition actually occurs.
526 */
527 u8 sma;
528 u8 latest;
529};
530
531enum {
532 LO_PRIO_TABLE,
533 HI_PRIO_TABLE,
534 MAX_PRIO_TABLE
535};
536
537struct vl_arb_cache {
538 spinlock_t lock;
539 struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
540};
541
542/*
543 * The structure below encapsulates data relevant to a physical IB Port.
544 * Current chips support only one such port, but the separation
545 * clarifies things a bit. Note that to conform to IB conventions,
546 * port-numbers are one-based. The first or only port is port1.
547 */
548struct hfi1_pportdata {
549 struct hfi1_ibport ibport_data;
550
551 struct hfi1_devdata *dd;
552 struct kobject pport_cc_kobj;
553 struct kobject sc2vl_kobj;
554 struct kobject sl2sc_kobj;
555 struct kobject vl2mtu_kobj;
556
557 /* QSFP support */
558 struct qsfp_data qsfp_info;
559
560 /* GUID for this interface, in host order */
561 u64 guid;
562 /* GUID for peer interface, in host order */
563 u64 neighbor_guid;
564
565 /* up or down physical link state */
566 u32 linkup;
567
568 /*
569 * this address is mapped read-only into user processes so they can
570 * get status cheaply, whenever they want. One qword of status per port
571 */
572 u64 *statusp;
573
574 /* SendDMA related entries */
575
576 struct workqueue_struct *hfi1_wq;
577
578 /* move out of interrupt context */
579 struct work_struct link_vc_work;
580 struct work_struct link_up_work;
581 struct work_struct link_down_work;
582 struct work_struct sma_message_work;
583 struct work_struct freeze_work;
584 struct work_struct link_downgrade_work;
585 struct work_struct link_bounce_work;
586 /* host link state variables */
587 struct mutex hls_lock;
588 u32 host_link_state;
589
590 spinlock_t sdma_alllock ____cacheline_aligned_in_smp;
591
592 u32 lstate; /* logical link state */
593
594 /* these are the "32 bit" regs */
595
596 u32 ibmtu; /* The MTU programmed for this unit */
597 /*
598 * Current max size IB packet (in bytes) including IB headers, that
599 * we can send. Changes when ibmtu changes.
600 */
601 u32 ibmaxlen;
602 u32 current_egress_rate; /* units [10^6 bits/sec] */
603 /* LID programmed for this instance */
604 u16 lid;
605 /* list of pkeys programmed; 0 if not set */
606 u16 pkeys[MAX_PKEY_VALUES];
607 u16 link_width_supported;
608 u16 link_width_downgrade_supported;
609 u16 link_speed_supported;
610 u16 link_width_enabled;
611 u16 link_width_downgrade_enabled;
612 u16 link_speed_enabled;
613 u16 link_width_active;
614 u16 link_width_downgrade_tx_active;
615 u16 link_width_downgrade_rx_active;
616 u16 link_speed_active;
617 u8 vls_supported;
618 u8 vls_operational;
619 /* LID mask control */
620 u8 lmc;
621 /* Rx Polarity inversion (compensate for ~tx on partner) */
622 u8 rx_pol_inv;
623
624 u8 hw_pidx; /* physical port index */
625 u8 port; /* IB port number and index into dd->pports - 1 */
626 /* type of neighbor node */
627 u8 neighbor_type;
628 u8 neighbor_normal;
629 u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
630 u8 neighbor_port_number;
631 u8 is_sm_config_started;
632 u8 offline_disabled_reason;
633 u8 is_active_optimize_enabled;
634 u8 driver_link_ready; /* driver ready for active link */
635 u8 link_enabled; /* link enabled? */
636 u8 linkinit_reason;
637 u8 local_tx_rate; /* rate given to 8051 firmware */
638
639 /* placeholders for IB MAD packet settings */
640 u8 overrun_threshold;
641 u8 phy_error_threshold;
642
643 /* used to override LED behavior */
644 u8 led_override; /* Substituted for normal value, if non-zero */
645 u16 led_override_timeoff; /* delta to next timer event */
646 u8 led_override_vals[2]; /* Alternates per blink-frame */
647 u8 led_override_phase; /* Just counts, LSB picks from vals[] */
648 atomic_t led_override_timer_active;
649 /* Used to flash LEDs in override mode */
650 struct timer_list led_override_timer;
651 u32 sm_trap_qp;
652 u32 sa_qp;
653
654 /*
655 * cca_timer_lock protects access to the per-SL cca_timer
656 * structures (specifically the ccti member).
657 */
658 spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
659 struct cca_timer cca_timer[OPA_MAX_SLS];
660
661 /* List of congestion control table entries */
662 struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
663
664 /* congestion entries, each entry corresponding to a SL */
665 struct opa_congestion_setting_entry_shadow
666 congestion_entries[OPA_MAX_SLS];
667
668 /*
669 * cc_state_lock protects (write) access to the per-port
670 * struct cc_state.
671 */
672 spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
673
674 struct cc_state __rcu *cc_state;
675
676 /* Total number of congestion control table entries */
677 u16 total_cct_entry;
678
679 /* Bit map identifying service level */
680 u32 cc_sl_control_map;
681
682 /* CA's max number of 64 entry units in the congestion control table */
683 u8 cc_max_table_entries;
684
685 /* begin congestion log related entries
686 * cc_log_lock protects all congestion log related data */
687 spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
688 u8 threshold_cong_event_map[OPA_MAX_SLS/8];
689 u16 threshold_event_counter;
690 struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
691 int cc_log_idx; /* index for logging events */
692 int cc_mad_idx; /* index for reporting events */
693 /* end congestion log related entries */
694
695 struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
696
697 /* port relative counter buffer */
698 u64 *cntrs;
699 /* port relative synthetic counter buffer */
700 u64 *scntrs;
701 /* we synthesize port_xmit_discards from several egress errors */
702 u64 port_xmit_discards;
703 u64 port_xmit_constraint_errors;
704 u64 port_rcv_constraint_errors;
705 /* count of 'link_err' interrupts from DC */
706 u64 link_downed;
707 /* number of times link retrained successfully */
708 u64 link_up;
709 /* port_ltp_crc_mode is returned in 'portinfo' MADs */
710 u16 port_ltp_crc_mode;
711 /* port_crc_mode_enabled is the crc we support */
712 u8 port_crc_mode_enabled;
713 /* mgmt_allowed is also returned in 'portinfo' MADs */
714 u8 mgmt_allowed;
715 u8 part_enforce; /* partition enforcement flags */
716 struct link_down_reason local_link_down_reason;
717 struct link_down_reason neigh_link_down_reason;
718	/* Value to be sent to link peer on LinkDown. */
719 u8 remote_link_down_reason;
720 /* Error events that will cause a port bounce. */
721 u32 port_error_action;
722};
723
724typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
725
726typedef void (*opcode_handler)(struct hfi1_packet *packet);
727
728/* return values for the RHF receive functions */
729#define RHF_RCV_CONTINUE 0 /* keep going */
730#define RHF_RCV_DONE 1 /* stop, this packet processed */
731#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */
732
733struct rcv_array_data {
734 u8 group_size;
735 u16 ngroups;
736 u16 nctxt_extra;
737};
738
739struct per_vl_data {
740 u16 mtu;
741 struct send_context *sc;
742};
743
744/* 16 to directly index */
745#define PER_VL_SEND_CONTEXTS 16
746
747struct err_info_rcvport {
748 u8 status_and_code;
749 u64 packet_flit1;
750 u64 packet_flit2;
751};
752
753struct err_info_constraint {
754 u8 status;
755 u16 pkey;
756 u32 slid;
757};
758
759struct hfi1_temp {
760 unsigned int curr; /* current temperature */
761 unsigned int lo_lim; /* low temperature limit */
762 unsigned int hi_lim; /* high temperature limit */
763 unsigned int crit_lim; /* critical temperature limit */
764 u8 triggers; /* temperature triggers */
765};
766
767/* device data struct now contains only "general per-device" info.
768 * fields related to a physical IB port are in a hfi1_pportdata struct.
769 */
770struct sdma_engine;
771struct sdma_vl_map;
772
773#define BOARD_VERS_MAX 96 /* how long the version string can be */
774#define SERIAL_MAX 16 /* length of the serial number */
775
776struct hfi1_devdata {
777 struct hfi1_ibdev verbs_dev; /* must be first */
778 struct list_head list;
779 /* pointers to related structs for this device */
780 /* pci access data structure */
781 struct pci_dev *pcidev;
782 struct cdev user_cdev;
783 struct cdev diag_cdev;
784 struct cdev ui_cdev;
785 struct device *user_device;
786 struct device *diag_device;
787 struct device *ui_device;
788
789 /* mem-mapped pointer to base of chip regs */
790 u8 __iomem *kregbase;
791 /* end of mem-mapped chip space excluding sendbuf and user regs */
792 u8 __iomem *kregend;
793 /* physical address of chip for io_remap, etc. */
794 resource_size_t physaddr;
795 /* receive context data */
796 struct hfi1_ctxtdata **rcd;
797 /* send context data */
798 struct send_context_info *send_contexts;
799 /* map hardware send contexts to software index */
800 u8 *hw_to_sw;
801 /* spinlock for allocating and releasing send context resources */
802 spinlock_t sc_lock;
803 /* Per VL data. Enough for all VLs but not all elements are set/used. */
804 struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
805 /* seqlock for sc2vl */
806 seqlock_t sc2vl_lock;
807 u64 sc2vl[4];
808 /* Send Context initialization lock. */
809 spinlock_t sc_init_lock;
810
811 /* fields common to all SDMA engines */
812
813 /* default flags to last descriptor */
814 u64 default_desc1;
815 volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */
816 dma_addr_t sdma_heads_phys;
817 void *sdma_pad_dma; /* DMA'ed by chip */
818 dma_addr_t sdma_pad_phys;
819 /* for deallocation */
820 size_t sdma_heads_size;
821 /* number from the chip */
822 u32 chip_sdma_engines;
823 /* num used */
824 u32 num_sdma;
825 /* lock for sdma_map */
826 spinlock_t sde_map_lock;
827 /* array of engines sized by num_sdma */
828 struct sdma_engine *per_sdma;
829 /* array of vl maps */
830 struct sdma_vl_map __rcu *sdma_map;
831 /* SPC freeze waitqueue and variable */
832 wait_queue_head_t sdma_unfreeze_wq;
833 atomic_t sdma_unfreeze_count;
834
835
836 /* hfi1_pportdata, points to array of (physical) port-specific
837 * data structs, indexed by pidx (0..n-1)
838 */
839 struct hfi1_pportdata *pport;
840
841 /* mem-mapped pointer to base of PIO buffers */
842 void __iomem *piobase;
843 /*
844 * write-combining mem-mapped pointer to base of RcvArray
845 * memory.
846 */
847 void __iomem *rcvarray_wc;
848 /*
849	 * credit return base - a per-NUMA range of DMA addresses that
850 * the chip will use to update the per-context free counter
851 */
852 struct credit_return_base *cr_base;
853
854 /* send context numbers and sizes for each type */
855 struct sc_config_sizes sc_sizes[SC_MAX];
856
857 u32 lcb_access_count; /* count of LCB users */
858
859 char *boardname; /* human readable board info */
860
861 /* device (not port) flags, basically device capabilities */
862 u32 flags;
863
864 /* reset value */
865 u64 z_int_counter;
866 u64 z_rcv_limit;
867 /* percpu int_counter */
868 u64 __percpu *int_counter;
869 u64 __percpu *rcv_limit;
870
871 /* number of receive contexts in use by the driver */
872 u32 num_rcv_contexts;
873 /* number of pio send contexts in use by the driver */
874 u32 num_send_contexts;
875 /*
876 * number of ctxts available for PSM open
877 */
878 u32 freectxts;
879 /* base receive interrupt timeout, in CSR units */
880 u32 rcv_intr_timeout_csr;
881
882 u64 __iomem *egrtidbase;
883 spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
884 spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
885 /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
886 spinlock_t uctxt_lock; /* rcd and user context changes */
887 /* exclusive access to 8051 */
888 spinlock_t dc8051_lock;
889 /* exclusive access to 8051 memory */
890 spinlock_t dc8051_memlock;
891 int dc8051_timed_out; /* remember if the 8051 timed out */
892 /*
893 * A page that will hold event notification bitmaps for all
894 * contexts. This page will be mapped into all processes.
895 */
896 unsigned long *events;
897 /*
898 * per unit status, see also portdata statusp
899 * mapped read-only into user processes so they can get unit and
900 * IB link status cheaply
901 */
902 struct hfi1_status *status;
903 u32 freezelen; /* max length of freezemsg */
904
905 /* revision register shadow */
906 u64 revision;
907 /* Base GUID for device (network order) */
908 u64 base_guid;
909
910 /* these are the "32 bit" regs */
911
912 /* value we put in kr_rcvhdrsize */
913 u32 rcvhdrsize;
914 /* number of receive contexts the chip supports */
915 u32 chip_rcv_contexts;
916 /* number of receive array entries */
917 u32 chip_rcv_array_count;
918 /* number of PIO send contexts the chip supports */
919 u32 chip_send_contexts;
920 /* number of bytes in the PIO memory buffer */
921 u32 chip_pio_mem_size;
922 /* number of bytes in the SDMA memory buffer */
923 u32 chip_sdma_mem_size;
924
925 /* size of each rcvegrbuffer */
926 u32 rcvegrbufsize;
927 /* log2 of above */
928 u16 rcvegrbufsize_shift;
929 /* both sides of the PCIe link are gen3 capable */
930 u8 link_gen3_capable;
931	/* localbus width (1, 2, 4, 8, 16, 32) from config space */
932 u32 lbus_width;
933 /* localbus speed in MHz */
934 u32 lbus_speed;
935 int unit; /* unit # of this chip */
936 int node; /* home node of this chip */
937
938 /* save these PCI fields to restore after a reset */
939 u32 pcibar0;
940 u32 pcibar1;
941 u32 pci_rom;
942 u16 pci_command;
943 u16 pcie_devctl;
944 u16 pcie_lnkctl;
945 u16 pcie_devctl2;
946 u32 pci_msix0;
947 u32 pci_lnkctl3;
948 u32 pci_tph2;
949
950 /*
951 * ASCII serial number, from flash, large enough for original
952 * all digit strings, and longer serial number format
953 */
954 u8 serial[SERIAL_MAX];
955 /* human readable board version */
956 u8 boardversion[BOARD_VERS_MAX];
957 u8 lbus_info[32]; /* human readable localbus info */
958 /* chip major rev, from CceRevision */
959 u8 majrev;
960 /* chip minor rev, from CceRevision */
961 u8 minrev;
962 /* hardware ID */
963 u8 hfi1_id;
964 /* implementation code */
965 u8 icode;
966 /* default link down value (poll/sleep) */
967 u8 link_default;
968 /* vAU of this device */
969 u8 vau;
970 /* vCU of this device */
971 u8 vcu;
972 /* link credits of this device */
973 u16 link_credits;
974 /* initial vl15 credits to use */
975 u16 vl15_init;
976
977 /* Misc small ints */
978 /* Number of physical ports available */
979 u8 num_pports;
980 /* Lowest context number which can be used by user processes */
981 u8 first_user_ctxt;
982 u8 n_krcv_queues;
983 u8 qos_shift;
984 u8 qpn_mask;
985
986 u16 rhf_offset; /* offset of RHF within receive header entry */
987 u16 irev; /* implementation revision */
988 u16 dc8051_ver; /* 8051 firmware version */
989
990 struct platform_config_cache pcfg_cache;
991 /* control high-level access to qsfp */
992 struct mutex qsfp_i2c_mutex;
993
994 struct diag_client *diag_client;
995 spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
996
997 u8 psxmitwait_supported;
998 /* cycle length of PS* counters in HW (in picoseconds) */
999 u16 psxmitwait_check_rate;
1000 /* high volume overflow errors deferred to tasklet */
1001 struct tasklet_struct error_tasklet;
1002 /* per device cq worker */
1003 struct kthread_worker *worker;
1004
1005 /* MSI-X information */
1006 struct hfi1_msix_entry *msix_entries;
1007 u32 num_msix_entries;
1008
1009 /* INTx information */
1010 u32 requested_intx_irq; /* did we request one? */
1011 char intx_name[MAX_NAME_SIZE]; /* INTx name */
1012
1013 /* general interrupt: mask of handled interrupts */
1014 u64 gi_mask[CCE_NUM_INT_CSRS];
1015
1016 struct rcv_array_data rcv_entries;
1017
1018 /*
1019 * 64 bit synthetic counters
1020 */
1021 struct timer_list synth_stats_timer;
1022
1023 /*
1024 * device counters
1025 */
1026 char *cntrnames;
1027 size_t cntrnameslen;
1028 size_t ndevcntrs;
1029 u64 *cntrs;
1030 u64 *scntrs;
1031
1032 /*
1033 * remembered values for synthetic counters
1034 */
1035 u64 last_tx;
1036 u64 last_rx;
1037
1038 /*
1039 * per-port counters
1040 */
1041 size_t nportcntrs;
1042 char *portcntrnames;
1043 size_t portcntrnameslen;
1044
1045 struct hfi1_snoop_data hfi1_snoop;
1046
1047 struct err_info_rcvport err_info_rcvport;
1048 struct err_info_constraint err_info_rcv_constraint;
1049 struct err_info_constraint err_info_xmit_constraint;
1050 u8 err_info_uncorrectable;
1051 u8 err_info_fmconfig;
1052
1053 atomic_t drop_packet;
1054 u8 do_drop;
1055
1056 /* receive interrupt functions */
1057 rhf_rcv_function_ptr *rhf_rcv_function_map;
1058 rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
1059
1060 /*
1061 * Handlers for outgoing data so that snoop/capture does not
1062 * have to have its hooks in the send path
1063 */
1064 int (*process_pio_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
1065 u32 hdrwords, struct hfi1_sge_state *ss,
1066 u32 len, u32 plen, u32 dwords, u64 pbc);
1067 int (*process_dma_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
1068 u32 hdrwords, struct hfi1_sge_state *ss,
1069 u32 len, u32 plen, u32 dwords, u64 pbc);
1070 void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
1071 u64 pbc, const void *from, size_t count);
1072
1073 /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
1074 u8 oui1;
1075 u8 oui2;
1076 u8 oui3;
1077 /* Timer and counter used to detect RcvBufOvflCnt changes */
1078 struct timer_list rcverr_timer;
1079 u32 rcv_ovfl_cnt;
1080
1081 int assigned_node_id;
1082 wait_queue_head_t event_queue;
1083
1084 /* Save the enabled LCB error bits */
1085 u64 lcb_err_en;
1086 u8 dc_shutdown;
1087};
1088
1089/* 8051 firmware version helper */
1090#define dc8051_ver(a, b) ((a) << 8 | (b))
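/*
 * Editor's note (worked example): dc8051_ver(1, 2) evaluates to 0x0102,
 * i.e. the major version in the upper byte and the minor version in the
 * lower byte, matching the layout of dd->dc8051_ver above.
 */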
1091
1092/* f_put_tid types */
1093#define PT_EXPECTED 0
1094#define PT_EAGER 1
1095#define PT_INVALID 2
1096
1097/* Private data for file operations */
1098struct hfi1_filedata {
1099 struct hfi1_ctxtdata *uctxt;
1100 unsigned subctxt;
1101 struct hfi1_user_sdma_comp_q *cq;
1102 struct hfi1_user_sdma_pkt_q *pq;
1103 /* for cpu affinity; -1 if none */
1104 int rec_cpu_num;
1105};
1106
1107extern struct list_head hfi1_dev_list;
1108extern spinlock_t hfi1_devs_lock;
1109struct hfi1_devdata *hfi1_lookup(int unit);
1110extern u32 hfi1_cpulist_count;
1111extern unsigned long *hfi1_cpulist;
1112
1113extern unsigned int snoop_drop_send;
1114extern unsigned int snoop_force_capture;
1115int hfi1_init(struct hfi1_devdata *, int);
1116int hfi1_count_units(int *npresentp, int *nupp);
1117int hfi1_count_active_units(void);
1118
1119int hfi1_diag_add(struct hfi1_devdata *);
1120void hfi1_diag_remove(struct hfi1_devdata *);
1121void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
1122
1123void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
1124
1125int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
1126int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
1127int hfi1_create_ctxts(struct hfi1_devdata *dd);
1128struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32);
1129void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
1130 struct hfi1_devdata *, u8, u8);
1131void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
1132
1133void handle_receive_interrupt(struct hfi1_ctxtdata *);
1134void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd);
1135void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd);
1136int hfi1_reset_device(int);
1137
1138/* return the driver's idea of the logical OPA port state */
1139static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
1140{
1141 return ppd->lstate; /* use the cached value */
1142}
1143
1144static inline u16 generate_jkey(kuid_t uid)
1145{
1146 return from_kuid(current_user_ns(), uid) & 0xffff;
1147}
1148
1149/*
1150 * active_egress_rate
1151 *
1152 * returns the active egress rate in units of [10^6 bits/sec]
1153 */
1154static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
1155{
1156 u16 link_speed = ppd->link_speed_active;
1157 u16 link_width = ppd->link_width_active;
1158 u32 egress_rate;
1159
1160 if (link_speed == OPA_LINK_SPEED_25G)
1161 egress_rate = 25000;
1162 else /* assume OPA_LINK_SPEED_12_5G */
1163 egress_rate = 12500;
1164
1165 switch (link_width) {
1166 case OPA_LINK_WIDTH_4X:
1167 egress_rate *= 4;
1168 break;
1169 case OPA_LINK_WIDTH_3X:
1170 egress_rate *= 3;
1171 break;
1172 case OPA_LINK_WIDTH_2X:
1173 egress_rate *= 2;
1174 break;
1175 default:
1176 /* assume IB_WIDTH_1X */
1177 break;
1178 }
1179
1180 return egress_rate;
1181}
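/*
 * Editor's note (worked example): a link active at OPA_LINK_SPEED_25G
 * over OPA_LINK_WIDTH_4X reports 25000 * 4 = 100000, i.e. 100 Gbit/s
 * in the [10^6 bits/sec] units noted above.
 */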
1182
1183/*
1184 * egress_cycles
1185 *
1186 * Returns the number of 'fabric clock cycles' to egress a packet
1187 * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
1188 * rate is (approximately) 805 MHz, the units of the returned value
1189 * are (1/805 MHz).
1190 */
1191static inline u32 egress_cycles(u32 len, u32 rate)
1192{
1193 u32 cycles;
1194
1195 /*
1196 * cycles is:
1197 *
1198 * (length) [bits] / (rate) [bits/sec]
1199 * ---------------------------------------------------
1200 * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
1201 */
1202
1203 cycles = len * 8; /* bits */
1204 cycles *= 805;
1205 cycles /= rate;
1206
1207 return cycles;
1208}
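/*
 * Editor's note (worked example, using the 805 MHz fabric clock above):
 * a 4096 byte packet at an egress rate of 100000 Mbit/s costs
 * 4096 * 8 * 805 / 100000 = 263 cycles (integer arithmetic).
 */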
1209
1210void set_link_ipg(struct hfi1_pportdata *ppd);
1211void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
1212 u32 rqpn, u8 svc_type);
1213void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
1214 u32 pkey, u32 slid, u32 dlid, u8 sc5,
1215 const struct ib_grh *old_grh);
1216
1217#define PACKET_EGRESS_TIMEOUT 350
1218static inline void pause_for_credit_return(struct hfi1_devdata *dd)
1219{
1220 /* Pause at least 1us, to ensure chip returns all credits */
1221 u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
1222
1223 udelay(usec ? usec : 1);
1224}
1225
1226/**
1227 * sc_to_vlt() - reverse lookup sc to vl
1228 * @dd: devdata
1229 * @sc5: 5 bit sc
1230 */
1231static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
1232{
1233 unsigned seq;
1234 u8 rval;
1235
1236 if (sc5 >= OPA_MAX_SCS)
1237 return (u8)(0xff);
1238
1239 do {
1240 seq = read_seqbegin(&dd->sc2vl_lock);
1241 rval = *(((u8 *)dd->sc2vl) + sc5);
1242 } while (read_seqretry(&dd->sc2vl_lock, seq));
1243
1244 return rval;
1245}
1246
1247#define PKEY_MEMBER_MASK 0x8000
1248#define PKEY_LOW_15_MASK 0x7fff
1249
1250/*
1251 * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
1252 * being an entry from the ingress partition key table), return 0
1253 * otherwise. Use the matching criteria for ingress partition keys
1254 * specified in the OPAv1 spec., section 9.10.14.
1255 */
1256static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
1257{
1258 u16 mkey = pkey & PKEY_LOW_15_MASK;
1259 u16 ment = ent & PKEY_LOW_15_MASK;
1260
1261 if (mkey == ment) {
1262 /*
1263 * If pkey[15] is clear (limited partition member),
1264 * is bit 15 in the corresponding table element
1265 * clear (limited member)?
1266 */
1267 if (!(pkey & PKEY_MEMBER_MASK))
1268 return !!(ent & PKEY_MEMBER_MASK);
1269 return 1;
1270 }
1271 return 0;
1272}
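/*
 * Editor's note (illustrative): under this rule a limited-member pkey
 * such as 0x7fff matches a full-member table entry 0xffff, while two
 * limited members (pkey 0x7fff against table entry 0x7fff) do not match.
 */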
1273
1274/*
1275 * ingress_pkey_table_search - search the entire pkey table for
1276 * an entry which matches 'pkey'. return 0 if a match is found,
1277 * and 1 otherwise.
1278 */
1279static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
1280{
1281 int i;
1282
1283 for (i = 0; i < MAX_PKEY_VALUES; i++) {
1284 if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
1285 return 0;
1286 }
1287 return 1;
1288}
1289
1290/*
1291 * ingress_pkey_table_fail - record a failure of ingress pkey validation,
1292 * i.e., increment port_rcv_constraint_errors for the port, and record
1293 * the 'error info' for this failure.
1294 */
1295static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
1296 u16 slid)
1297{
1298 struct hfi1_devdata *dd = ppd->dd;
1299
1300 incr_cntr64(&ppd->port_rcv_constraint_errors);
1301 if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
1302 dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
1303 dd->err_info_rcv_constraint.slid = slid;
1304 dd->err_info_rcv_constraint.pkey = pkey;
1305 }
1306}
1307
1308/*
1309 * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
1310 * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
1311 * is a hint as to the best place in the partition key table to begin
1312 * searching. This function should not be called on the data path, for
1313 * performance reasons. On the data path the pkey check is expected to
1314 * be done by HW, and rcv_pkey_check() should be called instead.
1315 */
1316static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
1317 u8 sc5, u8 idx, u16 slid)
1318{
1319 if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
1320 return 0;
1321
1322 /* If SC15, pkey[0:14] must be 0x7fff */
1323 if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1324 goto bad;
1325
1326 /* Is the pkey = 0x0, or 0x8000? */
1327 if ((pkey & PKEY_LOW_15_MASK) == 0)
1328 goto bad;
1329
1330 /* The most likely matching pkey has index 'idx' */
1331 if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
1332 return 0;
1333
1334 /* no match - try the whole table */
1335 if (!ingress_pkey_table_search(ppd, pkey))
1336 return 0;
1337
1338bad:
1339 ingress_pkey_table_fail(ppd, pkey, slid);
1340 return 1;
1341}
1342
1343/*
1344 * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
1345 * otherwise. It only ensures the pkey is valid for QP0. This function
1346 * should be called on the data path instead of ingress_pkey_check,
1347 * since on the data path the pkey check is done by HW (except for QP0).
1348 */
1349static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
1350 u8 sc5, u16 slid)
1351{
1352 if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
1353 return 0;
1354
1355 /* If SC15, pkey[0:14] must be 0x7fff */
1356 if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1357 goto bad;
1358
1359 return 0;
1360bad:
1361 ingress_pkey_table_fail(ppd, pkey, slid);
1362 return 1;
1363}
1364
1365/* MTU handling */
1366
1367/* MTU enumeration, 256-4k match IB */
1368#define OPA_MTU_0 0
1369#define OPA_MTU_256 1
1370#define OPA_MTU_512 2
1371#define OPA_MTU_1024 3
1372#define OPA_MTU_2048 4
1373#define OPA_MTU_4096 5
1374
1375u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
1376int mtu_to_enum(u32 mtu, int default_if_bad);
1377u16 enum_to_mtu(int);
1378static inline int valid_ib_mtu(unsigned int mtu)
1379{
1380 return mtu == 256 || mtu == 512 ||
1381 mtu == 1024 || mtu == 2048 ||
1382 mtu == 4096;
1383}
1384static inline int valid_opa_max_mtu(unsigned int mtu)
1385{
1386 return mtu >= 2048 &&
1387 (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
1388}
1389
1390int set_mtu(struct hfi1_pportdata *);
1391
1392int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
1393void hfi1_disable_after_error(struct hfi1_devdata *);
1394int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
1395int hfi1_rcvbuf_validate(u32, u8, u16 *);
1396
1397int fm_get_table(struct hfi1_pportdata *, int, void *);
1398int fm_set_table(struct hfi1_pportdata *, int, void *);
1399
1400void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
1401void reset_link_credits(struct hfi1_devdata *dd);
1402void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
1403
1404int snoop_recv_handler(struct hfi1_packet *packet);
1405int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
1406 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1407 u32 plen, u32 dwords, u64 pbc);
1408int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
1409 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1410 u32 plen, u32 dwords, u64 pbc);
1411void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
1412 u64 pbc, const void *from, size_t count);
1413
1414/* for use in system calls, where we want to know device type, etc. */
1415#define ctxt_fp(fp) \
1416 (((struct hfi1_filedata *)(fp)->private_data)->uctxt)
1417#define subctxt_fp(fp) \
1418 (((struct hfi1_filedata *)(fp)->private_data)->subctxt)
1419#define tidcursor_fp(fp) \
1420 (((struct hfi1_filedata *)(fp)->private_data)->tidcursor)
1421#define user_sdma_pkt_fp(fp) \
1422 (((struct hfi1_filedata *)(fp)->private_data)->pq)
1423#define user_sdma_comp_fp(fp) \
1424 (((struct hfi1_filedata *)(fp)->private_data)->cq)
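/*
 * Editor's note (illustrative): inside a file operation these macros
 * resolve the per-open private data in one step, e.g.
 *
 *	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
 *	unsigned subctxt = subctxt_fp(fp);
 */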
1425
1426static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
1427{
1428 return ppd->dd;
1429}
1430
1431static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
1432{
1433 return container_of(dev, struct hfi1_devdata, verbs_dev);
1434}
1435
1436static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
1437{
1438 return dd_from_dev(to_idev(ibdev));
1439}
1440
1441static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
1442{
1443 return container_of(ibp, struct hfi1_pportdata, ibport_data);
1444}
1445
1446static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
1447{
1448 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1449 unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
1450
1451 WARN_ON(pidx >= dd->num_pports);
1452 return &dd->pport[pidx].ibport_data;
1453}
1454
1455/*
1456 * Return the indexed PKEY from the port PKEY table.
1457 */
1458static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
1459{
1460 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1461 u16 ret;
1462
1463 if (index >= ARRAY_SIZE(ppd->pkeys))
1464 ret = 0;
1465 else
1466 ret = ppd->pkeys[index];
1467
1468 return ret;
1469}
1470
1471/*
1472 * Readers of cc_state must call get_cc_state() under rcu_read_lock().
1473 * Writers of cc_state must call get_cc_state() under cc_state_lock.
1474 */
1475static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
1476{
1477 return rcu_dereference(ppd->cc_state);
1478}
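/*
 * Editor's sketch of the reader pattern described above (illustrative,
 * not part of the driver):
 *
 *	rcu_read_lock();
 *	cc_state = get_cc_state(ppd);
 *	if (cc_state)
 *		... read *cc_state ...
 *	rcu_read_unlock();
 */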
1479
1480/*
1481 * values for dd->flags (_device_ related flags)
1482 */
1483#define HFI1_INITTED 0x1 /* chip and driver up and initted */
1484#define HFI1_PRESENT 0x2 /* chip accesses can be done */
1485#define HFI1_FROZEN 0x4 /* chip in SPC freeze */
1486#define HFI1_HAS_SDMA_TIMEOUT 0x8
1487#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */
1488#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */
1489#define HFI1_DO_INIT_ASIC 0x100 /* This device will init the ASIC */
1490
1491/* IB dword length mask in PBC (lower 11 bits); same for all chips */
1492#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1)
1493
1494
1495/* ctxt_flag bit offsets */
1496 /* context has been setup */
1497#define HFI1_CTXT_SETUP_DONE 1
1498 /* waiting for a packet to arrive */
1499#define HFI1_CTXT_WAITING_RCV 2
1500 /* master has not finished initializing */
1501#define HFI1_CTXT_MASTER_UNINIT 4
1502 /* waiting for an urgent packet to arrive */
1503#define HFI1_CTXT_WAITING_URG 5
1504
1505/* free up any allocated data at close */
1506struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
1507 const struct pci_device_id *);
1508void hfi1_free_devdata(struct hfi1_devdata *);
1509void cc_state_reclaim(struct rcu_head *rcu);
1510struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
1511
1512/*
1513 * Set LED override, only the two LSBs have "public" meaning, but
1514 * any non-zero value substitutes them for the Link and LinkTrain
1515 * LED states.
1516 */
1517#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
1518#define HFI1_LED_LOG 2 /* Logical (link) YELLOW LED */
1519void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val);
1520
1521#define HFI1_CREDIT_RETURN_RATE (100)
1522
1523/*
1524 * The number of words for the KDETH protocol field. If this is
1525 * larger than the actual field used, then part of the payload
1526 * will be in the header.
1527 *
1528 * Optimally, we want this sized so that a typical case will
1529 * use full cache lines. The typical local KDETH header would
1530 * be:
1531 *
1532 * Bytes Field
1533 * 8 LRH
1534 *	 12	BTH
1535 * ?? KDETH
1536 * 8 RHF
1537 * ---
1538 * 28 + KDETH
1539 *
1540 * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
1541 */
1542#define DEFAULT_RCVHDRSIZE 9
1543
1544/*
1545 * Maximal header byte count:
1546 *
1547 * Bytes Field
1548 * 8 LRH
1549 * 40 GRH (optional)
1550 * 12 BTH
1551 * ?? KDETH
1552 * 8 RHF
1553 * ---
1554 * 68 + KDETH
1555 *
1556 * We also want to maintain a cache line alignment to assist DMA'ing
1557 * of the header bytes. Round up to a good size.
1558 */
1559#define DEFAULT_RCVHDR_ENTSIZE 32
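/*
 * Editor's note: this value appears to be in DWORDs (32 DWORDs = 128
 * bytes), consistent with the "32 - 128B" encoding accepted by the
 * hdrq_entsize module parameter in init.c below.
 */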
1560
1561int hfi1_get_user_pages(unsigned long, size_t, struct page **);
1562void hfi1_release_user_pages(struct page **, size_t);
1563
1564static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
1565{
1566 *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
1567}
1568
1569static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
1570{
1571 /*
1572 * volatile because it's a DMA target from the chip, routine is
1573 * inlined, and don't want register caching or reordering.
1574 */
1575 return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
1576}
1577
1578/*
1579 * sysfs interface.
1580 */
1581
1582extern const char ib_hfi1_version[];
1583
1584int hfi1_device_create(struct hfi1_devdata *);
1585void hfi1_device_remove(struct hfi1_devdata *);
1586
1587int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
1588 struct kobject *kobj);
1589int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
1590void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
1591/* Hook for sysfs read of QSFP */
1592int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
1593
1594int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
1595void hfi1_pcie_cleanup(struct pci_dev *);
1596int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
1597 const struct pci_device_id *);
1598void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
1599void hfi1_pcie_flr(struct hfi1_devdata *);
1600int pcie_speeds(struct hfi1_devdata *);
1601void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
1602void hfi1_enable_intx(struct pci_dev *);
1603void hfi1_nomsix(struct hfi1_devdata *);
1604void restore_pci_variables(struct hfi1_devdata *dd);
1605int do_pcie_gen3_transition(struct hfi1_devdata *dd);
1606int parse_platform_config(struct hfi1_devdata *dd);
1607int get_platform_config_field(struct hfi1_devdata *dd,
1608 enum platform_config_table_type_encoding table_type,
1609 int table_index, int field_index, u32 *data, u32 len);
1610
1611dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
1612 size_t, int);
1613const char *get_unit_name(int unit);
1614
1615/*
1616 * Flush write combining store buffers (if present) and perform a write
1617 * barrier.
1618 */
1619static inline void flush_wc(void)
1620{
1621 asm volatile("sfence" : : : "memory");
1622}
1623
1624void handle_eflags(struct hfi1_packet *packet);
1625int process_receive_ib(struct hfi1_packet *packet);
1626int process_receive_bypass(struct hfi1_packet *packet);
1627int process_receive_error(struct hfi1_packet *packet);
1628int kdeth_process_expected(struct hfi1_packet *packet);
1629int kdeth_process_eager(struct hfi1_packet *packet);
1630int process_receive_invalid(struct hfi1_packet *packet);
1631
1632extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
1633
1634void update_sge(struct hfi1_sge_state *ss, u32 length);
1635
1636/* global module parameter variables */
1637extern unsigned int hfi1_max_mtu;
1638extern unsigned int hfi1_cu;
1639extern unsigned int user_credit_return_threshold;
1640extern uint num_rcv_contexts;
1641extern unsigned n_krcvqs;
1642extern u8 krcvqs[];
1643extern int krcvqsset;
1644extern uint kdeth_qp;
1645extern uint loopback;
1646extern uint quick_linkup;
1647extern uint rcv_intr_timeout;
1648extern uint rcv_intr_count;
1649extern uint rcv_intr_dynamic;
1650extern ushort link_crc_mask;
1651
1652extern struct mutex hfi1_mutex;
1653
1654/* Number of seconds before our card status check... */
1655#define STATUS_TIMEOUT 60
1656
1657#define DRIVER_NAME "hfi1"
1658#define HFI1_USER_MINOR_BASE 0
1659#define HFI1_TRACE_MINOR 127
1660#define HFI1_DIAGPKT_MINOR 128
1661#define HFI1_DIAG_MINOR_BASE 129
1662#define HFI1_SNOOP_CAPTURE_BASE 200
1663#define HFI1_NMINORS 255
1664
1665#define PCI_VENDOR_ID_INTEL 0x8086
1666#define PCI_DEVICE_ID_INTEL0 0x24f0
1667#define PCI_DEVICE_ID_INTEL1 0x24f1
1668
1669#define HFI1_PKT_USER_SC_INTEGRITY \
1670 (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \
1671 | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \
1672 | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
1673
1674#define HFI1_PKT_KERNEL_SC_INTEGRITY \
1675 (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
1676
1677static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
1678 u16 ctxt_type)
1679{
1680 u64 base_sc_integrity =
1681 SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
1682 | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
1683 | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
1684 | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
1685 | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
1686 | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
1687 | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
1688 | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
1689 | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
1690 | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
1691 | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
1692 | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
1693 | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
1694 | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
1695 | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
1696 | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
1697 | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
1698
1699 if (ctxt_type == SC_USER)
1700 base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
1701 else
1702 base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
1703
1704 if (is_a0(dd))
1705 /* turn off send-side job key checks - A0 erratum */
1706 return base_sc_integrity &
1707 ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
1708 return base_sc_integrity;
1709}
1710
1711static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
1712{
1713 u64 base_sdma_integrity =
1714 SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
1715 | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
1716 | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
1717 | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
1718 | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
1719 | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
1720 | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
1721 | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
1722 | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
1723 | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
1724 | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
1725 | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
1726 | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
1727 | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
1728 | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
1729 | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
1730
1731 if (is_a0(dd))
1732 /* turn off send-side job key checks - A0 erratum */
1733 return base_sdma_integrity &
1734 ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
1735 return base_sdma_integrity;
1736}
1737
1738/*
1739 * hfi1_early_err is used (only!) to print early errors before devdata is
1740 * allocated, or when dd->pcidev may not be valid, and at the tail end of
1741 * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is
1742 * the same as dd_dev_err, but is used when the message really needs
1743 * the IB port# to be definitive as to what's happening.
1744 */
1745#define hfi1_early_err(dev, fmt, ...) \
1746 dev_err(dev, fmt, ##__VA_ARGS__)
1747
1748#define hfi1_early_info(dev, fmt, ...) \
1749 dev_info(dev, fmt, ##__VA_ARGS__)
1750
1751#define dd_dev_emerg(dd, fmt, ...) \
1752 dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
1753 get_unit_name((dd)->unit), ##__VA_ARGS__)
1754#define dd_dev_err(dd, fmt, ...) \
1755 dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
1756 get_unit_name((dd)->unit), ##__VA_ARGS__)
1757#define dd_dev_warn(dd, fmt, ...) \
1758 dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
1759 get_unit_name((dd)->unit), ##__VA_ARGS__)
1760
1761#define dd_dev_warn_ratelimited(dd, fmt, ...) \
1762 dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
1763 get_unit_name((dd)->unit), ##__VA_ARGS__)
1764
1765#define dd_dev_info(dd, fmt, ...) \
1766 dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
1767 get_unit_name((dd)->unit), ##__VA_ARGS__)
1768
1769#define hfi1_dev_porterr(dd, port, fmt, ...) \
1770 dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
1771 get_unit_name((dd)->unit), (dd)->unit, (port), \
1772 ##__VA_ARGS__)
1773
1774/*
1775 * this is used for formatting hw error messages...
1776 */
1777struct hfi1_hwerror_msgs {
1778 u64 mask;
1779 const char *msg;
1780 size_t sz;
1781};
1782
1783/* in intr.c... */
1784void hfi1_format_hwerrors(u64 hwerrs,
1785 const struct hfi1_hwerror_msgs *hwerrmsgs,
1786 size_t nhwerrmsgs, char *msg, size_t lmsg);
1787
1788#define USER_OPCODE_CHECK_VAL 0xC0
1789#define USER_OPCODE_CHECK_MASK 0xC0
1790#define OPCODE_CHECK_VAL_DISABLED 0x0
1791#define OPCODE_CHECK_MASK_DISABLED 0x0
1792
1793static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
1794{
1795 struct hfi1_pportdata *ppd;
1796 int i;
1797
1798 dd->z_int_counter = get_all_cpu_total(dd->int_counter);
1799 dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
1800
1801 ppd = (struct hfi1_pportdata *)(dd + 1);
1802 for (i = 0; i < dd->num_pports; i++, ppd++) {
1803 ppd->ibport_data.z_rc_acks =
1804 get_all_cpu_total(ppd->ibport_data.rc_acks);
1805 ppd->ibport_data.z_rc_qacks =
1806 get_all_cpu_total(ppd->ibport_data.rc_qacks);
1807 }
1808}
1809
1810/* Control LED state */
1811static inline void setextled(struct hfi1_devdata *dd, u32 on)
1812{
1813 if (on)
1814 write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
1815 else
1816 write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
1817}
1818
1819int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
1820
1821#endif /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
new file mode 100644
index 000000000000..a877eda8c13c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -0,0 +1,1722 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/pci.h>
52#include <linux/netdevice.h>
53#include <linux/vmalloc.h>
54#include <linux/delay.h>
55#include <linux/idr.h>
56#include <linux/module.h>
57#include <linux/printk.h>
58#include <linux/hrtimer.h>
59
60#include "hfi.h"
61#include "device.h"
62#include "common.h"
63#include "mad.h"
64#include "sdma.h"
65#include "debugfs.h"
66#include "verbs.h"
67
68#undef pr_fmt
69#define pr_fmt(fmt) DRIVER_NAME ": " fmt
70
71/*
72 * min buffers we want to have per user context, after the driver's own use
73 */
74#define HFI1_MIN_USER_CTXT_BUFCNT 7
75
76#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
77#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
78#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
79
80/*
81 * Number of user receive contexts we are configured to use (to allow for more
82 * pio buffers per ctxt, etc.) Zero means use one user context per CPU.
83 */
84uint num_rcv_contexts;
85module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO);
86MODULE_PARM_DESC(
87 num_rcv_contexts, "Set max number of user receive contexts to use");
88
89u8 krcvqs[RXE_NUM_DATA_VL];
90int krcvqsset;
91module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
92MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL");
93
94/* computed based on above array */
95unsigned n_krcvqs;
96
97static unsigned hfi1_rcvarr_split = 25;
98module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
99MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
100
101static uint eager_buffer_size = (2 << 20); /* 2MB */
102module_param(eager_buffer_size, uint, S_IRUGO);
103MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
104
105static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
106module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
107MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
108
109static uint hfi1_hdrq_entsize = 32;
110module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
111MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");
112
113unsigned int user_credit_return_threshold = 33; /* default is 33% */
114module_param(user_credit_return_threshold, uint, S_IRUGO);
115MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
116
117static inline u64 encode_rcv_header_entry_size(u16);
118
119static struct idr hfi1_unit_table;
120u32 hfi1_cpulist_count;
121unsigned long *hfi1_cpulist;
122
123/*
124 * Common code for creating the receive context array.
125 */
126int hfi1_create_ctxts(struct hfi1_devdata *dd)
127{
128 unsigned i;
129 int ret;
130 int local_node_id = pcibus_to_node(dd->pcidev->bus);
131
132 if (local_node_id < 0)
133 local_node_id = numa_node_id();
134 dd->assigned_node_id = local_node_id;
135
136 dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
137 if (!dd->rcd) {
138 dd_dev_err(dd,
139 "Unable to allocate receive context array, failing\n");
140 goto nomem;
141 }
142
143 /* create one or more kernel contexts */
144 for (i = 0; i < dd->first_user_ctxt; ++i) {
145 struct hfi1_pportdata *ppd;
146 struct hfi1_ctxtdata *rcd;
147
148 ppd = dd->pport + (i % dd->num_pports);
149 rcd = hfi1_create_ctxtdata(ppd, i);
150 if (!rcd) {
151 dd_dev_err(dd,
152 "Unable to allocate kernel receive context, failing\n");
153 goto nomem;
154 }
155 /*
156 * Set up the kernel context flags here and now because they
157 * use default values for all receive side memories. User
158 * contexts will be handled as they are created.
159 */
160 rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
161 HFI1_CAP_KGET(NODROP_RHQ_FULL) |
162 HFI1_CAP_KGET(NODROP_EGR_FULL) |
163 HFI1_CAP_KGET(DMA_RTAIL);
164 rcd->seq_cnt = 1;
165
166 rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
167 if (!rcd->sc) {
168 dd_dev_err(dd,
169 "Unable to allocate kernel send context, failing\n");
170 dd->rcd[rcd->ctxt] = NULL;
171 hfi1_free_ctxtdata(dd, rcd);
172 goto nomem;
173 }
174
175 ret = hfi1_init_ctxt(rcd->sc);
176 if (ret < 0) {
177 dd_dev_err(dd,
178 "Failed to setup kernel receive context, failing\n");
179 sc_free(rcd->sc);
180 dd->rcd[rcd->ctxt] = NULL;
181 hfi1_free_ctxtdata(dd, rcd);
182 ret = -EFAULT;
183 goto bail;
184 }
185 }
186
187 return 0;
188nomem:
189 ret = -ENOMEM;
190bail:
191 kfree(dd->rcd);
192 dd->rcd = NULL;
193 return ret;
194}
195
196/*
197 * Common code for user and kernel context setup.
198 */
199struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
200{
201 struct hfi1_devdata *dd = ppd->dd;
202 struct hfi1_ctxtdata *rcd;
203 unsigned kctxt_ngroups = 0;
204 u32 base;
205
206 if (dd->rcv_entries.nctxt_extra >
207 dd->num_rcv_contexts - dd->first_user_ctxt)
208 kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
209 (dd->num_rcv_contexts - dd->first_user_ctxt));
210 rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
211 if (rcd) {
212 u32 rcvtids, max_entries;
213
214 dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt);
215
216 INIT_LIST_HEAD(&rcd->qp_wait_list);
217 rcd->ppd = ppd;
218 rcd->dd = dd;
219 rcd->cnt = 1;
220 rcd->ctxt = ctxt;
221 dd->rcd[ctxt] = rcd;
222 rcd->numa_id = numa_node_id();
223 rcd->rcv_array_groups = dd->rcv_entries.ngroups;
224
225 spin_lock_init(&rcd->exp_lock);
226
227 /*
228 * Calculate the context's RcvArray entry starting point.
229 * We do this here because we have to take into account all
 230	 * the RcvArray entries that previous contexts would have
231 * taken and we have to account for any extra groups
232 * assigned to the kernel or user contexts.
233 */
234 if (ctxt < dd->first_user_ctxt) {
235 if (ctxt < kctxt_ngroups) {
236 base = ctxt * (dd->rcv_entries.ngroups + 1);
237 rcd->rcv_array_groups++;
238 } else
239 base = kctxt_ngroups +
240 (ctxt * dd->rcv_entries.ngroups);
241 } else {
242 u16 ct = ctxt - dd->first_user_ctxt;
243
244 base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
245 kctxt_ngroups);
246 if (ct < dd->rcv_entries.nctxt_extra) {
247 base += ct * (dd->rcv_entries.ngroups + 1);
248 rcd->rcv_array_groups++;
249 } else
250 base += dd->rcv_entries.nctxt_extra +
251 (ct * dd->rcv_entries.ngroups);
252 }
253 rcd->eager_base = base * dd->rcv_entries.group_size;
254
255 /* Validate and initialize Rcv Hdr Q variables */
256 if (rcvhdrcnt % HDRQ_INCREMENT) {
257 dd_dev_err(dd,
258 "ctxt%u: header queue count %d must be divisible by %d\n",
259 rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
260 goto bail;
261 }
262 rcd->rcvhdrq_cnt = rcvhdrcnt;
263 rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
264 /*
265 * Simple Eager buffer allocation: we have already pre-allocated
266 * the number of RcvArray entry groups. Each ctxtdata structure
267 * holds the number of groups for that context.
268 *
269 * To follow CSR requirements and maintain cacheline alignment,
270 * make sure all sizes and bases are multiples of group_size.
271 *
272 * The expected entry count is what is left after assigning
 273	 * the eager entries.
274 */
275 max_entries = rcd->rcv_array_groups *
276 dd->rcv_entries.group_size;
277 rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
278 rcd->egrbufs.count = round_down(rcvtids,
279 dd->rcv_entries.group_size);
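		/*
		 * Worked example (hypothetical numbers, not hardware defaults):
		 * with rcv_array_groups = 128 and group_size = 8, max_entries
		 * is 128 * 8 = 1024; with the default rcvarr_split of 25,
		 * rcvtids is 1024 * 25 / 100 = 256, and egrbufs.count stays
		 * 256 since it is already a multiple of the group size.
		 */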
280 if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
281 dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
282 rcd->ctxt);
283 rcd->egrbufs.count = MAX_EAGER_ENTRIES;
284 }
285 dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n",
286 rcd->ctxt, rcd->egrbufs.count);
287
288 /*
289 * Allocate array that will hold the eager buffer accounting
290 * data.
291 * This will allocate the maximum possible buffer count based
292 * on the value of the RcvArray split parameter.
293 * The resulting value will be rounded down to the closest
294 * multiple of dd->rcv_entries.group_size.
295 */
296 rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
297 rcd->egrbufs.count, GFP_KERNEL);
298 if (!rcd->egrbufs.buffers)
299 goto bail;
300 rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
301 rcd->egrbufs.count, GFP_KERNEL);
302 if (!rcd->egrbufs.rcvtids)
303 goto bail;
304 rcd->egrbufs.size = eager_buffer_size;
305 /*
306 * The size of the buffers programmed into the RcvArray
307 * entries needs to be big enough to handle the highest
308 * MTU supported.
309 */
310 if (rcd->egrbufs.size < hfi1_max_mtu) {
311 rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
312 dd_dev_info(dd,
313 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
314 rcd->ctxt, rcd->egrbufs.size);
315 }
316 rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
317
318 if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
319 rcd->opstats = kzalloc(sizeof(*rcd->opstats),
320 GFP_KERNEL);
321 if (!rcd->opstats) {
322 dd_dev_err(dd,
323 "ctxt%u: Unable to allocate per ctxt stats buffer\n",
324 rcd->ctxt);
325 goto bail;
326 }
327 }
328 }
329 return rcd;
330bail:
331 kfree(rcd->opstats);
332 kfree(rcd->egrbufs.rcvtids);
333 kfree(rcd->egrbufs.buffers);
334 kfree(rcd);
335 return NULL;
336}
337
338/*
 339 * Convert a receive header entry size to the encoding used in the CSR.
 340 *
 341 * Return zero if the given size is invalid.
342 */
343static inline u64 encode_rcv_header_entry_size(u16 size)
344{
345 /* there are only 3 valid receive header entry sizes */
346 if (size == 2)
347 return 1;
348 if (size == 16)
349 return 2;
 350	if (size == 32)
351 return 4;
352 return 0; /* invalid */
353}
354
355/*
356 * Select the largest ccti value over all SLs to determine the intra-
357 * packet gap for the link.
358 *
359 * called with cca_timer_lock held (to protect access to cca_timer
360 * array), and rcu_read_lock() (to protect access to cc_state).
361 */
362void set_link_ipg(struct hfi1_pportdata *ppd)
363{
364 struct hfi1_devdata *dd = ppd->dd;
365 struct cc_state *cc_state;
366 int i;
367 u16 cce, ccti_limit, max_ccti = 0;
368 u16 shift, mult;
369 u64 src;
370 u32 current_egress_rate; /* Mbits /sec */
371 u32 max_pkt_time;
372 /*
373 * max_pkt_time is the maximum packet egress time in units
374 * of the fabric clock period 1/(805 MHz).
375 */
376
377 cc_state = get_cc_state(ppd);
378
379 if (cc_state == NULL)
380 /*
381 * This should _never_ happen - rcu_read_lock() is held,
382 * and set_link_ipg() should not be called if cc_state
383 * is NULL.
384 */
385 return;
386
387 for (i = 0; i < OPA_MAX_SLS; i++) {
388 u16 ccti = ppd->cca_timer[i].ccti;
389
390 if (ccti > max_ccti)
391 max_ccti = ccti;
392 }
393
394 ccti_limit = cc_state->cct.ccti_limit;
395 if (max_ccti > ccti_limit)
396 max_ccti = ccti_limit;
397
398 cce = cc_state->cct.entries[max_ccti].entry;
399 shift = (cce & 0xc000) >> 14;
400 mult = (cce & 0x3fff);
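	/*
	 * The CCE entry packs a 2-bit shift in bits 15:14 and a 14-bit
	 * multiplier in bits 13:0. For illustration (hypothetical value):
	 * cce = 0x8064 decodes to shift = 2 and mult = 100, so the src
	 * computed below is (max_pkt_time >> 2) * 100.
	 */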
401
402 current_egress_rate = active_egress_rate(ppd);
403
404 max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
405
406 src = (max_pkt_time >> shift) * mult;
407
408 src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
409 src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
410
411 write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
412}
413
414static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
415{
416 struct cca_timer *cca_timer;
417 struct hfi1_pportdata *ppd;
418 int sl;
419 u16 ccti, ccti_timer, ccti_min;
420 struct cc_state *cc_state;
421
422 cca_timer = container_of(t, struct cca_timer, hrtimer);
423 ppd = cca_timer->ppd;
424 sl = cca_timer->sl;
425
426 rcu_read_lock();
427
428 cc_state = get_cc_state(ppd);
429
430 if (cc_state == NULL) {
431 rcu_read_unlock();
432 return HRTIMER_NORESTART;
433 }
434
435 /*
436 * 1) decrement ccti for SL
437 * 2) calculate IPG for link (set_link_ipg())
438 * 3) restart timer, unless ccti is at min value
439 */
440
441 ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
442 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
443
444 spin_lock(&ppd->cca_timer_lock);
445
446 ccti = cca_timer->ccti;
447
448 if (ccti > ccti_min) {
449 cca_timer->ccti--;
450 set_link_ipg(ppd);
451 }
452
453 spin_unlock(&ppd->cca_timer_lock);
454
455 rcu_read_unlock();
456
457 if (ccti > ccti_min) {
458 unsigned long nsec = 1024 * ccti_timer;
459 /* ccti_timer is in units of 1.024 usec */
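		/*
		 * e.g. a (hypothetical) ccti_timer of 977 gives roughly
		 * 977 * 1.024 usec ~= 1 ms between CCTI decrements.
		 */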
460 hrtimer_forward_now(t, ns_to_ktime(nsec));
461 return HRTIMER_RESTART;
462 }
463 return HRTIMER_NORESTART;
464}
465
466/*
467 * Common code for initializing the physical port structure.
468 */
469void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
470 struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
471{
472 int i, size;
473 uint default_pkey_idx;
474
475 ppd->dd = dd;
476 ppd->hw_pidx = hw_pidx;
477 ppd->port = port; /* IB port number, not index */
478
479 default_pkey_idx = 1;
480
481 ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
482 if (loopback) {
483 hfi1_early_err(&pdev->dev,
484 "Faking data partition 0x8001 in idx %u\n",
485 !default_pkey_idx);
486 ppd->pkeys[!default_pkey_idx] = 0x8001;
487 }
488
489 INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
490 INIT_WORK(&ppd->link_up_work, handle_link_up);
491 INIT_WORK(&ppd->link_down_work, handle_link_down);
492 INIT_WORK(&ppd->freeze_work, handle_freeze);
493 INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
494 INIT_WORK(&ppd->sma_message_work, handle_sma_message);
495 INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
496 mutex_init(&ppd->hls_lock);
497 spin_lock_init(&ppd->sdma_alllock);
498 spin_lock_init(&ppd->qsfp_info.qsfp_lock);
499
500 ppd->sm_trap_qp = 0x0;
501 ppd->sa_qp = 0x1;
502
503 ppd->hfi1_wq = NULL;
504
505 spin_lock_init(&ppd->cca_timer_lock);
506
507 for (i = 0; i < OPA_MAX_SLS; i++) {
508 hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
509 HRTIMER_MODE_REL);
510 ppd->cca_timer[i].ppd = ppd;
511 ppd->cca_timer[i].sl = i;
512 ppd->cca_timer[i].ccti = 0;
513 ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
514 }
515
516 ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
517
518 spin_lock_init(&ppd->cc_state_lock);
519 spin_lock_init(&ppd->cc_log_lock);
520 size = sizeof(struct cc_state);
521 RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
522 if (!rcu_dereference(ppd->cc_state))
523 goto bail;
524 return;
525
526bail:
527
528 hfi1_early_err(&pdev->dev,
529 "Congestion Control Agent disabled for port %d\n", port);
530}
531
532/*
533 * Do initialization for device that is only needed on
534 * first detect, not on resets.
535 */
536static int loadtime_init(struct hfi1_devdata *dd)
537{
538 return 0;
539}
540
541/**
542 * init_after_reset - re-initialize after a reset
543 * @dd: the hfi1_ib device
544 *
545 * sanity check at least some of the values after reset, and
546 * ensure no receive or transmit (explicitly, in case reset
 547 * failed).
548 */
549static int init_after_reset(struct hfi1_devdata *dd)
550{
551 int i;
552
553 /*
554 * Ensure chip does no sends or receives, tail updates, or
555 * pioavail updates while we re-initialize. This is mostly
556 * for the driver data structures, not chip registers.
557 */
558 for (i = 0; i < dd->num_rcv_contexts; i++)
559 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
560 HFI1_RCVCTRL_INTRAVAIL_DIS |
561 HFI1_RCVCTRL_TAILUPD_DIS, i);
562 pio_send_control(dd, PSC_GLOBAL_DISABLE);
563 for (i = 0; i < dd->num_send_contexts; i++)
564 sc_disable(dd->send_contexts[i].sc);
565
566 return 0;
567}
568
569static void enable_chip(struct hfi1_devdata *dd)
570{
571 u32 rcvmask;
572 u32 i;
573
574 /* enable PIO send */
575 pio_send_control(dd, PSC_GLOBAL_ENABLE);
576
577 /*
578 * Enable kernel ctxts' receive and receive interrupt.
579 * Other ctxts done as user opens and initializes them.
580 */
581 rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
582 for (i = 0; i < dd->first_user_ctxt; ++i) {
583 rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
584 HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
585 if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
586 rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
587 if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
588 rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
589 if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
590 rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
591 hfi1_rcvctrl(dd, rcvmask, i);
592 sc_enable(dd->rcd[i]->sc);
593 }
594}
595
596/**
597 * create_workqueues - create per port workqueues
598 * @dd: the hfi1_ib device
599 */
600static int create_workqueues(struct hfi1_devdata *dd)
601{
602 int pidx;
603 struct hfi1_pportdata *ppd;
604
605 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
606 ppd = dd->pport + pidx;
607 if (!ppd->hfi1_wq) {
608 char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
609
610 snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
611 dd->unit, pidx);
612 ppd->hfi1_wq =
613 create_singlethread_workqueue(wq_name);
614 if (!ppd->hfi1_wq)
615 goto wq_error;
616 }
617 }
618 return 0;
619wq_error:
620 pr_err("create_singlethread_workqueue failed for port %d\n",
621 pidx + 1);
622 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
623 ppd = dd->pport + pidx;
624 if (ppd->hfi1_wq) {
625 destroy_workqueue(ppd->hfi1_wq);
626 ppd->hfi1_wq = NULL;
627 }
628 }
629 return -ENOMEM;
630}
631
632/**
633 * hfi1_init - do the actual initialization sequence on the chip
634 * @dd: the hfi1_ib device
635 * @reinit: re-initializing, so don't allocate new memory
636 *
637 * Do the actual initialization sequence on the chip. This is done
638 * both from the init routine called from the PCI infrastructure, and
639 * when we reset the chip, or detect that it was reset internally,
640 * or it's administratively re-enabled.
641 *
642 * Memory allocation here and in called routines is only done in
643 * the first case (reinit == 0). We have to be careful, because even
644 * without memory allocation, we need to re-write all the chip registers
645 * TIDs, etc. after the reset or enable has completed.
646 */
647int hfi1_init(struct hfi1_devdata *dd, int reinit)
648{
649 int ret = 0, pidx, lastfail = 0;
650 unsigned i, len;
651 struct hfi1_ctxtdata *rcd;
652 struct hfi1_pportdata *ppd;
653
654 /* Set up recv low level handlers */
655 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
656 kdeth_process_expected;
657 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
658 kdeth_process_eager;
659 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
660 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
661 process_receive_error;
662 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
663 process_receive_bypass;
664 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
665 process_receive_invalid;
666 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
667 process_receive_invalid;
668 dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
669 process_receive_invalid;
670 dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
671
672 /* Set up send low level handlers */
673 dd->process_pio_send = hfi1_verbs_send_pio;
674 dd->process_dma_send = hfi1_verbs_send_dma;
675 dd->pio_inline_send = pio_copy;
676
677 if (is_a0(dd)) {
678 atomic_set(&dd->drop_packet, DROP_PACKET_ON);
679 dd->do_drop = 1;
680 } else {
681 atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
682 dd->do_drop = 0;
683 }
684
685 /* make sure the link is not "up" */
686 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
687 ppd = dd->pport + pidx;
688 ppd->linkup = 0;
689 }
690
691 if (reinit)
692 ret = init_after_reset(dd);
693 else
694 ret = loadtime_init(dd);
695 if (ret)
696 goto done;
697
698 /* dd->rcd can be NULL if early initialization failed */
699 for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
700 /*
701 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing
702 * re-init, the simplest way to handle this is to free
703 * existing, and re-allocate.
704 * Need to re-create rest of ctxt 0 ctxtdata as well.
705 */
706 rcd = dd->rcd[i];
707 if (!rcd)
708 continue;
709
710 rcd->do_interrupt = &handle_receive_interrupt;
711
712 lastfail = hfi1_create_rcvhdrq(dd, rcd);
713 if (!lastfail)
714 lastfail = hfi1_setup_eagerbufs(rcd);
715 if (lastfail)
716 dd_dev_err(dd,
717 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
718 }
719 if (lastfail)
720 ret = lastfail;
721
722 /* Allocate enough memory for user event notification. */
723 len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
724 sizeof(*dd->events), PAGE_SIZE);
725 dd->events = vmalloc_user(len);
726 if (!dd->events)
727 dd_dev_err(dd, "Failed to allocate user events page\n");
728 /*
729 * Allocate a page for device and port status.
730 * Page will be shared amongst all user processes.
731 */
732 dd->status = vmalloc_user(PAGE_SIZE);
733 if (!dd->status)
734 dd_dev_err(dd, "Failed to allocate dev status page\n");
735 else
736 dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
737 sizeof(dd->status->freezemsg));
738 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
739 ppd = dd->pport + pidx;
740 if (dd->status)
741 /* Currently, we only have one port */
742 ppd->statusp = &dd->status->port;
743
744 set_mtu(ppd);
745 }
746
747 /* enable chip even if we have an error, so we can debug cause */
748 enable_chip(dd);
749
750 ret = hfi1_cq_init(dd);
751done:
752 /*
753 * Set status even if port serdes is not initialized
754 * so that diags will work.
755 */
756 if (dd->status)
757 dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
758 HFI1_STATUS_INITTED;
759 if (!ret) {
760 /* enable all interrupts from the chip */
761 set_intr_state(dd, 1);
762
763 /* chip is OK for user apps; mark it as initialized */
764 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
765 ppd = dd->pport + pidx;
766
767 /* initialize the qsfp if it exists
768 * Requires interrupts to be enabled so we are notified
769 * when the QSFP completes reset, and has
770 * to be done before bringing up the SERDES
771 */
772 init_qsfp(ppd);
773
774 /* start the serdes - must be after interrupts are
 775			 * enabled so we are notified when the link goes up */
776 lastfail = bringup_serdes(ppd);
777 if (lastfail)
778 dd_dev_info(dd,
779 "Failed to bring up port %u\n",
780 ppd->port);
781
782 /*
783 * Set status even if port serdes is not initialized
784 * so that diags will work.
785 */
786 if (ppd->statusp)
787 *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
788 HFI1_STATUS_INITTED;
789 if (!ppd->link_speed_enabled)
790 continue;
791 }
792 }
793
794 /* if ret is non-zero, we probably should do some cleanup here... */
795 return ret;
796}
797
798static inline struct hfi1_devdata *__hfi1_lookup(int unit)
799{
800 return idr_find(&hfi1_unit_table, unit);
801}
802
803struct hfi1_devdata *hfi1_lookup(int unit)
804{
805 struct hfi1_devdata *dd;
806 unsigned long flags;
807
808 spin_lock_irqsave(&hfi1_devs_lock, flags);
809 dd = __hfi1_lookup(unit);
810 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
811
812 return dd;
813}
814
815/*
816 * Stop the timers during unit shutdown, or after an error late
817 * in initialization.
818 */
819static void stop_timers(struct hfi1_devdata *dd)
820{
821 struct hfi1_pportdata *ppd;
822 int pidx;
823
824 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
825 ppd = dd->pport + pidx;
826 if (ppd->led_override_timer.data) {
827 del_timer_sync(&ppd->led_override_timer);
828 atomic_set(&ppd->led_override_timer_active, 0);
829 }
830 }
831}
832
833/**
834 * shutdown_device - shut down a device
835 * @dd: the hfi1_ib device
836 *
837 * This is called to make the device quiet when we are about to
838 * unload the driver, and also when the device is administratively
839 * disabled. It does not free any data structures.
 840 * Everything it does has to be set up again by hfi1_init(dd, 1).
841 */
842static void shutdown_device(struct hfi1_devdata *dd)
843{
844 struct hfi1_pportdata *ppd;
845 unsigned pidx;
846 int i;
847
848 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
849 ppd = dd->pport + pidx;
850
851 ppd->linkup = 0;
852 if (ppd->statusp)
853 *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
854 HFI1_STATUS_IB_READY);
855 }
856 dd->flags &= ~HFI1_INITTED;
857
858 /* mask interrupts, but not errors */
859 set_intr_state(dd, 0);
860
861 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
862 ppd = dd->pport + pidx;
863 for (i = 0; i < dd->num_rcv_contexts; i++)
864 hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
865 HFI1_RCVCTRL_CTXT_DIS |
866 HFI1_RCVCTRL_INTRAVAIL_DIS |
867 HFI1_RCVCTRL_PKEY_DIS |
868 HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
869 /*
870 * Gracefully stop all sends allowing any in progress to
871 * trickle out first.
872 */
873 for (i = 0; i < dd->num_send_contexts; i++)
874 sc_flush(dd->send_contexts[i].sc);
875 }
876
877 /*
878 * Enough for anything that's going to trickle out to have actually
879 * done so.
880 */
881 udelay(20);
882
883 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
884 ppd = dd->pport + pidx;
885
886 /* disable all contexts */
887 for (i = 0; i < dd->num_send_contexts; i++)
888 sc_disable(dd->send_contexts[i].sc);
889 /* disable the send device */
890 pio_send_control(dd, PSC_GLOBAL_DISABLE);
891
892 /*
893 * Clear SerdesEnable.
894 * We can't count on interrupts since we are stopping.
895 */
896 hfi1_quiet_serdes(ppd);
897
898 if (ppd->hfi1_wq) {
899 destroy_workqueue(ppd->hfi1_wq);
900 ppd->hfi1_wq = NULL;
901 }
902 }
903 sdma_exit(dd);
904}
905
906/**
907 * hfi1_free_ctxtdata - free a context's allocated data
908 * @dd: the hfi1_ib device
909 * @rcd: the ctxtdata structure
910 *
911 * free up any allocated data for a context
912 * This should not touch anything that would affect a simultaneous
913 * re-allocation of context data, because it is called after hfi1_mutex
914 * is released (and can be called from reinit as well).
915 * It should never change any chip state, or global driver state.
916 */
917void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
918{
919 unsigned e;
920
921 if (!rcd)
922 return;
923
924 if (rcd->rcvhdrq) {
925 dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
926 rcd->rcvhdrq, rcd->rcvhdrq_phys);
927 rcd->rcvhdrq = NULL;
928 if (rcd->rcvhdrtail_kvaddr) {
929 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
930 (void *)rcd->rcvhdrtail_kvaddr,
931 rcd->rcvhdrqtailaddr_phys);
932 rcd->rcvhdrtail_kvaddr = NULL;
933 }
934 }
935
936 /* all the RcvArray entries should have been cleared by now */
937 kfree(rcd->egrbufs.rcvtids);
938
939 for (e = 0; e < rcd->egrbufs.alloced; e++) {
940 if (rcd->egrbufs.buffers[e].phys)
941 dma_free_coherent(&dd->pcidev->dev,
942 rcd->egrbufs.buffers[e].len,
943 rcd->egrbufs.buffers[e].addr,
944 rcd->egrbufs.buffers[e].phys);
945 }
946 kfree(rcd->egrbufs.buffers);
947
948 sc_free(rcd->sc);
949 vfree(rcd->physshadow);
950 vfree(rcd->tid_pg_list);
951 vfree(rcd->user_event_mask);
952 vfree(rcd->subctxt_uregbase);
953 vfree(rcd->subctxt_rcvegrbuf);
954 vfree(rcd->subctxt_rcvhdr_base);
955 kfree(rcd->tidusemap);
956 kfree(rcd->opstats);
957 kfree(rcd);
958}
959
960void hfi1_free_devdata(struct hfi1_devdata *dd)
961{
962 unsigned long flags;
963
964 spin_lock_irqsave(&hfi1_devs_lock, flags);
965 idr_remove(&hfi1_unit_table, dd->unit);
966 list_del(&dd->list);
967 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
968 hfi1_dbg_ibdev_exit(&dd->verbs_dev);
969 rcu_barrier(); /* wait for rcu callbacks to complete */
970 free_percpu(dd->int_counter);
971 free_percpu(dd->rcv_limit);
972 ib_dealloc_device(&dd->verbs_dev.ibdev);
973}
974
975/*
976 * Allocate our primary per-unit data structure. Must be done via verbs
 977 * allocator, because the verbs cleanup process does both the cleanup and
 978 * the free of the data structure.
979 * "extra" is for chip-specific data.
980 *
981 * Use the idr mechanism to get a unit number for this unit.
982 */
983struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
984{
985 unsigned long flags;
986 struct hfi1_devdata *dd;
987 int ret;
988
989 dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
990 if (!dd)
991 return ERR_PTR(-ENOMEM);
 992	/* extra is sizeof(struct hfi1_pportdata) * number of ports */
993 dd->num_pports = extra / sizeof(struct hfi1_pportdata);
994 dd->pport = (struct hfi1_pportdata *)(dd + 1);
995
996 INIT_LIST_HEAD(&dd->list);
997 dd->node = dev_to_node(&pdev->dev);
998 if (dd->node < 0)
999 dd->node = 0;
1000 idr_preload(GFP_KERNEL);
1001 spin_lock_irqsave(&hfi1_devs_lock, flags);
1002
1003 ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
1004 if (ret >= 0) {
1005 dd->unit = ret;
1006 list_add(&dd->list, &hfi1_dev_list);
1007 }
1008
1009 spin_unlock_irqrestore(&hfi1_devs_lock, flags);
1010 idr_preload_end();
1011
1012 if (ret < 0) {
1013 hfi1_early_err(&pdev->dev,
1014 "Could not allocate unit ID: error %d\n", -ret);
1015 goto bail;
1016 }
1017 /*
1018 * Initialize all locks for the device. This needs to be as early as
1019 * possible so locks are usable.
1020 */
1021 spin_lock_init(&dd->sc_lock);
1022 spin_lock_init(&dd->sendctrl_lock);
1023 spin_lock_init(&dd->rcvctrl_lock);
1024 spin_lock_init(&dd->uctxt_lock);
1025 spin_lock_init(&dd->hfi1_diag_trans_lock);
1026 spin_lock_init(&dd->sc_init_lock);
1027 spin_lock_init(&dd->dc8051_lock);
1028 spin_lock_init(&dd->dc8051_memlock);
1029 mutex_init(&dd->qsfp_i2c_mutex);
1030 seqlock_init(&dd->sc2vl_lock);
1031 spin_lock_init(&dd->sde_map_lock);
1032 init_waitqueue_head(&dd->event_queue);
1033
1034 dd->int_counter = alloc_percpu(u64);
1035 if (!dd->int_counter) {
1036 ret = -ENOMEM;
1037 hfi1_early_err(&pdev->dev,
1038 "Could not allocate per-cpu int_counter\n");
1039 goto bail;
1040 }
1041
1042 dd->rcv_limit = alloc_percpu(u64);
1043 if (!dd->rcv_limit) {
1044 ret = -ENOMEM;
1045 hfi1_early_err(&pdev->dev,
1046 "Could not allocate per-cpu rcv_limit\n");
1047 goto bail;
1048 }
1049
1050 if (!hfi1_cpulist_count) {
1051 u32 count = num_online_cpus();
1052
1053 hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
1054 sizeof(long), GFP_KERNEL);
1055 if (hfi1_cpulist)
1056 hfi1_cpulist_count = count;
1057 else
1058 hfi1_early_err(
1059 &pdev->dev,
1060 "Could not alloc cpulist info, cpu affinity might be wrong\n");
1061 }
1062 hfi1_dbg_ibdev_init(&dd->verbs_dev);
1063 return dd;
1064
1065bail:
1066 if (!list_empty(&dd->list))
1067 list_del_init(&dd->list);
1068 ib_dealloc_device(&dd->verbs_dev.ibdev);
1069 return ERR_PTR(ret);
1070}
1071
1072/*
1073 * Called from freeze mode handlers, and from PCI error
1074 * reporting code. Should be paranoid about state of
1075 * system and data structures.
1076 */
1077void hfi1_disable_after_error(struct hfi1_devdata *dd)
1078{
1079 if (dd->flags & HFI1_INITTED) {
1080 u32 pidx;
1081
1082 dd->flags &= ~HFI1_INITTED;
1083 if (dd->pport)
1084 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1085 struct hfi1_pportdata *ppd;
1086
1087 ppd = dd->pport + pidx;
1088 if (dd->flags & HFI1_PRESENT)
1089 set_link_state(ppd, HLS_DN_DISABLE);
1090
1091 if (ppd->statusp)
1092 *ppd->statusp &= ~HFI1_STATUS_IB_READY;
1093 }
1094 }
1095
1096 /*
1097 * Mark as having had an error for driver, and also
1098 * for /sys and status word mapped to user programs.
1099 * This marks unit as not usable, until reset.
1100 */
1101 if (dd->status)
1102 dd->status->dev |= HFI1_STATUS_HWERROR;
1103}
1104
1105static void remove_one(struct pci_dev *);
1106static int init_one(struct pci_dev *, const struct pci_device_id *);
1107
1108#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
1109#define PFX DRIVER_NAME ": "
1110
1111static const struct pci_device_id hfi1_pci_tbl[] = {
1112 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
1113 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
1114 { 0, }
1115};
1116
1117MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
1118
1119static struct pci_driver hfi1_pci_driver = {
1120 .name = DRIVER_NAME,
1121 .probe = init_one,
1122 .remove = remove_one,
1123 .id_table = hfi1_pci_tbl,
1124 .err_handler = &hfi1_pci_err_handler,
1125};
1126
1127static void __init compute_krcvqs(void)
1128{
1129 int i;
1130
1131 for (i = 0; i < krcvqsset; i++)
1132 n_krcvqs += krcvqs[i];
1133}
1134
1135/*
1136 * Do all the generic driver unit- and chip-independent memory
1137 * allocation and initialization.
1138 */
1139static int __init hfi1_mod_init(void)
1140{
1141 int ret;
1142
1143 ret = dev_init();
1144 if (ret)
1145 goto bail;
1146
1147 /* validate max MTU before any devices start */
1148 if (!valid_opa_max_mtu(hfi1_max_mtu)) {
1149 pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
1150 hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
1151 hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
1152 }
1153 /* valid CUs run from 1-128 in powers of 2 */
1154 if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
1155 hfi1_cu = 1;
1156 /* valid credit return threshold is 0-100, variable is unsigned */
1157 if (user_credit_return_threshold > 100)
1158 user_credit_return_threshold = 100;
1159
1160 compute_krcvqs();
 1161	/* sanitize receive interrupt count; the timeout must wait until after
 1162	 * the hardware type is known */
1163 if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
1164 rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
1165 /* reject invalid combinations */
1166 if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
1167 pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
1168 rcv_intr_count = 1;
1169 }
1170 if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
1171 /*
1172 * Avoid indefinite packet delivery by requiring a timeout
1173 * if count is > 1.
1174 */
1175 pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
1176 rcv_intr_timeout = 1;
1177 }
1178 if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
1179 /*
1180 * The dynamic algorithm expects a non-zero timeout
1181 * and a count > 1.
1182 */
1183 pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
1184 rcv_intr_dynamic = 0;
1185 }
1186
1187 /* sanitize link CRC options */
1188 link_crc_mask &= SUPPORTED_CRCS;
1189
1190 /*
1191 * These must be called before the driver is registered with
1192 * the PCI subsystem.
1193 */
1194 idr_init(&hfi1_unit_table);
1195
1196 hfi1_dbg_init();
1197 ret = pci_register_driver(&hfi1_pci_driver);
1198 if (ret < 0) {
1199 pr_err("Unable to register driver: error %d\n", -ret);
1200 goto bail_dev;
1201 }
1202 goto bail; /* all OK */
1203
1204bail_dev:
1205 hfi1_dbg_exit();
1206 idr_destroy(&hfi1_unit_table);
1207 dev_cleanup();
1208bail:
1209 return ret;
1210}
1211
1212module_init(hfi1_mod_init);
1213
1214/*
1215 * Do the non-unit driver cleanup, memory free, etc. at unload.
1216 */
1217static void __exit hfi1_mod_cleanup(void)
1218{
1219 pci_unregister_driver(&hfi1_pci_driver);
1220 hfi1_dbg_exit();
1221 hfi1_cpulist_count = 0;
1222 kfree(hfi1_cpulist);
1223
1224 idr_destroy(&hfi1_unit_table);
1225 dispose_firmware(); /* asymmetric with obtain_firmware() */
1226 dev_cleanup();
1227}
1228
1229module_exit(hfi1_mod_cleanup);
1230
1231/* this can only be called after a successful initialization */
1232static void cleanup_device_data(struct hfi1_devdata *dd)
1233{
1234 int ctxt;
1235 int pidx;
1236 struct hfi1_ctxtdata **tmp;
1237 unsigned long flags;
1238
1239 /* users can't do anything more with chip */
1240 for (pidx = 0; pidx < dd->num_pports; ++pidx) {
1241 struct hfi1_pportdata *ppd = &dd->pport[pidx];
1242 struct cc_state *cc_state;
1243 int i;
1244
1245 if (ppd->statusp)
1246 *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
1247
1248 for (i = 0; i < OPA_MAX_SLS; i++)
1249 hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
1250
1251 spin_lock(&ppd->cc_state_lock);
1252 cc_state = get_cc_state(ppd);
1253 rcu_assign_pointer(ppd->cc_state, NULL);
1254 spin_unlock(&ppd->cc_state_lock);
1255
1256 if (cc_state)
1257 call_rcu(&cc_state->rcu, cc_state_reclaim);
1258 }
1259
1260 free_credit_return(dd);
1261
1262 /*
1263 * Free any resources still in use (usually just kernel contexts)
 1264 * at unload; we do this for all of ctxtcnt, because that's what we allocate.
1265 * We acquire lock to be really paranoid that rcd isn't being
1266 * accessed from some interrupt-related code (that should not happen,
1267 * but best to be sure).
1268 */
1269 spin_lock_irqsave(&dd->uctxt_lock, flags);
1270 tmp = dd->rcd;
1271 dd->rcd = NULL;
1272 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1273 for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
1274 struct hfi1_ctxtdata *rcd = tmp[ctxt];
1275
1276 tmp[ctxt] = NULL; /* debugging paranoia */
1277 if (rcd) {
1278 hfi1_clear_tids(rcd);
1279 hfi1_free_ctxtdata(dd, rcd);
1280 }
1281 }
1282 kfree(tmp);
1283 /* must follow rcv context free - need to remove rcv's hooks */
1284 for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
1285 sc_free(dd->send_contexts[ctxt].sc);
1286 dd->num_send_contexts = 0;
1287 kfree(dd->send_contexts);
1288 dd->send_contexts = NULL;
1289 kfree(dd->boardname);
1290 vfree(dd->events);
1291 vfree(dd->status);
1292 hfi1_cq_exit(dd);
1293}
1294
1295/*
1296 * Clean up on unit shutdown, or error during unit load after
1297 * successful initialization.
1298 */
1299static void postinit_cleanup(struct hfi1_devdata *dd)
1300{
1301 hfi1_start_cleanup(dd);
1302
1303 hfi1_pcie_ddcleanup(dd);
1304 hfi1_pcie_cleanup(dd->pcidev);
1305
1306 cleanup_device_data(dd);
1307
1308 hfi1_free_devdata(dd);
1309}
1310
1311static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
1312{
1313 int ret = 0, j, pidx, initfail;
1314 struct hfi1_devdata *dd = NULL;
1315
1316 /* First, lock the non-writable module parameters */
1317 HFI1_CAP_LOCK();
1318
1319 /* Validate some global module parameters */
1320 if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
1321 hfi1_early_err(&pdev->dev, "Header queue count too small\n");
1322 ret = -EINVAL;
1323 goto bail;
1324 }
1325 /* use the encoding function as a sanitization check */
1326 if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
1327 hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
1328 hfi1_hdrq_entsize);
1329 goto bail;
1330 }
1331
1332 /* The receive eager buffer size must be set before the receive
1333 * contexts are created.
1334 *
1335 * Set the eager buffer size. Validate that it falls in a range
1336 * allowed by the hardware - all powers of 2 between the min and
1337 * max. The maximum valid MTU is within the eager buffer range
1338 * so we do not need to cap the max_mtu by an eager buffer size
1339 * setting.
1340 */
1341 if (eager_buffer_size) {
1342 if (!is_power_of_2(eager_buffer_size))
1343 eager_buffer_size =
1344 roundup_pow_of_two(eager_buffer_size);
1345 eager_buffer_size =
1346 clamp_val(eager_buffer_size,
1347 MIN_EAGER_BUFFER * 8,
1348 MAX_EAGER_BUFFER_TOTAL);
1349 hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
1350 eager_buffer_size);
1351 } else {
1352 hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
1353 ret = -EINVAL;
1354 goto bail;
1355 }
1356
1357 /* restrict value of hfi1_rcvarr_split */
1358 hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
1359
1360 ret = hfi1_pcie_init(pdev, ent);
1361 if (ret)
1362 goto bail;
1363
1364 /*
1365 * Do device-specific initialization, function table setup, dd
1366 * allocation, etc.
1367 */
1368 switch (ent->device) {
1369 case PCI_DEVICE_ID_INTEL0:
1370 case PCI_DEVICE_ID_INTEL1:
1371 dd = hfi1_init_dd(pdev, ent);
1372 break;
1373 default:
1374 hfi1_early_err(&pdev->dev,
1375 "Failing on unknown Intel deviceid 0x%x\n",
1376 ent->device);
1377 ret = -ENODEV;
1378 }
1379
1380 if (IS_ERR(dd))
1381 ret = PTR_ERR(dd);
1382 if (ret)
1383 goto clean_bail; /* error already printed */
1384
1385 ret = create_workqueues(dd);
1386 if (ret)
1387 goto clean_bail;
1388
1389 /* do the generic initialization */
1390 initfail = hfi1_init(dd, 0);
1391
1392 ret = hfi1_register_ib_device(dd);
1393
1394 /*
 1395	 * Now ready for use. This should be cleared whenever we
 1396	 * detect a reset, or initiate one. If there was an earlier failure,
1397 * we still create devices, so diags, etc. can be used
1398 * to determine cause of problem.
1399 */
1400 if (!initfail && !ret)
1401 dd->flags |= HFI1_INITTED;
1402
1403 j = hfi1_device_create(dd);
1404 if (j)
1405 dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
1406
1407 if (initfail || ret) {
1408 stop_timers(dd);
1409 flush_workqueue(ib_wq);
1410 for (pidx = 0; pidx < dd->num_pports; ++pidx)
1411 hfi1_quiet_serdes(dd->pport + pidx);
1412 if (!j)
1413 hfi1_device_remove(dd);
1414 if (!ret)
1415 hfi1_unregister_ib_device(dd);
1416 postinit_cleanup(dd);
1417 if (initfail)
1418 ret = initfail;
1419 goto bail; /* everything already cleaned */
1420 }
1421
1422 sdma_start(dd);
1423
1424 return 0;
1425
1426clean_bail:
1427 hfi1_pcie_cleanup(pdev);
1428bail:
1429 return ret;
1430}
1431
1432static void remove_one(struct pci_dev *pdev)
1433{
1434 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
1435
1436 /* unregister from IB core */
1437 hfi1_unregister_ib_device(dd);
1438
1439 /*
1440 * Disable the IB link, disable interrupts on the device,
1441 * clear dma engines, etc.
1442 */
1443 shutdown_device(dd);
1444
1445 stop_timers(dd);
1446
1447 /* wait until all of our (qsfp) queue_work() calls complete */
1448 flush_workqueue(ib_wq);
1449
1450 hfi1_device_remove(dd);
1451
1452 postinit_cleanup(dd);
1453}
1454
1455/**
1456 * hfi1_create_rcvhdrq - create a receive header queue
1457 * @dd: the hfi1_ib device
1458 * @rcd: the context data
1459 *
1460 * This must be contiguous memory (from an i/o perspective), and must be
1461 * DMA'able (which means for some systems, it will go through an IOMMU,
1462 * or be forced into a low address range).
1463 */
1464int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
1465{
1466 unsigned amt;
1467 u64 reg;
1468
1469 if (!rcd->rcvhdrq) {
1470 dma_addr_t phys_hdrqtail;
1471 gfp_t gfp_flags;
1472
1473 /*
1474 * rcvhdrqentsize is in DWs, so we have to convert to bytes
1475 * (* sizeof(u32)).
1476 */
1477 amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
1478 sizeof(u32), PAGE_SIZE);
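		/*
		 * With the module parameter defaults above (rcvhdrcnt = 2048,
		 * hdrq_entsize = 32 DWs), amt works out to 2048 * 32 * 4 =
		 * 256 KiB, which is already page aligned.
		 */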
1479
1480 gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
1481 GFP_USER : GFP_KERNEL;
1482 rcd->rcvhdrq = dma_zalloc_coherent(
1483 &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
1484 gfp_flags | __GFP_COMP);
1485
1486 if (!rcd->rcvhdrq) {
1487 dd_dev_err(dd,
1488 "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
1489 amt, rcd->ctxt);
1490 goto bail;
1491 }
1492
1493 /* Event mask is per device now and is in hfi1_devdata */
1494 /*if (rcd->ctxt >= dd->first_user_ctxt) {
1495 rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
1496 if (!rcd->user_event_mask)
1497 goto bail_free_hdrq;
1498 }*/
1499
1500 if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
1501 rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
1502 &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
1503 gfp_flags);
1504 if (!rcd->rcvhdrtail_kvaddr)
1505 goto bail_free;
1506 rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
1507 }
1508
1509 rcd->rcvhdrq_size = amt;
1510 }
1511 /*
1512 * These values are per-context:
1513 * RcvHdrCnt
1514 * RcvHdrEntSize
1515 * RcvHdrSize
1516 */
1517 reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
1518 & RCV_HDR_CNT_CNT_MASK)
1519 << RCV_HDR_CNT_CNT_SHIFT;
1520 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
1521 reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
1522 & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
1523 << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
1524 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
1525 reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
1526 << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
1527 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
1528 return 0;
1529
1530bail_free:
1531 dd_dev_err(dd,
1532 "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
1533 rcd->ctxt);
1534 vfree(rcd->user_event_mask);
1535 rcd->user_event_mask = NULL;
1536 dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
1537 rcd->rcvhdrq_phys);
1538 rcd->rcvhdrq = NULL;
1539bail:
1540 return -ENOMEM;
1541}
1542
1543/**
 1544 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
1545 * @rcd: the context we are setting up.
1546 *
 1547 * Allocate the eager TID buffers and program them into the chip.
 1548 * They are no longer completely contiguous; we do multiple allocation
1549 * calls. Otherwise we get the OOM code involved, by asking for too
1550 * much per call, with disastrous results on some kernels.
1551 */
1552int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
1553{
1554 struct hfi1_devdata *dd = rcd->dd;
1555 u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
1556 gfp_t gfp_flags;
1557 u16 order;
1558 int ret = 0;
1559 u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
1560
1561 /*
1562 * GFP_USER, but without GFP_FS, so buffer cache can be
1563 * coalesced (we hope); otherwise, even at order 4,
1564 * heavy filesystem activity makes these fail, and we can
1565 * use compound pages.
1566 */
1567 gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
1568
1569 /*
 1570	 * The minimum size of the eager buffers is a group of MTU-sized
1571 * buffers.
1572 * The global eager_buffer_size parameter is checked against the
1573 * theoretical lower limit of the value. Here, we check against the
1574 * MTU.
1575 */
1576 if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
1577 rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
1578 /*
1579 * If using one-pkt-per-egr-buffer, lower the eager buffer
1580 * size to the max MTU (page-aligned).
1581 */
1582 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
1583 rcd->egrbufs.rcvtid_size = round_mtu;
1584
1585 /*
 1586	 * Eager buffer sizes of 1MB or less require smaller TID sizes
1587 * to satisfy the "multiple of 8 RcvArray entries" requirement.
1588 */
1589 if (rcd->egrbufs.size <= (1 << 20))
1590 rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
1591 rounddown_pow_of_two(rcd->egrbufs.size / 8));
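	/*
	 * Worked example (hypothetical values): egrbufs.size = 1 MiB and
	 * round_mtu = 8 KiB give rcvtid_size = max(8 KiB,
	 * rounddown_pow_of_two(1 MiB / 8)) = 128 KiB.
	 */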
1592
1593 while (alloced_bytes < rcd->egrbufs.size &&
1594 rcd->egrbufs.alloced < rcd->egrbufs.count) {
1595 rcd->egrbufs.buffers[idx].addr =
1596 dma_zalloc_coherent(&dd->pcidev->dev,
1597 rcd->egrbufs.rcvtid_size,
1598 &rcd->egrbufs.buffers[idx].phys,
1599 gfp_flags);
1600 if (rcd->egrbufs.buffers[idx].addr) {
1601 rcd->egrbufs.buffers[idx].len =
1602 rcd->egrbufs.rcvtid_size;
1603 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
1604 rcd->egrbufs.buffers[idx].addr;
1605 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
1606 rcd->egrbufs.buffers[idx].phys;
1607 rcd->egrbufs.alloced++;
1608 alloced_bytes += rcd->egrbufs.rcvtid_size;
1609 idx++;
1610 } else {
1611 u32 new_size, i, j;
1612 u64 offset = 0;
1613
1614 /*
1615 * Fail the eager buffer allocation if:
1616 * - we are already using the lowest acceptable size
1617 * - we are using one-pkt-per-egr-buffer (this implies
1618 * that we are accepting only one size)
1619 */
1620 if (rcd->egrbufs.rcvtid_size == round_mtu ||
1621 !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
1622 dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
1623 rcd->ctxt);
1624 goto bail_rcvegrbuf_phys;
1625 }
1626
1627 new_size = rcd->egrbufs.rcvtid_size / 2;
1628
1629 /*
1630 * If the first attempt to allocate memory failed, don't
1631 * fail everything but continue with the next lower
1632 * size.
1633 */
1634 if (idx == 0) {
1635 rcd->egrbufs.rcvtid_size = new_size;
1636 continue;
1637 }
1638
1639 /*
1640 * Re-partition already allocated buffers to a smaller
1641 * size.
1642 */
1643 rcd->egrbufs.alloced = 0;
1644 for (i = 0, j = 0, offset = 0; j < idx; i++) {
1645 if (i >= rcd->egrbufs.count)
1646 break;
1647 rcd->egrbufs.rcvtids[i].phys =
1648 rcd->egrbufs.buffers[j].phys + offset;
1649 rcd->egrbufs.rcvtids[i].addr =
1650 rcd->egrbufs.buffers[j].addr + offset;
1651 rcd->egrbufs.alloced++;
1652 if ((rcd->egrbufs.buffers[j].phys + offset +
1653 new_size) ==
1654 (rcd->egrbufs.buffers[j].phys +
1655 rcd->egrbufs.buffers[j].len)) {
1656 j++;
1657 offset = 0;
1658 } else
1659 offset += new_size;
1660 }
1661 rcd->egrbufs.rcvtid_size = new_size;
1662 }
1663 }
1664 rcd->egrbufs.numbufs = idx;
1665 rcd->egrbufs.size = alloced_bytes;
1666
1667 dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
1668 rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
1669 rcd->egrbufs.size);
1670
1671 /*
 1672	 * Set the context's rcv array head update threshold to the closest
1673 * power of 2 (so we can use a mask instead of modulo) below half
1674 * the allocated entries.
1675 */
1676 rcd->egrbufs.threshold =
1677 rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
1678 /*
1679 * Compute the expected RcvArray entry base. This is done after
1680 * allocating the eager buffers in order to maximize the
1681 * expected RcvArray entries for the context.
1682 */
1683 max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
1684 egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
1685 rcd->expected_count = max_entries - egrtop;
1686 if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
1687 rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
1688
1689 rcd->expected_base = rcd->eager_base + egrtop;
1690 dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
1691 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
1692 rcd->eager_base, rcd->expected_base);
1693
1694 if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
1695 dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n",
1696 rcd->ctxt, rcd->egrbufs.rcvtid_size);
1697 ret = -EINVAL;
1698 goto bail;
1699 }
1700
1701 for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
1702 hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
1703 rcd->egrbufs.rcvtids[idx].phys, order);
1704 cond_resched();
1705 }
1706 goto bail;
1707
1708bail_rcvegrbuf_phys:
1709 for (idx = 0; idx < rcd->egrbufs.alloced &&
1710 rcd->egrbufs.buffers[idx].addr;
1711 idx++) {
1712 dma_free_coherent(&dd->pcidev->dev,
1713 rcd->egrbufs.buffers[idx].len,
1714 rcd->egrbufs.buffers[idx].addr,
1715 rcd->egrbufs.buffers[idx].phys);
1716 rcd->egrbufs.buffers[idx].addr = NULL;
1717 rcd->egrbufs.buffers[idx].phys = 0;
1718 rcd->egrbufs.buffers[idx].len = 0;
1719 }
1720bail:
1721 return ret;
1722}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
new file mode 100644
index 000000000000..426582b9ab65
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/intr.c
@@ -0,0 +1,207 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/pci.h>
52#include <linux/delay.h>
53
54#include "hfi.h"
55#include "common.h"
56#include "sdma.h"
57
58/**
59 * format_hwmsg - format a single hwerror message
 60 * @msg: message buffer
 61 * @msgl: length of message buffer
 62 * @hwmsg: message to add to message buffer
63 */
64static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
65{
66 strlcat(msg, "[", msgl);
67 strlcat(msg, hwmsg, msgl);
68 strlcat(msg, "]", msgl);
69}
70
71/**
72 * hfi1_format_hwerrors - format hardware error messages for display
 73 * @hwerrs: hardware errors bit vector
 74 * @hwerrmsgs: hardware error descriptions
 75 * @nhwerrmsgs: number of hwerrmsgs
 76 * @msg: message buffer
 77 * @msgl: message buffer length
78 */
79void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
80 size_t nhwerrmsgs, char *msg, size_t msgl)
81{
82 int i;
83
84 for (i = 0; i < nhwerrmsgs; i++)
85 if (hwerrs & hwerrmsgs[i].mask)
86 format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
87}
88
89static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
90{
91 struct ib_event event;
92 struct hfi1_devdata *dd = ppd->dd;
93
94 /*
95 * Only call ib_dispatch_event() if the IB device has been
 96	 * registered. HFI1_INITTED is set iff the driver has successfully
97 * registered with the IB core.
98 */
99 if (!(dd->flags & HFI1_INITTED))
100 return;
101 event.device = &dd->verbs_dev.ibdev;
102 event.element.port_num = ppd->port;
103 event.event = ev;
104 ib_dispatch_event(&event);
105}
106
107/*
108 * Handle a linkup or link down notification.
109 * This is called outside an interrupt.
110 */
111void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
112{
113 struct hfi1_pportdata *ppd = &dd->pport[0];
114 enum ib_event_type ev;
115
116 if (!(ppd->linkup ^ !!linkup))
117 return; /* no change, nothing to do */
118
119 if (linkup) {
120 /*
121 * Quick linkup and all link up on the simulator does not
122 * trigger or implement:
123 * - VerifyCap interrupt
124 * - VerifyCap frames
125 * But rather moves directly to LinkUp.
126 *
127 * Do the work of the VerifyCap interrupt handler,
128 * handle_verify_cap(), but do not try moving the state to
129 * LinkUp as we are already there.
130 *
131 * NOTE: This uses this device's vAU, vCU, and vl15_init for
 132		 * the remote values. Both sides must be using the same values.
133 */
134 if (quick_linkup
135 || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
136 set_up_vl15(dd, dd->vau, dd->vl15_init);
137 assign_remote_cm_au_table(dd, dd->vcu);
138 ppd->neighbor_guid =
139 read_csr(dd,
140 DC_DC8051_STS_REMOTE_GUID);
141 ppd->neighbor_type =
142 read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
143 DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
144 ppd->neighbor_port_number =
145 read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
146 DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
147 dd_dev_info(dd,
148 "Neighbor GUID: %llx Neighbor type %d\n",
149 ppd->neighbor_guid,
150 ppd->neighbor_type);
151 }
152
153 /* physical link went up */
154 ppd->linkup = 1;
155 ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
156
157 /* link widths are not available until the link is fully up */
158 get_linkup_link_widths(ppd);
159
160 } else {
161 /* physical link went down */
162 ppd->linkup = 0;
163
164 /* clear HW details of the previous connection */
165 reset_link_credits(dd);
166
167 /* freeze after a link down to guarantee a clean egress */
168 start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN);
169
170 ev = IB_EVENT_PORT_ERR;
171
172 hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
173
174 /* if we are down, the neighbor is down */
175 ppd->neighbor_normal = 0;
176
177 /* notify IB of the link change */
178 signal_ib_event(ppd, ev);
179 }
180
181
182}
183
184/*
185 * Handle receive or urgent interrupts for user contexts. This means a user
186 * process was waiting for a packet to arrive, and didn't want to poll.
187 */
188void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
189{
190 struct hfi1_devdata *dd = rcd->dd;
191 unsigned long flags;
192
193 spin_lock_irqsave(&dd->uctxt_lock, flags);
194 if (!rcd->cnt)
195 goto done;
196
197 if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
198 wake_up_interruptible(&rcd->wait);
199 hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
200 } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
201 &rcd->event_flags)) {
202 rcd->urgent++;
203 wake_up_interruptible(&rcd->wait);
204 }
205done:
206 spin_unlock_irqrestore(&dd->uctxt_lock, flags);
207}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
new file mode 100644
index 000000000000..fa361b405851
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/iowait.h
@@ -0,0 +1,186 @@
1#ifndef _HFI1_IOWAIT_H
2#define _HFI1_IOWAIT_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53#include <linux/list.h>
54#include <linux/workqueue.h>
55#include <linux/sched.h>
56
57/*
58 * typedef (*restart_t)() - restart callback
59 * @work: pointer to work structure
60 */
61typedef void (*restart_t)(struct work_struct *work);
62
63struct sdma_txreq;
64struct sdma_engine;
65/**
66 * struct iowait - linkage for delayed progress/waiting
67 * @list: used to add/insert into QP/PQ wait lists
68 * @tx_head: overflow list of sdma_txreq's
69 * @sleep: no space callback
70 * @wakeup: space callback
71 * @iowork: workqueue overhead
72 * @wait_dma: wait for sdma_busy == 0
73 * @sdma_busy: # of packets in flight
74 * @count: total number of descriptors in the tx_head'ed list
75 * @tx_limit: limit for overflow queuing
76 * @tx_count: number of tx entries in the tx_head'ed list
77 *
78 * This is to be embedded in the user's state structure
79 * (QP or PQ).
80 *
81 * The sleep and wakeup members are a
82 * bit misnamed.  Strictly speaking they do
83 * not sleep or wake up; they are callbacks
84 * for the ULP to implement whatever
85 * queuing/dequeuing of the embedded iowait
86 * and its containing struct is needed when a
87 * resource shortage such as SDMA ring space is seen.
88 *
89 * Both are potentially called with locks held,
90 * so sleeping is not allowed.
91 *
92 * The wait_dma member, together with sdma_busy, backs iowait_sdma_drain().
93 */
94
95struct iowait {
96 struct list_head list;
97 struct list_head tx_head;
98 int (*sleep)(
99 struct sdma_engine *sde,
100 struct iowait *wait,
101 struct sdma_txreq *tx,
102 unsigned seq);
103 void (*wakeup)(struct iowait *wait, int reason);
104 struct work_struct iowork;
105 wait_queue_head_t wait_dma;
106 atomic_t sdma_busy;
107 u32 count;
108 u32 tx_limit;
109 u32 tx_count;
110};
111
112#define SDMA_AVAIL_REASON 0
113
114/**
115 * iowait_init() - initialize wait structure
116 * @wait: wait struct to initialize
117 * @tx_limit: limit for overflow queuing
118 * @func: restart function for workqueue
119 * @sleep: sleep function for no space
120 * @wakeup: wakeup function called when space becomes available
121 *
122 * This function initializes the iowait
123 * structure embedded in the QP or PQ.
124 *
125 */
126
127static inline void iowait_init(
128 struct iowait *wait,
129 u32 tx_limit,
130 void (*func)(struct work_struct *work),
131 int (*sleep)(
132 struct sdma_engine *sde,
133 struct iowait *wait,
134 struct sdma_txreq *tx,
135 unsigned seq),
136 void (*wakeup)(struct iowait *wait, int reason))
137{
138 wait->count = 0;
139 INIT_LIST_HEAD(&wait->list);
140 INIT_LIST_HEAD(&wait->tx_head);
141 INIT_WORK(&wait->iowork, func);
142 init_waitqueue_head(&wait->wait_dma);
143 atomic_set(&wait->sdma_busy, 0);
144 wait->tx_limit = tx_limit;
145 wait->sleep = sleep;
146 wait->wakeup = wakeup;
147}
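/*
 * Illustrative sketch of how a ULP might embed and initialize an iowait;
 * my_qp_priv, qp_restart, qp_sleep and qp_wakeup are hypothetical
 * placeholder names for the ULP's own state and callbacks:
 *
 *	struct my_qp_priv {
 *		struct iowait wait;
 *		...
 *	};
 *
 *	static void qp_restart(struct work_struct *work) { ... }
 *	static int qp_sleep(struct sdma_engine *sde, struct iowait *wait,
 *			    struct sdma_txreq *tx, unsigned seq) { ... }
 *	static void qp_wakeup(struct iowait *wait, int reason) { ... }
 *
 *	iowait_init(&priv->wait, 0, qp_restart, qp_sleep, qp_wakeup);
 */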
148
149/**
150 * iowait_schedule() - schedule the iowait work on a workqueue
151 * @wait: wait struct to schedule
152 * @wq: workqueue for schedule
153 */
154static inline void iowait_schedule(
155 struct iowait *wait,
156 struct workqueue_struct *wq)
157{
158 queue_work(wq, &wait->iowork);
159}
160
161/**
162 * iowait_sdma_drain() - wait for DMAs to drain
163 *
164 * @wait: iowait structure
165 *
166 * This will delay until the iowait sdmas have
167 * completed.
168 */
169static inline void iowait_sdma_drain(struct iowait *wait)
170{
171 wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
172}
173
174/**
175 * iowait_drain_wakeup() - trigger iowait_drain() waiter
176 *
177 * @wait: iowait structure
178 *
179 * This will trigger any waiters.
180 */
181static inline void iowait_drain_wakeup(struct iowait *wait)
182{
183 wake_up(&wait->wait_dma);
184}
185
186#endif
diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c
new file mode 100644
index 000000000000..f6eff177ace1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/keys.c
@@ -0,0 +1,411 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include "hfi.h"
52
53/**
54 * hfi1_alloc_lkey - allocate an lkey
55 * @mr: memory region that this lkey protects
56 * @dma_region: 0->normal key, 1->restricted DMA key
57 *
58 * Returns 0 if successful, otherwise returns -errno.
59 *
60 * Increments mr reference count as required.
61 *
62 * Sets the lkey field of mr for non-dma regions.
63 *
64 */
65
66int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region)
67{
68 unsigned long flags;
69 u32 r;
70 u32 n;
71 int ret = 0;
72 struct hfi1_ibdev *dev = to_idev(mr->pd->device);
73 struct hfi1_lkey_table *rkt = &dev->lk_table;
74
75 hfi1_get_mr(mr);
76 spin_lock_irqsave(&rkt->lock, flags);
77
78 /* special case for dma_mr lkey == 0 */
79 if (dma_region) {
80 struct hfi1_mregion *tmr;
81
82 tmr = rcu_access_pointer(dev->dma_mr);
83 if (!tmr) {
84 rcu_assign_pointer(dev->dma_mr, mr);
85 mr->lkey_published = 1;
86 } else {
87 hfi1_put_mr(mr);
88 }
89 goto success;
90 }
91
92 /* Find the next available LKEY */
93 r = rkt->next;
94 n = r;
95 for (;;) {
96 if (!rcu_access_pointer(rkt->table[r]))
97 break;
98 r = (r + 1) & (rkt->max - 1);
99 if (r == n)
100 goto bail;
101 }
102 rkt->next = (r + 1) & (rkt->max - 1);
103 /*
104 * Make sure lkey is never zero which is reserved to indicate an
105 * unrestricted LKEY.
106 */
107 rkt->gen++;
108 /*
109 * bits are capped in verbs.c to ensure enough bits for
110 * generation number
111 */
112 mr->lkey = (r << (32 - hfi1_lkey_table_size)) |
113 ((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen)
114 << 8);
115 if (mr->lkey == 0) {
116 mr->lkey |= 1 << 8;
117 rkt->gen++;
118 }
119 rcu_assign_pointer(rkt->table[r], mr);
120 mr->lkey_published = 1;
121success:
122 spin_unlock_irqrestore(&rkt->lock, flags);
123out:
124 return ret;
125bail:
126 hfi1_put_mr(mr);
127 spin_unlock_irqrestore(&rkt->lock, flags);
128 ret = -ENOMEM;
129 goto out;
130}
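/*
 * Worked example of the lkey layout above, assuming the default
 * hfi1_lkey_table_size of 16: the table index r occupies bits 31:16
 * (r << (32 - 16)) and the low 8 bits of rkt->gen occupy bits 15:8
 * ((1 << (24 - 16)) - 1 == 0xff).  So r = 5 with gen = 0x2a yields
 * lkey = 0x00052a00.  If the result happens to be 0, bit 8 is set so
 * that lkey 0 stays reserved for the unrestricted DMA key.
 */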
131
132/**
133 * hfi1_free_lkey - free an lkey
134 * @mr: mr to free from tables
135 */
136void hfi1_free_lkey(struct hfi1_mregion *mr)
137{
138 unsigned long flags;
139 u32 lkey = mr->lkey;
140 u32 r;
141 struct hfi1_ibdev *dev = to_idev(mr->pd->device);
142 struct hfi1_lkey_table *rkt = &dev->lk_table;
143 int freed = 0;
144
145 spin_lock_irqsave(&rkt->lock, flags);
146 if (!mr->lkey_published)
147 goto out;
148 if (lkey == 0)
149 RCU_INIT_POINTER(dev->dma_mr, NULL);
150 else {
151 r = lkey >> (32 - hfi1_lkey_table_size);
152 RCU_INIT_POINTER(rkt->table[r], NULL);
153 }
154 mr->lkey_published = 0;
155 freed++;
156out:
157 spin_unlock_irqrestore(&rkt->lock, flags);
158 if (freed) {
159 synchronize_rcu();
160 hfi1_put_mr(mr);
161 }
162}
163
164/**
165 * hfi1_lkey_ok - check IB SGE for validity and initialize
166 * @rkt: table containing lkey to check SGE against
167 * @pd: protection domain
168 * @isge: outgoing internal SGE
169 * @sge: SGE to check
170 * @acc: access flags
171 *
172 * Return 1 if valid and successful, otherwise returns 0.
173 *
174 * increments the reference count upon success
175 *
176 * Check the IB SGE for validity and initialize our internal version
177 * of it.
178 */
179int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
180 struct hfi1_sge *isge, struct ib_sge *sge, int acc)
181{
182 struct hfi1_mregion *mr;
183 unsigned n, m;
184 size_t off;
185
186 /*
187 * We use LKEY == zero for kernel virtual addresses
188 * (see hfi1_get_dma_mr and dma.c).
189 */
190 rcu_read_lock();
191 if (sge->lkey == 0) {
192 struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
193
194 if (pd->user)
195 goto bail;
196 mr = rcu_dereference(dev->dma_mr);
197 if (!mr)
198 goto bail;
199 atomic_inc(&mr->refcount);
200 rcu_read_unlock();
201
202 isge->mr = mr;
203 isge->vaddr = (void *) sge->addr;
204 isge->length = sge->length;
205 isge->sge_length = sge->length;
206 isge->m = 0;
207 isge->n = 0;
208 goto ok;
209 }
210 mr = rcu_dereference(
211 rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]);
212 if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
213 goto bail;
214
215 off = sge->addr - mr->user_base;
216 if (unlikely(sge->addr < mr->user_base ||
217 off + sge->length > mr->length ||
218 (mr->access_flags & acc) != acc))
219 goto bail;
220 atomic_inc(&mr->refcount);
221 rcu_read_unlock();
222
223 off += mr->offset;
224 if (mr->page_shift) {
225 /*
226  * Page sizes are a uniform power of 2, so no loop is necessary;
227  * entries_spanned_by_off is the number of times the loop below
228  * would have executed.
229  */
230 size_t entries_spanned_by_off;
231
232 entries_spanned_by_off = off >> mr->page_shift;
233 off -= (entries_spanned_by_off << mr->page_shift);
234 m = entries_spanned_by_off / HFI1_SEGSZ;
235 n = entries_spanned_by_off % HFI1_SEGSZ;
236 } else {
237 m = 0;
238 n = 0;
239 while (off >= mr->map[m]->segs[n].length) {
240 off -= mr->map[m]->segs[n].length;
241 n++;
242 if (n >= HFI1_SEGSZ) {
243 m++;
244 n = 0;
245 }
246 }
247 }
248 isge->mr = mr;
249 isge->vaddr = mr->map[m]->segs[n].vaddr + off;
250 isge->length = mr->map[m]->segs[n].length - off;
251 isge->sge_length = sge->length;
252 isge->m = m;
253 isge->n = n;
254ok:
255 return 1;
256bail:
257 rcu_read_unlock();
258 return 0;
259}
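/*
 * Worked example of the page_shift fast path above, assuming 4 KiB
 * pages (mr->page_shift == 12) and off == 0x6234 after mr->offset has
 * been added: entries_spanned_by_off = 0x6234 >> 12 = 6, the residual
 * off becomes 0x6234 - (6 << 12) = 0x234, and the indices are
 * m = 6 / HFI1_SEGSZ and n = 6 % HFI1_SEGSZ -- the same (m, n, off)
 * the else-branch loop would have produced one step at a time.
 */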
260
261/**
262 * hfi1_rkey_ok - check the IB virtual address, length, and RKEY
263 * @qp: qp for validation
264 * @sge: SGE state
265 * @len: length of data
266 * @vaddr: virtual address to place data
267 * @rkey: rkey to check
268 * @acc: access flags
269 *
270 * Return 1 if successful, otherwise 0.
271 *
272 * increments the reference count upon success
273 */
274int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
275 u32 len, u64 vaddr, u32 rkey, int acc)
276{
277 struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
278 struct hfi1_mregion *mr;
279 unsigned n, m;
280 size_t off;
281
282 /*
283 * We use RKEY == zero for kernel virtual addresses
284 * (see hfi1_get_dma_mr and dma.c).
285 */
286 rcu_read_lock();
287 if (rkey == 0) {
288 struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
289 struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
290
291 if (pd->user)
292 goto bail;
293 mr = rcu_dereference(dev->dma_mr);
294 if (!mr)
295 goto bail;
296 atomic_inc(&mr->refcount);
297 rcu_read_unlock();
298
299 sge->mr = mr;
300 sge->vaddr = (void *) vaddr;
301 sge->length = len;
302 sge->sge_length = len;
303 sge->m = 0;
304 sge->n = 0;
305 goto ok;
306 }
307
308 mr = rcu_dereference(
309 rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]);
310 if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
311 goto bail;
312
313 off = vaddr - mr->iova;
314 if (unlikely(vaddr < mr->iova || off + len > mr->length ||
315 (mr->access_flags & acc) == 0))
316 goto bail;
317 atomic_inc(&mr->refcount);
318 rcu_read_unlock();
319
320 off += mr->offset;
321 if (mr->page_shift) {
322 /*
323  * Page sizes are a uniform power of 2, so no loop is necessary;
324  * entries_spanned_by_off is the number of times the loop below
325  * would have executed.
326  */
327 size_t entries_spanned_by_off;
328
329 entries_spanned_by_off = off >> mr->page_shift;
330 off -= (entries_spanned_by_off << mr->page_shift);
331 m = entries_spanned_by_off / HFI1_SEGSZ;
332 n = entries_spanned_by_off % HFI1_SEGSZ;
333 } else {
334 m = 0;
335 n = 0;
336 while (off >= mr->map[m]->segs[n].length) {
337 off -= mr->map[m]->segs[n].length;
338 n++;
339 if (n >= HFI1_SEGSZ) {
340 m++;
341 n = 0;
342 }
343 }
344 }
345 sge->mr = mr;
346 sge->vaddr = mr->map[m]->segs[n].vaddr + off;
347 sge->length = mr->map[m]->segs[n].length - off;
348 sge->sge_length = len;
349 sge->m = m;
350 sge->n = n;
351ok:
352 return 1;
353bail:
354 rcu_read_unlock();
355 return 0;
356}
357
358/*
359 * Initialize the memory region specified by the work request.
360 */
361int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr)
362{
363 struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
364 struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
365 struct hfi1_mregion *mr;
366 u32 rkey = wr->wr.fast_reg.rkey;
367 unsigned i, n, m;
368 int ret = -EINVAL;
369 unsigned long flags;
370 u64 *page_list;
371 size_t ps;
372
373 spin_lock_irqsave(&rkt->lock, flags);
374 if (pd->user || rkey == 0)
375 goto bail;
376
377 mr = rcu_dereference_protected(
378 rkt->table[(rkey >> (32 - hfi1_lkey_table_size))],
379 lockdep_is_held(&rkt->lock));
380 if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
381 goto bail;
382
383 if (wr->wr.fast_reg.page_list_len > mr->max_segs)
384 goto bail;
385
386 ps = 1UL << wr->wr.fast_reg.page_shift;
387 if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
388 goto bail;
389
390 mr->user_base = wr->wr.fast_reg.iova_start;
391 mr->iova = wr->wr.fast_reg.iova_start;
392 mr->lkey = rkey;
393 mr->length = wr->wr.fast_reg.length;
394 mr->access_flags = wr->wr.fast_reg.access_flags;
395 page_list = wr->wr.fast_reg.page_list->page_list;
396 m = 0;
397 n = 0;
398 for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
399 mr->map[m]->segs[n].vaddr = (void *) page_list[i];
400 mr->map[m]->segs[n].length = ps;
401 if (++n == HFI1_SEGSZ) {
402 m++;
403 n = 0;
404 }
405 }
406
407 ret = 0;
408bail:
409 spin_unlock_irqrestore(&rkt->lock, flags);
410 return ret;
411}
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
new file mode 100644
index 000000000000..37269eb90c34
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -0,0 +1,4257 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/net.h>
52#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
53 / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
54
55#include "hfi.h"
56#include "mad.h"
57#include "trace.h"
58
59/* the reset value from the FM is supposed to be 0xffff, handle both */
60#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
61#define OPA_LINK_WIDTH_RESET 0xffff
62
63static int reply(struct ib_mad_hdr *smp)
64{
65 /*
66 * The verbs framework will handle the directed/LID route
67 * packet changes.
68 */
69 smp->method = IB_MGMT_METHOD_GET_RESP;
70 if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
71 smp->status |= IB_SMP_DIRECTION;
72 return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
73}
74
75static inline void clear_opa_smp_data(struct opa_smp *smp)
76{
77 void *data = opa_get_smp_data(smp);
78 size_t size = opa_get_smp_data_size(smp);
79
80 memset(data, 0, size);
81}
82
83static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
84{
85 struct ib_mad_send_buf *send_buf;
86 struct ib_mad_agent *agent;
87 struct ib_smp *smp;
88 int ret;
89 unsigned long flags;
90 unsigned long timeout;
91 int pkey_idx;
92 u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
93
94 agent = ibp->send_agent;
95 if (!agent)
96 return;
97
98 /* o14-3.2.1 */
99 if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
100 return;
101
102 /* o14-2 */
103 if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
104 return;
105
106 pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
107 if (pkey_idx < 0) {
108 pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
109 __func__, hfi1_get_pkey(ibp, 1));
110 pkey_idx = 1;
111 }
112
113 send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
114 IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
115 GFP_ATOMIC, IB_MGMT_BASE_VERSION);
116 if (IS_ERR(send_buf))
117 return;
118
119 smp = send_buf->mad;
120 smp->base_version = IB_MGMT_BASE_VERSION;
121 smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
122 smp->class_version = 1;
123 smp->method = IB_MGMT_METHOD_TRAP;
124 ibp->tid++;
125 smp->tid = cpu_to_be64(ibp->tid);
126 smp->attr_id = IB_SMP_ATTR_NOTICE;
127 /* o14-1: smp->mkey = 0; */
128 memcpy(smp->data, data, len);
129
130 spin_lock_irqsave(&ibp->lock, flags);
131 if (!ibp->sm_ah) {
132 if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
133 struct ib_ah *ah;
134
135 ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid);
136 if (IS_ERR(ah))
137 ret = PTR_ERR(ah);
138 else {
139 send_buf->ah = ah;
140 ibp->sm_ah = to_iah(ah);
141 ret = 0;
142 }
143 } else
144 ret = -EINVAL;
145 } else {
146 send_buf->ah = &ibp->sm_ah->ibah;
147 ret = 0;
148 }
149 spin_unlock_irqrestore(&ibp->lock, flags);
150
151 if (!ret)
152 ret = ib_post_send_mad(send_buf, NULL);
153 if (!ret) {
154 /* 4.096 usec. */
155 timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
156 ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
157 } else {
158 ib_free_send_mad(send_buf);
159 ibp->trap_timeout = 0;
160 }
161}
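/*
 * Worked example of the trap pacing above: with subnet_timeout == 18
 * the timeout is (4096 * 2^18) / 1000 usec, roughly 1.07 seconds, so
 * under the o14-2 check no further trap is posted until that window
 * has expired.
 */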
162
163/*
164 * Send a bad [PQ]_Key trap (ch. 14.3.8).
165 */
166void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
167 u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
168{
169 struct ib_mad_notice_attr data;
170
171 if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
172 ibp->pkey_violations++;
173 else
174 ibp->qkey_violations++;
175 ibp->n_pkt_drops++;
176
177 /* Send violation trap */
178 data.generic_type = IB_NOTICE_TYPE_SECURITY;
179 data.prod_type_msb = 0;
180 data.prod_type_lsb = IB_NOTICE_PROD_CA;
181 data.trap_num = trap_num;
182 data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
183 data.toggle_count = 0;
184 memset(&data.details, 0, sizeof(data.details));
185 data.details.ntc_257_258.lid1 = lid1;
186 data.details.ntc_257_258.lid2 = lid2;
187 data.details.ntc_257_258.key = cpu_to_be32(key);
188 data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
189 data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
190
191 send_trap(ibp, &data, sizeof(data));
192}
193
194/*
195 * Send a bad M_Key trap (ch. 14.3.9).
196 */
197static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
198 __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
199{
200 struct ib_mad_notice_attr data;
201
202 /* Send violation trap */
203 data.generic_type = IB_NOTICE_TYPE_SECURITY;
204 data.prod_type_msb = 0;
205 data.prod_type_lsb = IB_NOTICE_PROD_CA;
206 data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
207 data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
208 data.toggle_count = 0;
209 memset(&data.details, 0, sizeof(data.details));
210 data.details.ntc_256.lid = data.issuer_lid;
211 data.details.ntc_256.method = mad->method;
212 data.details.ntc_256.attr_id = mad->attr_id;
213 data.details.ntc_256.attr_mod = mad->attr_mod;
214 data.details.ntc_256.mkey = mkey;
215 if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
216
217 data.details.ntc_256.dr_slid = (__force __be16)dr_slid;
218 data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
219 if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
220 data.details.ntc_256.dr_trunc_hop |=
221 IB_NOTICE_TRAP_DR_TRUNC;
222 hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
223 }
224 data.details.ntc_256.dr_trunc_hop |= hop_cnt;
225 memcpy(data.details.ntc_256.dr_rtn_path, return_path,
226 hop_cnt);
227 }
228
229 send_trap(ibp, &data, sizeof(data));
230}
231
232/*
233 * Send a Port Capability Mask Changed trap (ch. 14.3.11).
234 */
235void hfi1_cap_mask_chg(struct hfi1_ibport *ibp)
236{
237 struct ib_mad_notice_attr data;
238
239 data.generic_type = IB_NOTICE_TYPE_INFO;
240 data.prod_type_msb = 0;
241 data.prod_type_lsb = IB_NOTICE_PROD_CA;
242 data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
243 data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
244 data.toggle_count = 0;
245 memset(&data.details, 0, sizeof(data.details));
246 data.details.ntc_144.lid = data.issuer_lid;
247 data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
248
249 send_trap(ibp, &data, sizeof(data));
250}
251
252/*
253 * Send a System Image GUID Changed trap (ch. 14.3.12).
254 */
255void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
256{
257 struct ib_mad_notice_attr data;
258
259 data.generic_type = IB_NOTICE_TYPE_INFO;
260 data.prod_type_msb = 0;
261 data.prod_type_lsb = IB_NOTICE_PROD_CA;
262 data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
263 data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
264 data.toggle_count = 0;
265 memset(&data.details, 0, sizeof(data.details));
266 data.details.ntc_145.lid = data.issuer_lid;
267 data.details.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
268
269 send_trap(ibp, &data, sizeof(data));
270}
271
272/*
273 * Send a Node Description Changed trap (ch. 14.3.13).
274 */
275void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
276{
277 struct ib_mad_notice_attr data;
278
279 data.generic_type = IB_NOTICE_TYPE_INFO;
280 data.prod_type_msb = 0;
281 data.prod_type_lsb = IB_NOTICE_PROD_CA;
282 data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
283 data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
284 data.toggle_count = 0;
285 memset(&data.details, 0, sizeof(data.details));
286 data.details.ntc_144.lid = data.issuer_lid;
287 data.details.ntc_144.local_changes = 1;
288 data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
289
290 send_trap(ibp, &data, sizeof(data));
291}
292
293static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
294 u8 *data, struct ib_device *ibdev,
295 u8 port, u32 *resp_len)
296{
297 struct opa_node_description *nd;
298
299 if (am) {
300 smp->status |= IB_SMP_INVALID_FIELD;
301 return reply((struct ib_mad_hdr *)smp);
302 }
303
304 nd = (struct opa_node_description *)data;
305
306 memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
307
308 if (resp_len)
309 *resp_len += sizeof(*nd);
310
311 return reply((struct ib_mad_hdr *)smp);
312}
313
314static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
315 struct ib_device *ibdev, u8 port,
316 u32 *resp_len)
317{
318 struct opa_node_info *ni;
319 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
320 unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
321
322 ni = (struct opa_node_info *)data;
323
324 /* GUID 0 is illegal */
325 if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
326 smp->status |= IB_SMP_INVALID_FIELD;
327 return reply((struct ib_mad_hdr *)smp);
328 }
329
330 ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
331 ni->base_version = OPA_MGMT_BASE_VERSION;
332 ni->class_version = OPA_SMI_CLASS_VERSION;
333 ni->node_type = 1; /* channel adapter */
334 ni->num_ports = ibdev->phys_port_cnt;
335 /* This is already in network order */
336 ni->system_image_guid = ib_hfi1_sys_image_guid;
337 /* Use first-port GUID as node */
338 ni->node_guid = cpu_to_be64(dd->pport->guid);
339 ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
340 ni->device_id = cpu_to_be16(dd->pcidev->device);
341 ni->revision = cpu_to_be32(dd->minrev);
342 ni->local_port_num = port;
343 ni->vendor_id[0] = dd->oui1;
344 ni->vendor_id[1] = dd->oui2;
345 ni->vendor_id[2] = dd->oui3;
346
347 if (resp_len)
348 *resp_len += sizeof(*ni);
349
350 return reply((struct ib_mad_hdr *)smp);
351}
352
353static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
354 u8 port)
355{
356 struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
357 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
358 unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
359
360 /* GUID 0 is illegal */
361 if (smp->attr_mod || pidx >= dd->num_pports ||
362 dd->pport[pidx].guid == 0)
363 smp->status |= IB_SMP_INVALID_FIELD;
364 else
365 nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
366
367 nip->base_version = OPA_MGMT_BASE_VERSION;
368 nip->class_version = OPA_SMI_CLASS_VERSION;
369 nip->node_type = 1; /* channel adapter */
370 nip->num_ports = ibdev->phys_port_cnt;
371 /* This is already in network order */
372 nip->sys_guid = ib_hfi1_sys_image_guid;
373 /* Use first-port GUID as node */
374 nip->node_guid = cpu_to_be64(dd->pport->guid);
375 nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
376 nip->device_id = cpu_to_be16(dd->pcidev->device);
377 nip->revision = cpu_to_be32(dd->minrev);
378 nip->local_port_num = port;
379 nip->vendor_id[0] = dd->oui1;
380 nip->vendor_id[1] = dd->oui2;
381 nip->vendor_id[2] = dd->oui3;
382
383 return reply((struct ib_mad_hdr *)smp);
384}
385
386static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
387{
388 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
389}
390
391static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
392{
393 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
394}
395
396static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
397{
398 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
399}
400
401static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
402 int mad_flags, __be64 mkey, __be32 dr_slid,
403 u8 return_path[], u8 hop_cnt)
404{
405 int valid_mkey = 0;
406 int ret = 0;
407
408 /* Is the mkey in the process of expiring? */
409 if (ibp->mkey_lease_timeout &&
410 time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
411 /* Clear timeout and mkey protection field. */
412 ibp->mkey_lease_timeout = 0;
413 ibp->mkeyprot = 0;
414 }
415
416 if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 ||
417 ibp->mkey == mkey)
418 valid_mkey = 1;
419
420 /* Unset lease timeout on any valid Get/Set/TrapRepress */
421 if (valid_mkey && ibp->mkey_lease_timeout &&
422 (mad->method == IB_MGMT_METHOD_GET ||
423 mad->method == IB_MGMT_METHOD_SET ||
424 mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
425 ibp->mkey_lease_timeout = 0;
426
427 if (!valid_mkey) {
428 switch (mad->method) {
429 case IB_MGMT_METHOD_GET:
430 /* Bad mkey not a violation below level 2 */
431 if (ibp->mkeyprot < 2)
432 break;
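			/* mkeyprot >= 2: fall through and treat the Get like a Set */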
433 case IB_MGMT_METHOD_SET:
434 case IB_MGMT_METHOD_TRAP_REPRESS:
435 if (ibp->mkey_violations != 0xFFFF)
436 ++ibp->mkey_violations;
437 if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
438 ibp->mkey_lease_timeout = jiffies +
439 ibp->mkey_lease_period * HZ;
440 /* Generate a trap notice. */
441 bad_mkey(ibp, mad, mkey, dr_slid, return_path,
442 hop_cnt);
443 ret = 1;
444 }
445 }
446
447 return ret;
448}
449
450/*
451 * The SMA caches reads from LCB registers in case the LCB is unavailable.
452 * (The LCB is unavailable in certain link states, for example.)
453 */
454struct lcb_datum {
455 u32 off;
456 u64 val;
457};
458
459static struct lcb_datum lcb_cache[] = {
460 { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
461};
462
463static int write_lcb_cache(u32 off, u64 val)
464{
465 int i;
466
467 for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
468 if (lcb_cache[i].off == off) {
469 lcb_cache[i].val = val;
470 return 0;
471 }
472 }
473
474 pr_warn("%s bad offset 0x%x\n", __func__, off);
475 return -1;
476}
477
478static int read_lcb_cache(u32 off, u64 *val)
479{
480 int i;
481
482 for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
483 if (lcb_cache[i].off == off) {
484 *val = lcb_cache[i].val;
485 return 0;
486 }
487 }
488
489 pr_warn("%s bad offset 0x%x\n", __func__, off);
490 return -1;
491}
492
493void read_ltp_rtt(struct hfi1_devdata *dd)
494{
495 u64 reg;
496
497 if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
498 dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
499 else
500 write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
501}
502
503static u8 __opa_porttype(struct hfi1_pportdata *ppd)
504{
505 if (qsfp_mod_present(ppd)) {
506 if (ppd->qsfp_info.cache_valid)
507 return OPA_PORT_TYPE_STANDARD;
508 return OPA_PORT_TYPE_DISCONNECTED;
509 }
510 return OPA_PORT_TYPE_UNKNOWN;
511}
512
513static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
514 struct ib_device *ibdev, u8 port,
515 u32 *resp_len)
516{
517 int i;
518 struct hfi1_devdata *dd;
519 struct hfi1_pportdata *ppd;
520 struct hfi1_ibport *ibp;
521 struct opa_port_info *pi = (struct opa_port_info *)data;
522 u8 mtu;
523 u8 credit_rate;
524 u32 state;
525 u32 num_ports = OPA_AM_NPORT(am);
526 u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
527 u32 buffer_units;
528 u64 tmp = 0;
529
530 if (num_ports != 1) {
531 smp->status |= IB_SMP_INVALID_FIELD;
532 return reply((struct ib_mad_hdr *)smp);
533 }
534
535 dd = dd_from_ibdev(ibdev);
536 /* IB numbers ports from 1, hw from 0 */
537 ppd = dd->pport + (port - 1);
538 ibp = &ppd->ibport_data;
539
540 if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
541 ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
542 smp->status |= IB_SMP_INVALID_FIELD;
543 return reply((struct ib_mad_hdr *)smp);
544 }
545
546 pi->lid = cpu_to_be32(ppd->lid);
547
548 /* Only return the mkey if the protection field allows it. */
549 if (!(smp->method == IB_MGMT_METHOD_GET &&
550 ibp->mkey != smp->mkey &&
551 ibp->mkeyprot == 1))
552 pi->mkey = ibp->mkey;
553
554 pi->subnet_prefix = ibp->gid_prefix;
555 pi->sm_lid = cpu_to_be32(ibp->sm_lid);
556 pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags);
557 pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
558 pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
559 pi->sa_qp = cpu_to_be32(ppd->sa_qp);
560
561 pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
562 pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
563 pi->link_width.active = cpu_to_be16(ppd->link_width_active);
564
565 pi->link_width_downgrade.supported =
566 cpu_to_be16(ppd->link_width_downgrade_supported);
567 pi->link_width_downgrade.enabled =
568 cpu_to_be16(ppd->link_width_downgrade_enabled);
569 pi->link_width_downgrade.tx_active =
570 cpu_to_be16(ppd->link_width_downgrade_tx_active);
571 pi->link_width_downgrade.rx_active =
572 cpu_to_be16(ppd->link_width_downgrade_rx_active);
573
574 pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
575 pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
576 pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
577
578 state = driver_lstate(ppd);
579
580 if (start_of_sm_config && (state == IB_PORT_INIT))
581 ppd->is_sm_config_started = 1;
582
583 pi->port_phys_conf = __opa_porttype(ppd) & 0xf;
584
585#if PI_LED_ENABLE_SUP
586 pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
587 pi->port_states.ledenable_offlinereason |=
588 ppd->is_sm_config_started << 5;
589 pi->port_states.ledenable_offlinereason |=
590 ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
591#else
592 pi->port_states.offline_reason = ppd->neighbor_normal << 4;
593 pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
594 pi->port_states.offline_reason |= ppd->offline_disabled_reason &
595 OPA_PI_MASK_OFFLINE_REASON;
596#endif /* PI_LED_ENABLE_SUP */
597
598 pi->port_states.portphysstate_portstate =
599 (hfi1_ibphys_portstate(ppd) << 4) | state;
600
601 pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
602
603 memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
604 for (i = 0; i < ppd->vls_supported; i++) {
605 mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
606 if ((i % 2) == 0)
607 pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4);
608 else
609 pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu;
610 }
611 /* don't forget VL 15 */
612 mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
613 pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu;
614 pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL;
615 pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
616 pi->partenforce_filterraw |=
617 (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
618 if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
619 pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
620 if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
621 pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
622 pi->mkey_violations = cpu_to_be16(ibp->mkey_violations);
623 /* P_KeyViolations are counted by hardware. */
624 pi->pkey_violations = cpu_to_be16(ibp->pkey_violations);
625 pi->qkey_violations = cpu_to_be16(ibp->qkey_violations);
626
627 pi->vl.cap = ppd->vls_supported;
628 pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit);
629 pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
630 pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
631
632 pi->clientrereg_subnettimeout = ibp->subnet_timeout;
633
634 pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
635 OPA_PORT_LINK_MODE_OPA << 5 |
636 OPA_PORT_LINK_MODE_OPA);
637
638 pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
639
640 pi->port_mode = cpu_to_be16(
641 ppd->is_active_optimize_enabled ?
642 OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
643
644 pi->port_packet_format.supported =
645 cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
646 pi->port_packet_format.enabled =
647 cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
648
649 /* flit_control.interleave is (OPA V1, version .76):
650 * bits use
651 * ---- ---
652 * 2 res
653 * 2 DistanceSupported
654 * 2 DistanceEnabled
655 * 5 MaxNestLevelTxEnabled
656 * 5 MaxNestLevelRxSupported
657 *
658 * HFI supports only "distance mode 1" (see OPA V1, version .76,
659 * section 9.6.2), so set DistanceSupported, DistanceEnabled
660 * to 0x1.
661 */
662 pi->flit_control.interleave = cpu_to_be16(0x1400);
663
664 pi->link_down_reason = ppd->local_link_down_reason.sma;
665 pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
666 pi->port_error_action = cpu_to_be32(ppd->port_error_action);
667 pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
668
669 /* 32.768 usec. response time (guessing) */
670 pi->resptimevalue = 3;
671
672 pi->local_port_num = port;
673
674 /* buffer info for FM */
675 pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
676
677 pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
678 pi->neigh_port_num = ppd->neighbor_port_number;
679 pi->port_neigh_mode =
680 (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
681 (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
682 (ppd->neighbor_fm_security ?
683 OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
684
685 /* HFIs shall always return VL15 credits to their
686 * neighbor in a timely manner, without any credit return pacing.
687 */
688 credit_rate = 0;
689 buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
690 buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
691 buffer_units |= (credit_rate << 6) &
692 OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
693 buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
694 pi->buffer_units = cpu_to_be32(buffer_units);
695
696 pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
697
698 /* HFI supports a replay buffer 128 LTPs in size */
699 pi->replay_depth.buffer = 0x80;
700 /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
701 read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
702
703 /* this counter is 16 bits wide, but the replay_depth.wire
704 * variable is only 8 bits */
705 if (tmp > 0xff)
706 tmp = 0xff;
707 pi->replay_depth.wire = tmp;
708
709 if (resp_len)
710 *resp_len += sizeof(struct opa_port_info);
711
712 return reply((struct ib_mad_hdr *)smp);
713}
714
715/**
716 * get_pkeys - return the PKEY table
717 * @dd: the hfi1_ib device
718 * @port: the IB port number
719 * @pkeys: the pkey table is placed here
720 */
721static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
722{
723 struct hfi1_pportdata *ppd = dd->pport + port - 1;
724
725 memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
726
727 return 0;
728}
729
730static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
731 struct ib_device *ibdev, u8 port,
732 u32 *resp_len)
733{
734 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
735 u32 n_blocks_req = OPA_AM_NBLK(am);
736 u32 start_block = am & 0x7ff;
737 __be16 *p;
738 u16 *q;
739 int i;
740 u16 n_blocks_avail;
741 unsigned npkeys = hfi1_get_npkeys(dd);
742 size_t size;
743
744 if (n_blocks_req == 0) {
745 pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
746 port, start_block, n_blocks_req);
747 smp->status |= IB_SMP_INVALID_FIELD;
748 return reply((struct ib_mad_hdr *)smp);
749 }
750
751 n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
752
753 size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
754
755 if (start_block + n_blocks_req > n_blocks_avail ||
756 n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
757 pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
758 "avail 0x%x; blk/smp 0x%lx\n",
759 start_block, n_blocks_req, n_blocks_avail,
760 OPA_NUM_PKEY_BLOCKS_PER_SMP);
761 smp->status |= IB_SMP_INVALID_FIELD;
762 return reply((struct ib_mad_hdr *)smp);
763 }
764
765 p = (__be16 *) data;
766 q = (u16 *)data;
767 /* get the real pkeys if we are requesting the first block */
768 if (start_block == 0) {
769 get_pkeys(dd, port, q);
770 for (i = 0; i < npkeys; i++)
771 p[i] = cpu_to_be16(q[i]);
772 if (resp_len)
773 *resp_len += size;
774 } else
775 smp->status |= IB_SMP_INVALID_FIELD;
776
777 return reply((struct ib_mad_hdr *)smp);
778}
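/*
 * Worked example of the block bounds above, assuming
 * OPA_PARTITION_TABLE_BLK_SIZE is 32 entries and hfi1_get_npkeys()
 * returns 16: n_blocks_avail = 16/32 + 1 = 1, so the only valid
 * request is start_block == 0 with n_blocks_req == 1, and the real
 * pkeys are converted to big-endian in place for that first block only.
 */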
779
780enum {
781 HFI_TRANSITION_DISALLOWED,
782 HFI_TRANSITION_IGNORED,
783 HFI_TRANSITION_ALLOWED,
784 HFI_TRANSITION_UNDEFINED,
785};
786
787/*
788 * Use shortened names to improve readability of
789 * {logical,physical}_state_transitions
790 */
791enum {
792 __D = HFI_TRANSITION_DISALLOWED,
793 __I = HFI_TRANSITION_IGNORED,
794 __A = HFI_TRANSITION_ALLOWED,
795 __U = HFI_TRANSITION_UNDEFINED,
796};
797
798/*
799 * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
800 * represented in physical_state_transitions.
801 */
802#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
803
804/*
805 * Within physical_state_transitions, rows represent "old" states,
806 * columns "new" states, and physical_state_transitions.allowed[old][new]
807 * indicates if the transition from old state to new state is legal (see
808 * OPAg1v1, Table 6-4).
809 */
810static const struct {
811 u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
812} physical_state_transitions = {
813 {
814 /* 2 3 4 5 6 7 8 9 10 11 */
815 /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
816 /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
817 /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
818 /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
819 /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
820 /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
821 /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
822 /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
823 /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
824 /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
825 }
826};
827
828/*
829 * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented in
830 * logical_state_transitions
831 */
832
833#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
834
835/*
836 * Within logical_state_transitions rows represent "old" states,
837 * columns "new" states, and logical_state_transitions.allowed[old][new]
838 * indicates if the transition from old state to new state is legal (see
839 * OPAg1v1, Table 9-12).
840 */
841static const struct {
842 u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
843} logical_state_transitions = {
844 {
845 /* 1 2 3 4 5 */
846 /* 1 */ { __I, __D, __D, __D, __U},
847 /* 2 */ { __D, __I, __A, __D, __U},
848 /* 3 */ { __D, __D, __I, __A, __U},
849 /* 4 */ { __D, __D, __I, __I, __U},
850 /* 5 */ { __U, __U, __U, __U, __U},
851 }
852};
853
854static int logical_transition_allowed(int old, int new)
855{
856 if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
857 new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
858 pr_warn("invalid logical state(s) (old %d new %d)\n",
859 old, new);
860 return HFI_TRANSITION_UNDEFINED;
861 }
862
863 if (new == IB_PORT_NOP)
864 return HFI_TRANSITION_ALLOWED; /* always allowed */
865
866 /* adjust states for indexing into logical_state_transitions */
867 old -= IB_PORT_DOWN;
868 new -= IB_PORT_DOWN;
869
870 if (old < 0 || new < 0)
871 return HFI_TRANSITION_UNDEFINED;
872 return logical_state_transitions.allowed[old][new];
873}
874
875static int physical_transition_allowed(int old, int new)
876{
877 if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
878 new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
879 pr_warn("invalid physical state(s) (old %d new %d)\n",
880 old, new);
881 return HFI_TRANSITION_UNDEFINED;
882 }
883
884 if (new == IB_PORTPHYSSTATE_NOP)
885 return HFI_TRANSITION_ALLOWED; /* always allowed */
886
887 /* adjust states for indexing into physical_state_transitions */
888 old -= IB_PORTPHYSSTATE_POLLING;
889 new -= IB_PORTPHYSSTATE_POLLING;
890
891 if (old < 0 || new < 0)
892 return HFI_TRANSITION_UNDEFINED;
893 return physical_state_transitions.allowed[old][new];
894}
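/*
 * Worked example of the table lookup above: for old == Polling (2) and
 * new == Disabled (3), the indices become [2 - 2][3 - 2], i.e.
 * allowed[0][1], which is __A in physical_state_transitions, so the
 * transition is allowed; Disabled (3) -> Disabled (3) indexes
 * allowed[1][1] == __I and is reported as HFI_TRANSITION_IGNORED.
 */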
895
896static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
897 u32 logical_new, u32 physical_new)
898{
899 u32 physical_old = driver_physical_state(ppd);
900 u32 logical_old = driver_logical_state(ppd);
901 int ret, logical_allowed, physical_allowed;
902
903 logical_allowed = ret =
904 logical_transition_allowed(logical_old, logical_new);
905
906 if (ret == HFI_TRANSITION_DISALLOWED ||
907 ret == HFI_TRANSITION_UNDEFINED) {
908 pr_warn("invalid logical state transition %s -> %s\n",
909 opa_lstate_name(logical_old),
910 opa_lstate_name(logical_new));
911 return ret;
912 }
913
914 physical_allowed = ret =
915 physical_transition_allowed(physical_old, physical_new);
916
917 if (ret == HFI_TRANSITION_DISALLOWED ||
918 ret == HFI_TRANSITION_UNDEFINED) {
919 pr_warn("invalid physical state transition %s -> %s\n",
920 opa_pstate_name(physical_old),
921 opa_pstate_name(physical_new));
922 return ret;
923 }
924
925 if (logical_allowed == HFI_TRANSITION_IGNORED &&
926 physical_allowed == HFI_TRANSITION_IGNORED)
927 return HFI_TRANSITION_IGNORED;
928
929 /*
930 * Either physical_allowed or logical_allowed is
931 * HFI_TRANSITION_ALLOWED.
932 */
933 return HFI_TRANSITION_ALLOWED;
934}
935
936static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
937 u32 logical_state, u32 phys_state,
938 int suppress_idle_sma)
939{
940 struct hfi1_devdata *dd = ppd->dd;
941 u32 link_state;
942 int ret;
943
944 ret = port_states_transition_allowed(ppd, logical_state, phys_state);
945 if (ret == HFI_TRANSITION_DISALLOWED ||
946 ret == HFI_TRANSITION_UNDEFINED) {
947 /* error message emitted above */
948 smp->status |= IB_SMP_INVALID_FIELD;
949 return 0;
950 }
951
952 if (ret == HFI_TRANSITION_IGNORED)
953 return 0;
954
955 if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
956 !(logical_state == IB_PORT_DOWN ||
957 logical_state == IB_PORT_NOP)){
958 pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
959 logical_state, phys_state);
960 smp->status |= IB_SMP_INVALID_FIELD;
961 }
962
963 /*
964 * Logical state changes are summarized in OPAv1g1 spec.,
965 * Table 9-12; physical state changes are summarized in
966 * OPAv1g1 spec., Table 6-4.
967 */
968 switch (logical_state) {
969 case IB_PORT_NOP:
970 if (phys_state == IB_PORTPHYSSTATE_NOP)
971 break;
972 /* FALLTHROUGH */
973 case IB_PORT_DOWN:
974 if (phys_state == IB_PORTPHYSSTATE_NOP)
975 link_state = HLS_DN_DOWNDEF;
976 else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
977 link_state = HLS_DN_POLL;
978 set_link_down_reason(ppd,
979 OPA_LINKDOWN_REASON_FM_BOUNCE, 0,
980 OPA_LINKDOWN_REASON_FM_BOUNCE);
981 } else if (phys_state == IB_PORTPHYSSTATE_DISABLED)
982 link_state = HLS_DN_DISABLE;
983 else {
984 pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
985 phys_state);
986 smp->status |= IB_SMP_INVALID_FIELD;
987 break;
988 }
989
990 set_link_state(ppd, link_state);
991 if (link_state == HLS_DN_DISABLE &&
992 (ppd->offline_disabled_reason >
993 OPA_LINKDOWN_REASON_SMA_DISABLED ||
994 ppd->offline_disabled_reason ==
995 OPA_LINKDOWN_REASON_NONE))
996 ppd->offline_disabled_reason =
997 OPA_LINKDOWN_REASON_SMA_DISABLED;
998 /*
999 * Don't send a reply if the response would be sent
1000 * through the disabled port.
1001 */
1002 if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
1003 return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
1004 break;
1005 case IB_PORT_ARMED:
1006 ret = set_link_state(ppd, HLS_UP_ARMED);
1007 if ((ret == 0) && (suppress_idle_sma == 0))
1008 send_idle_sma(dd, SMA_IDLE_ARM);
1009 break;
1010 case IB_PORT_ACTIVE:
1011 if (ppd->neighbor_normal) {
1012 ret = set_link_state(ppd, HLS_UP_ACTIVE);
1013 if (ret == 0)
1014 send_idle_sma(dd, SMA_IDLE_ACTIVE);
1015 } else {
1016 pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
1017 smp->status |= IB_SMP_INVALID_FIELD;
1018 }
1019 break;
1020 default:
1021 pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
1022 logical_state);
1023 smp->status |= IB_SMP_INVALID_FIELD;
1024 }
1025
1026 return 0;
1027}
1028
1029/**
1030 * __subn_set_opa_portinfo - set port information
1031 * @smp: the incoming SM packet
1032 * @ibdev: the infiniband device
1033 * @port: the port on the device
1034 *
1035 */
1036static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
1037 struct ib_device *ibdev, u8 port,
1038 u32 *resp_len)
1039{
1040 struct opa_port_info *pi = (struct opa_port_info *)data;
1041 struct ib_event event;
1042 struct hfi1_devdata *dd;
1043 struct hfi1_pportdata *ppd;
1044 struct hfi1_ibport *ibp;
1045 u8 clientrereg;
1046 unsigned long flags;
1047 u32 smlid, opa_lid; /* tmp vars to hold LID values */
1048 u16 lid;
1049 u8 ls_old, ls_new, ps_new;
1050 u8 vls;
1051 u8 msl;
1052 u8 crc_enabled;
1053 u16 lse, lwe, mtu;
1054 u32 num_ports = OPA_AM_NPORT(am);
1055 u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
1056 int ret, i, invalid = 0, call_set_mtu = 0;
1057 int call_link_downgrade_policy = 0;
1058
1059 if (num_ports != 1) {
1060 smp->status |= IB_SMP_INVALID_FIELD;
1061 return reply((struct ib_mad_hdr *)smp);
1062 }
1063
1064 opa_lid = be32_to_cpu(pi->lid);
1065 if (opa_lid & 0xFFFF0000) {
1066 pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
1067 smp->status |= IB_SMP_INVALID_FIELD;
1068 goto get_only;
1069 }
1070
1071 lid = (u16)(opa_lid & 0x0000FFFF);
1072
1073 smlid = be32_to_cpu(pi->sm_lid);
1074 if (smlid & 0xFFFF0000) {
1075 pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
1076 smp->status |= IB_SMP_INVALID_FIELD;
1077 goto get_only;
1078 }
1079 smlid &= 0x0000FFFF;
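	/*
	 * Example of the range checks above: an OPA LID of 0x00012345 has
	 * bits set above bit 15 and is rejected, while 0x00003001 passes
	 * and is kept as the 16-bit LID 0x3001; pi->sm_lid is screened the
	 * same way before it is used as smlid.
	 */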
1080
1081 clientrereg = (pi->clientrereg_subnettimeout &
1082 OPA_PI_MASK_CLIENT_REREGISTER);
1083
1084 dd = dd_from_ibdev(ibdev);
1085 /* IB numbers ports from 1, hw from 0 */
1086 ppd = dd->pport + (port - 1);
1087 ibp = &ppd->ibport_data;
1088 event.device = ibdev;
1089 event.element.port_num = port;
1090
1091 ls_old = driver_lstate(ppd);
1092
1093 ibp->mkey = pi->mkey;
1094 ibp->gid_prefix = pi->subnet_prefix;
1095 ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
1096
1097 /* Must be a valid unicast LID address. */
1098 if ((lid == 0 && ls_old > IB_PORT_INIT) ||
1099 lid >= HFI1_MULTICAST_LID_BASE) {
1100 smp->status |= IB_SMP_INVALID_FIELD;
1101 pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
1102 lid);
1103 } else if (ppd->lid != lid ||
1104 ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
1105 if (ppd->lid != lid)
1106 hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
1107 if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
1108 hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
1109 hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
1110 event.event = IB_EVENT_LID_CHANGE;
1111 ib_dispatch_event(&event);
1112 }
1113
1114 msl = pi->smsl & OPA_PI_MASK_SMSL;
1115 if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
1116 ppd->linkinit_reason =
1117 (pi->partenforce_filterraw &
1118 OPA_PI_MASK_LINKINIT_REASON);
1119 /* enable/disable SW pkey checking as per FM control */
1120 if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
1121 ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
1122 else
1123 ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
1124
1125 if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
1126 ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
1127 else
1128 ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
1129
1130 /* Must be a valid unicast LID address. */
1131 if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
1132 smlid >= HFI1_MULTICAST_LID_BASE) {
1133 smp->status |= IB_SMP_INVALID_FIELD;
1134 pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
1135 } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
1136 pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
1137 spin_lock_irqsave(&ibp->lock, flags);
1138 if (ibp->sm_ah) {
1139 if (smlid != ibp->sm_lid)
1140 ibp->sm_ah->attr.dlid = smlid;
1141 if (msl != ibp->sm_sl)
1142 ibp->sm_ah->attr.sl = msl;
1143 }
1144 spin_unlock_irqrestore(&ibp->lock, flags);
1145 if (smlid != ibp->sm_lid)
1146 ibp->sm_lid = smlid;
1147 if (msl != ibp->sm_sl)
1148 ibp->sm_sl = msl;
1149 event.event = IB_EVENT_SM_CHANGE;
1150 ib_dispatch_event(&event);
1151 }
1152
1153 if (pi->link_down_reason == 0) {
1154 ppd->local_link_down_reason.sma = 0;
1155 ppd->local_link_down_reason.latest = 0;
1156 }
1157
1158 if (pi->neigh_link_down_reason == 0) {
1159 ppd->neigh_link_down_reason.sma = 0;
1160 ppd->neigh_link_down_reason.latest = 0;
1161 }
1162
1163 ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
1164 ppd->sa_qp = be32_to_cpu(pi->sa_qp);
1165
1166 ppd->port_error_action = be32_to_cpu(pi->port_error_action);
1167 lwe = be16_to_cpu(pi->link_width.enabled);
1168 if (lwe) {
1169 if (lwe == OPA_LINK_WIDTH_RESET
1170 || lwe == OPA_LINK_WIDTH_RESET_OLD)
1171 set_link_width_enabled(ppd, ppd->link_width_supported);
1172 else if ((lwe & ~ppd->link_width_supported) == 0)
1173 set_link_width_enabled(ppd, lwe);
1174 else
1175 smp->status |= IB_SMP_INVALID_FIELD;
1176 }
1177 lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
1178 /* LWD.E is always applied - 0 means "disabled" */
1179 if (lwe == OPA_LINK_WIDTH_RESET
1180 || lwe == OPA_LINK_WIDTH_RESET_OLD) {
1181 set_link_width_downgrade_enabled(ppd,
1182 ppd->link_width_downgrade_supported);
1183 } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
1184 /* only set and apply if something changed */
1185 if (lwe != ppd->link_width_downgrade_enabled) {
1186 set_link_width_downgrade_enabled(ppd, lwe);
1187 call_link_downgrade_policy = 1;
1188 }
1189 } else
1190 smp->status |= IB_SMP_INVALID_FIELD;
1191
1192 lse = be16_to_cpu(pi->link_speed.enabled);
1193 if (lse) {
1194 if (lse & be16_to_cpu(pi->link_speed.supported))
1195 set_link_speed_enabled(ppd, lse);
1196 else
1197 smp->status |= IB_SMP_INVALID_FIELD;
1198 }
1199
1200 ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
1201 ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
1202 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
1203 ibp->vl_high_limit);
1204
1205 if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
1206 ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
1207 smp->status |= IB_SMP_INVALID_FIELD;
1208 return reply((struct ib_mad_hdr *)smp);
1209 }
1210 for (i = 0; i < ppd->vls_supported; i++) {
1211 if ((i % 2) == 0)
1212 mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4)
1213 & 0xF);
1214 else
1215 mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF);
1216 if (mtu == 0xffff) {
1217 pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
1218 mtu,
1219 (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
1220 smp->status |= IB_SMP_INVALID_FIELD;
1221 mtu = hfi1_max_mtu; /* use a valid MTU */
1222 }
1223 if (dd->vld[i].mtu != mtu) {
1224 dd_dev_info(dd,
1225 "MTU change on vl %d from %d to %d\n",
1226 i, dd->vld[i].mtu, mtu);
1227 dd->vld[i].mtu = mtu;
1228 call_set_mtu++;
1229 }
1230 }
1231 /* As per the OPAv1 spec, VL15 must be supported and configured
1232 * for operation with an MTU of 2048 or larger.
1233 */
1234 mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF);
1235 if (mtu < 2048 || mtu == 0xffff)
1236 mtu = 2048;
1237 if (dd->vld[15].mtu != mtu) {
1238 dd_dev_info(dd,
1239 "MTU change on vl 15 from %d to %d\n",
1240 dd->vld[15].mtu, mtu);
1241 dd->vld[15].mtu = mtu;
1242 call_set_mtu++;
1243 }
1244 if (call_set_mtu)
1245 set_mtu(ppd);
1246
1247 /* Set operational VLs */
1248 vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
1249 if (vls) {
1250 if (vls > ppd->vls_supported) {
1251 pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
1252 pi->operational_vls);
1253 smp->status |= IB_SMP_INVALID_FIELD;
1254 } else {
1255 if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
1256 vls) == -EINVAL)
1257 smp->status |= IB_SMP_INVALID_FIELD;
1258 }
1259 }
1260
1261 if (pi->mkey_violations == 0)
1262 ibp->mkey_violations = 0;
1263
1264 if (pi->pkey_violations == 0)
1265 ibp->pkey_violations = 0;
1266
1267 if (pi->qkey_violations == 0)
1268 ibp->qkey_violations = 0;
1269
1270 ibp->subnet_timeout =
1271 pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
1272
1273 crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
1274 crc_enabled >>= 4;
1275 crc_enabled &= 0xf;
1276
1277 if (crc_enabled != 0)
1278 ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
1279
1280 ppd->is_active_optimize_enabled =
1281 !!(be16_to_cpu(pi->port_mode)
1282 & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
1283
1284 ls_new = pi->port_states.portphysstate_portstate &
1285 OPA_PI_MASK_PORT_STATE;
1286 ps_new = (pi->port_states.portphysstate_portstate &
1287 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
1288
1289 if (ls_old == IB_PORT_INIT) {
1290 if (start_of_sm_config) {
1291 if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
1292 ppd->is_sm_config_started = 1;
1293 } else if (ls_new == IB_PORT_ARMED) {
1294 if (ppd->is_sm_config_started == 0)
1295 invalid = 1;
1296 }
1297 }
1298
1299 /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
1300 if (clientrereg) {
1301 event.event = IB_EVENT_CLIENT_REREGISTER;
1302 ib_dispatch_event(&event);
1303 }
1304
1305 /*
1306 * Do the port state change now that the other link parameters
1307 * have been set.
1308 * Changing the port physical state only makes sense if the link
1309 * is down or is being set to down.
1310 */
1311
1312 ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
1313 if (ret)
1314 return ret;
1315
1316 ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
1317
1318 /* restore re-reg bit per o14-12.2.1 */
1319 pi->clientrereg_subnettimeout |= clientrereg;
1320
1321 /*
1322 * Apply the new link downgrade policy. This may result in a link
1323 * bounce. Do this after everything else so things are settled.
1324 * Possible problem: if setting the port state above fails, then
1325 * the policy change is not applied.
1326 */
1327 if (call_link_downgrade_policy)
1328 apply_link_downgrade_policy(ppd, 0);
1329
1330 return ret;
1331
1332get_only:
1333 return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
1334}
1335
1336/**
1337 * set_pkeys - set the PKEY table for ctxt 0
1338 * @dd: the hfi1_ib device
1339 * @port: the IB port number
1340 * @pkeys: the PKEY table
1341 */
1342static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
1343{
1344 struct hfi1_pportdata *ppd;
1345 int i;
1346 int changed = 0;
1347 int update_includes_mgmt_partition = 0;
1348
1349 /*
1350 * IB ports one/two always map to contexts zero/one, which are
1351 * always kernel contexts, so no locking is needed.
1352 * If we get here with ppd set up, there is no need to check
1353 * that rcd is valid.
1354 */
1355 ppd = dd->pport + (port - 1);
1356 /*
1357 * If the update does not include the management pkey, don't do it.
1358 */
1359 for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
1360 if (pkeys[i] == LIM_MGMT_P_KEY) {
1361 update_includes_mgmt_partition = 1;
1362 break;
1363 }
1364 }
1365
1366 if (!update_includes_mgmt_partition)
1367 return 1;
1368
1369 for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
1370 u16 key = pkeys[i];
1371 u16 okey = ppd->pkeys[i];
1372
1373 if (key == okey)
1374 continue;
1375 /*
1376 * The SM gives us the complete PKey table. We have
1377 * to ensure that we put the PKeys in the matching
1378 * slots.
1379 */
1380 ppd->pkeys[i] = key;
1381 changed = 1;
1382 }
1383
1384 if (changed) {
1385 struct ib_event event;
1386
1387 (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
1388
1389 event.event = IB_EVENT_PKEY_CHANGE;
1390 event.device = &dd->verbs_dev.ibdev;
1391 event.element.port_num = port;
1392 ib_dispatch_event(&event);
1393 }
1394 return 0;
1395}
1396
1397static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
1398 struct ib_device *ibdev, u8 port,
1399 u32 *resp_len)
1400{
1401 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1402 u32 n_blocks_sent = OPA_AM_NBLK(am);
1403 u32 start_block = am & 0x7ff;
1404 u16 *p = (u16 *) data;
1405 __be16 *q = (__be16 *)data;
1406 int i;
1407 u16 n_blocks_avail;
1408 unsigned npkeys = hfi1_get_npkeys(dd);
1409
1410 if (n_blocks_sent == 0) {
1411 pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
1412 port, start_block, n_blocks_sent);
1413 smp->status |= IB_SMP_INVALID_FIELD;
1414 return reply((struct ib_mad_hdr *)smp);
1415 }
1416
1417 n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
1418
1419 if (start_block + n_blocks_sent > n_blocks_avail ||
1420 n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
1421 pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
1422 start_block, n_blocks_sent, n_blocks_avail,
1423 OPA_NUM_PKEY_BLOCKS_PER_SMP);
1424 smp->status |= IB_SMP_INVALID_FIELD;
1425 return reply((struct ib_mad_hdr *)smp);
1426 }
1427
1428 for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
1429 p[i] = be16_to_cpu(q[i]);
1430
1431 if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
1432 smp->status |= IB_SMP_INVALID_FIELD;
1433 return reply((struct ib_mad_hdr *)smp);
1434 }
1435
1436 return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
1437}
1438
1439static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
1440{
1441 u64 *val = (u64 *)data;
1442
1443 *val++ = read_csr(dd, SEND_SC2VLT0);
1444 *val++ = read_csr(dd, SEND_SC2VLT1);
1445 *val++ = read_csr(dd, SEND_SC2VLT2);
1446 *val++ = read_csr(dd, SEND_SC2VLT3);
1447 return 0;
1448}
1449
1450#define ILLEGAL_VL 12
1451/*
1452 * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
1453 * for SC15, which must map to VL15). If we don't remap things this
1454 * way it is possible for VL15 counters to increment when we try to
1455 * send on a SC which is mapped to an invalid VL.
1456 */
1457static void filter_sc2vlt(void *data)
1458{
1459 int i;
1460 u8 *pd = (u8 *)data;
1461
1462 for (i = 0; i < OPA_MAX_SCS; i++) {
1463 if (i == 15)
1464 continue;
1465 if ((pd[i] & 0x1f) == 0xf)
1466 pd[i] = ILLEGAL_VL;
1467 }
1468}
1469
1470static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
1471{
1472 u64 *val = (u64 *)data;
1473
1474 filter_sc2vlt(data);
1475
1476 write_csr(dd, SEND_SC2VLT0, *val++);
1477 write_csr(dd, SEND_SC2VLT1, *val++);
1478 write_csr(dd, SEND_SC2VLT2, *val++);
1479 write_csr(dd, SEND_SC2VLT3, *val++);
1480 write_seqlock_irq(&dd->sc2vl_lock);
1481 memcpy(dd->sc2vl, (u64 *)data, sizeof(dd->sc2vl));
1482 write_sequnlock_irq(&dd->sc2vl_lock);
1483 return 0;
1484}
1485
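/*
 * Illustrative sketch, not part of this file: the write path above updates
 * the shadow copy under dd->sc2vl_lock (a seqlock), so a hot-path reader can
 * take a consistent snapshot without ever blocking the writer.  This assumes
 * dd->sc2vl is a four-entry u64 array guarded by sc2vl_lock, as the memcpy()
 * in set_sc2vlt_tables() implies; the helper name is hypothetical.
 */
static void sc2vlt_read_snapshot(struct hfi1_devdata *dd, u64 snap[4])
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&dd->sc2vl_lock);
		memcpy(snap, dd->sc2vl, sizeof(dd->sc2vl));
	} while (read_seqretry(&dd->sc2vl_lock, seq));
}
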
1486static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
1487 struct ib_device *ibdev, u8 port,
1488 u32 *resp_len)
1489{
1490 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1491 u8 *p = (u8 *)data;
1492 size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
1493 unsigned i;
1494
1495 if (am) {
1496 smp->status |= IB_SMP_INVALID_FIELD;
1497 return reply((struct ib_mad_hdr *)smp);
1498 }
1499
1500 for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
1501 *p++ = ibp->sl_to_sc[i];
1502
1503 if (resp_len)
1504 *resp_len += size;
1505
1506 return reply((struct ib_mad_hdr *)smp);
1507}
1508
1509static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
1510 struct ib_device *ibdev, u8 port,
1511 u32 *resp_len)
1512{
1513 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1514 u8 *p = (u8 *)data;
1515 int i;
1516
1517 if (am) {
1518 smp->status |= IB_SMP_INVALID_FIELD;
1519 return reply((struct ib_mad_hdr *)smp);
1520 }
1521
1522 for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
1523 ibp->sl_to_sc[i] = *p++;
1524
1525 return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
1526}
1527
1528static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
1529 struct ib_device *ibdev, u8 port,
1530 u32 *resp_len)
1531{
1532 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1533 u8 *p = (u8 *)data;
1534 size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
1535 unsigned i;
1536
1537 if (am) {
1538 smp->status |= IB_SMP_INVALID_FIELD;
1539 return reply((struct ib_mad_hdr *)smp);
1540 }
1541
1542 for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
1543 *p++ = ibp->sc_to_sl[i];
1544
1545 if (resp_len)
1546 *resp_len += size;
1547
1548 return reply((struct ib_mad_hdr *)smp);
1549}
1550
1551static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
1552 struct ib_device *ibdev, u8 port,
1553 u32 *resp_len)
1554{
1555 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1556 u8 *p = (u8 *)data;
1557 int i;
1558
1559 if (am) {
1560 smp->status |= IB_SMP_INVALID_FIELD;
1561 return reply((struct ib_mad_hdr *)smp);
1562 }
1563
1564 for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
1565 ibp->sc_to_sl[i] = *p++;
1566
1567 return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
1568}
1569
1570static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
1571 struct ib_device *ibdev, u8 port,
1572 u32 *resp_len)
1573{
1574 u32 n_blocks = OPA_AM_NBLK(am);
1575 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1576 void *vp = (void *) data;
1577 size_t size = 4 * sizeof(u64);
1578
1579 if (n_blocks != 1) {
1580 smp->status |= IB_SMP_INVALID_FIELD;
1581 return reply((struct ib_mad_hdr *)smp);
1582 }
1583
1584 get_sc2vlt_tables(dd, vp);
1585
1586 if (resp_len)
1587 *resp_len += size;
1588
1589 return reply((struct ib_mad_hdr *)smp);
1590}
1591
1592static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
1593 struct ib_device *ibdev, u8 port,
1594 u32 *resp_len)
1595{
1596 u32 n_blocks = OPA_AM_NBLK(am);
1597 int async_update = OPA_AM_ASYNC(am);
1598 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1599 void *vp = (void *) data;
1600 struct hfi1_pportdata *ppd;
1601 int lstate;
1602
1603 if (n_blocks != 1 || async_update) {
1604 smp->status |= IB_SMP_INVALID_FIELD;
1605 return reply((struct ib_mad_hdr *)smp);
1606 }
1607
1608 /* IB numbers ports from 1, hw from 0 */
1609 ppd = dd->pport + (port - 1);
1610 lstate = driver_lstate(ppd);
1611 /* it's known that async_update is 0 by this point, but include
1612 * the explicit check for clarity */
1613 if (!async_update &&
1614 (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
1615 smp->status |= IB_SMP_INVALID_FIELD;
1616 return reply((struct ib_mad_hdr *)smp);
1617 }
1618
1619 set_sc2vlt_tables(dd, vp);
1620
1621 return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
1622}
1623
1624static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
1625 struct ib_device *ibdev, u8 port,
1626 u32 *resp_len)
1627{
1628 u32 n_blocks = OPA_AM_NPORT(am);
1629 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1630 struct hfi1_pportdata *ppd;
1631 void *vp = (void *) data;
1632 int size;
1633
1634 if (n_blocks != 1) {
1635 smp->status |= IB_SMP_INVALID_FIELD;
1636 return reply((struct ib_mad_hdr *)smp);
1637 }
1638
1639 ppd = dd->pport + (port - 1);
1640
1641 size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
1642
1643 if (resp_len)
1644 *resp_len += size;
1645
1646 return reply((struct ib_mad_hdr *)smp);
1647}
1648
1649static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
1650 struct ib_device *ibdev, u8 port,
1651 u32 *resp_len)
1652{
1653 u32 n_blocks = OPA_AM_NPORT(am);
1654 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1655 struct hfi1_pportdata *ppd;
1656 void *vp = (void *) data;
1657 int lstate;
1658
1659 if (n_blocks != 1) {
1660 smp->status |= IB_SMP_INVALID_FIELD;
1661 return reply((struct ib_mad_hdr *)smp);
1662 }
1663
1664 /* IB numbers ports from 1, hw from 0 */
1665 ppd = dd->pport + (port - 1);
1666 lstate = driver_lstate(ppd);
1667 if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
1668 smp->status |= IB_SMP_INVALID_FIELD;
1669 return reply((struct ib_mad_hdr *)smp);
1670 }
1671
1672 ppd = dd->pport + (port - 1);
1673
1674 fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
1675
1676 return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
1677 resp_len);
1678}
1679
1680static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
1681 struct ib_device *ibdev, u8 port,
1682 u32 *resp_len)
1683{
1684 u32 nports = OPA_AM_NPORT(am);
1685 u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
1686 u32 lstate;
1687 struct hfi1_ibport *ibp;
1688 struct hfi1_pportdata *ppd;
1689 struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
1690
1691 if (nports != 1) {
1692 smp->status |= IB_SMP_INVALID_FIELD;
1693 return reply((struct ib_mad_hdr *)smp);
1694 }
1695
1696 ibp = to_iport(ibdev, port);
1697 ppd = ppd_from_ibp(ibp);
1698
1699 lstate = driver_lstate(ppd);
1700
1701 if (start_of_sm_config && (lstate == IB_PORT_INIT))
1702 ppd->is_sm_config_started = 1;
1703
1704#if PI_LED_ENABLE_SUP
1705 psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
1706 psi->port_states.ledenable_offlinereason |=
1707 ppd->is_sm_config_started << 5;
1708 psi->port_states.ledenable_offlinereason |=
1709 ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
1710#else
1711 psi->port_states.offline_reason = ppd->neighbor_normal << 4;
1712 psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
1713 psi->port_states.offline_reason |= ppd->offline_disabled_reason &
1714 OPA_PI_MASK_OFFLINE_REASON;
1715#endif /* PI_LED_ENABLE_SUP */
1716
1717 psi->port_states.portphysstate_portstate =
1718 (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
1719 psi->link_width_downgrade_tx_active =
1720 ppd->link_width_downgrade_tx_active;
1721 psi->link_width_downgrade_rx_active =
1722 ppd->link_width_downgrade_rx_active;
1723 if (resp_len)
1724 *resp_len += sizeof(struct opa_port_state_info);
1725
1726 return reply((struct ib_mad_hdr *)smp);
1727}
1728
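/*
 * Illustrative sketch, not part of this file: the port_states byte built
 * above packs the physical state into the high nibble and the logical state
 * into the low nibble.  The helper names below are hypothetical, shown only
 * to make the packing explicit.
 */
static inline u8 pack_port_states(u8 physstate, u8 lstate)
{
	return ((physstate & 0xf) << 4) | (lstate & 0xf);
}

static inline u8 portstate_from_port_states(u8 portphysstate_portstate)
{
	return portphysstate_portstate & 0xf;	/* logical port state */
}

static inline u8 physstate_from_port_states(u8 portphysstate_portstate)
{
	return portphysstate_portstate >> 4;	/* physical port state */
}
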
1729static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
1730 struct ib_device *ibdev, u8 port,
1731 u32 *resp_len)
1732{
1733 u32 nports = OPA_AM_NPORT(am);
1734 u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
1735 u32 ls_old;
1736 u8 ls_new, ps_new;
1737 struct hfi1_ibport *ibp;
1738 struct hfi1_pportdata *ppd;
1739 struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
1740 int ret, invalid = 0;
1741
1742 if (nports != 1) {
1743 smp->status |= IB_SMP_INVALID_FIELD;
1744 return reply((struct ib_mad_hdr *)smp);
1745 }
1746
1747 ibp = to_iport(ibdev, port);
1748 ppd = ppd_from_ibp(ibp);
1749
1750 ls_old = driver_lstate(ppd);
1751
1752 ls_new = port_states_to_logical_state(&psi->port_states);
1753 ps_new = port_states_to_phys_state(&psi->port_states);
1754
1755 if (ls_old == IB_PORT_INIT) {
1756 if (start_of_sm_config) {
1757 if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
1758 ppd->is_sm_config_started = 1;
1759 } else if (ls_new == IB_PORT_ARMED) {
1760 if (ppd->is_sm_config_started == 0)
1761 invalid = 1;
1762 }
1763 }
1764
1765 ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
1766 if (ret)
1767 return ret;
1768
1769 if (invalid)
1770 smp->status |= IB_SMP_INVALID_FIELD;
1771
1772 return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
1773}
1774
1775static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
1776 struct ib_device *ibdev, u8 port,
1777 u32 *resp_len)
1778{
1779 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1780 u32 addr = OPA_AM_CI_ADDR(am);
1781 u32 len = OPA_AM_CI_LEN(am) + 1;
1782 int ret;
1783
1784#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */
1785#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
1786#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
1787
1788 /* check that addr is within spec, and
1789 * addr and (addr + len - 1) are on the same "page" */
1790 if (addr >= 4096 ||
1791 (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
1792 smp->status |= IB_SMP_INVALID_FIELD;
1793 return reply((struct ib_mad_hdr *)smp);
1794 }
1795
1796 ret = get_cable_info(dd, port, addr, len, data);
1797
1798 if (ret == -ENODEV) {
1799 smp->status |= IB_SMP_UNSUP_METH_ATTR;
1800 return reply((struct ib_mad_hdr *)smp);
1801 }
1802
1803 /* The address range for the CableInfo SMA query is wider than the
1804 * memory available on the QSFP cable. We want to return a valid
1805 * response, albeit zeroed out, for address ranges beyond available
1806 * memory but that are within the CableInfo query spec
1807 */
1808 if (ret < 0 && ret != -ERANGE) {
1809 smp->status |= IB_SMP_INVALID_FIELD;
1810 return reply((struct ib_mad_hdr *)smp);
1811 }
1812
1813 if (resp_len)
1814 *resp_len += len;
1815
1816 return reply((struct ib_mad_hdr *)smp);
1817}
1818
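/*
 * Illustrative sketch, not part of this file: the __CI_PAGE_* check above in
 * standalone form.  A request with addr = 120, len = 16 spans bytes 120..135
 * and therefore crosses from page 0 into page 1 of the 128-byte CableInfo
 * pages, so it is rejected; addr = 128, len = 16 stays within page 1 and is
 * accepted.
 */
static inline int ci_request_in_one_page(u32 addr, u32 len)
{
	const u32 page_size = 1 << 7;		/* 128 bytes */
	const u32 page_mask = ~(page_size - 1);

	if (addr >= 4096)			/* beyond the specified range */
		return 0;
	return (addr & page_mask) == ((addr + len - 1) & page_mask);
}
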
1819static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
1820 struct ib_device *ibdev, u8 port, u32 *resp_len)
1821{
1822 u32 num_ports = OPA_AM_NPORT(am);
1823 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1824 struct hfi1_pportdata *ppd;
1825 struct buffer_control *p = (struct buffer_control *) data;
1826 int size;
1827
1828 if (num_ports != 1) {
1829 smp->status |= IB_SMP_INVALID_FIELD;
1830 return reply((struct ib_mad_hdr *)smp);
1831 }
1832
1833 ppd = dd->pport + (port - 1);
1834 size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
1835 trace_bct_get(dd, p);
1836 if (resp_len)
1837 *resp_len += size;
1838
1839 return reply((struct ib_mad_hdr *)smp);
1840}
1841
1842static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
1843 struct ib_device *ibdev, u8 port, u32 *resp_len)
1844{
1845 u32 num_ports = OPA_AM_NPORT(am);
1846 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1847 struct hfi1_pportdata *ppd;
1848 struct buffer_control *p = (struct buffer_control *) data;
1849
1850 if (num_ports != 1) {
1851 smp->status |= IB_SMP_INVALID_FIELD;
1852 return reply((struct ib_mad_hdr *)smp);
1853 }
1854 ppd = dd->pport + (port - 1);
1855 trace_bct_set(dd, p);
1856 if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
1857 smp->status |= IB_SMP_INVALID_FIELD;
1858 return reply((struct ib_mad_hdr *)smp);
1859 }
1860
1861 return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
1862}
1863
1864static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
1865 struct ib_device *ibdev, u8 port,
1866 u32 *resp_len)
1867{
1868 struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
1869 u32 num_ports = OPA_AM_NPORT(am);
1870 u8 section = (am & 0x00ff0000) >> 16;
1871 u8 *p = data;
1872 int size = 0;
1873
1874 if (num_ports != 1) {
1875 smp->status |= IB_SMP_INVALID_FIELD;
1876 return reply((struct ib_mad_hdr *)smp);
1877 }
1878
1879 switch (section) {
1880 case OPA_VLARB_LOW_ELEMENTS:
1881 size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
1882 break;
1883 case OPA_VLARB_HIGH_ELEMENTS:
1884 size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
1885 break;
1886 case OPA_VLARB_PREEMPT_ELEMENTS:
1887 size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
1888 break;
1889 case OPA_VLARB_PREEMPT_MATRIX:
1890 size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
1891 break;
1892 default:
1893 pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
1894 be32_to_cpu(smp->attr_mod));
1895 smp->status |= IB_SMP_INVALID_FIELD;
1896 break;
1897 }
1898
1899 if (size > 0 && resp_len)
1900 *resp_len += size;
1901
1902 return reply((struct ib_mad_hdr *)smp);
1903}
1904
1905static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
1906 struct ib_device *ibdev, u8 port,
1907 u32 *resp_len)
1908{
1909 struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
1910 u32 num_ports = OPA_AM_NPORT(am);
1911 u8 section = (am & 0x00ff0000) >> 16;
1912 u8 *p = data;
1913
1914 if (num_ports != 1) {
1915 smp->status |= IB_SMP_INVALID_FIELD;
1916 return reply((struct ib_mad_hdr *)smp);
1917 }
1918
1919 switch (section) {
1920 case OPA_VLARB_LOW_ELEMENTS:
1921 (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
1922 break;
1923 case OPA_VLARB_HIGH_ELEMENTS:
1924 (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
1925 break;
1926 /* neither OPA_VLARB_PREEMPT_ELEMENTS nor OPA_VLARB_PREEMPT_MATRIX
1927 * can be changed from the default values */
1928 case OPA_VLARB_PREEMPT_ELEMENTS:
1929 /* FALLTHROUGH */
1930 case OPA_VLARB_PREEMPT_MATRIX:
1931 smp->status |= IB_SMP_UNSUP_METH_ATTR;
1932 break;
1933 default:
1934 pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
1935 be32_to_cpu(smp->attr_mod));
1936 smp->status |= IB_SMP_INVALID_FIELD;
1937 break;
1938 }
1939
1940 return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
1941}
1942
1943struct opa_pma_mad {
1944 struct ib_mad_hdr mad_hdr;
1945 u8 data[2024];
1946} __packed;
1947
1948struct opa_class_port_info {
1949 u8 base_version;
1950 u8 class_version;
1951 __be16 cap_mask;
1952 __be32 cap_mask2_resp_time;
1953
1954 u8 redirect_gid[16];
1955 __be32 redirect_tc_fl;
1956 __be32 redirect_lid;
1957 __be32 redirect_sl_qp;
1958 __be32 redirect_qkey;
1959
1960 u8 trap_gid[16];
1961 __be32 trap_tc_fl;
1962 __be32 trap_lid;
1963 __be32 trap_hl_qp;
1964 __be32 trap_qkey;
1965
1966 __be16 trap_pkey;
1967 __be16 redirect_pkey;
1968
1969 u8 trap_sl_rsvd;
1970 u8 reserved[3];
1971} __packed;
1972
1973struct opa_port_status_req {
1974 __u8 port_num;
1975 __u8 reserved[3];
1976 __be32 vl_select_mask;
1977};
1978
1979#define VL_MASK_ALL 0x000080ff
1980
1981struct opa_port_status_rsp {
1982 __u8 port_num;
1983 __u8 reserved[3];
1984 __be32 vl_select_mask;
1985
1986 /* Data counters */
1987 __be64 port_xmit_data;
1988 __be64 port_rcv_data;
1989 __be64 port_xmit_pkts;
1990 __be64 port_rcv_pkts;
1991 __be64 port_multicast_xmit_pkts;
1992 __be64 port_multicast_rcv_pkts;
1993 __be64 port_xmit_wait;
1994 __be64 sw_port_congestion;
1995 __be64 port_rcv_fecn;
1996 __be64 port_rcv_becn;
1997 __be64 port_xmit_time_cong;
1998 __be64 port_xmit_wasted_bw;
1999 __be64 port_xmit_wait_data;
2000 __be64 port_rcv_bubble;
2001 __be64 port_mark_fecn;
2002 /* Error counters */
2003 __be64 port_rcv_constraint_errors;
2004 __be64 port_rcv_switch_relay_errors;
2005 __be64 port_xmit_discards;
2006 __be64 port_xmit_constraint_errors;
2007 __be64 port_rcv_remote_physical_errors;
2008 __be64 local_link_integrity_errors;
2009 __be64 port_rcv_errors;
2010 __be64 excessive_buffer_overruns;
2011 __be64 fm_config_errors;
2012 __be32 link_error_recovery;
2013 __be32 link_downed;
2014 u8 uncorrectable_errors;
2015
2016 u8 link_quality_indicator; /* 5res, 3bit */
2017 u8 res2[6];
2018 struct _vls_pctrs {
2019 /* per-VL Data counters */
2020 __be64 port_vl_xmit_data;
2021 __be64 port_vl_rcv_data;
2022 __be64 port_vl_xmit_pkts;
2023 __be64 port_vl_rcv_pkts;
2024 __be64 port_vl_xmit_wait;
2025 __be64 sw_port_vl_congestion;
2026 __be64 port_vl_rcv_fecn;
2027 __be64 port_vl_rcv_becn;
2028 __be64 port_xmit_time_cong;
2029 __be64 port_vl_xmit_wasted_bw;
2030 __be64 port_vl_xmit_wait_data;
2031 __be64 port_vl_rcv_bubble;
2032 __be64 port_vl_mark_fecn;
2033 __be64 port_vl_xmit_discards;
2034 } vls[0]; /* real array size defined by # bits set in vl_select_mask */
2035};
2036
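/*
 * Illustrative sketch, not part of this file: vls[] above is a
 * variable-length tail whose real size is the number of bits set in
 * vl_select_mask, so a full response occupies the fixed header plus one
 * struct _vls_pctrs per selected VL.  This mirrors the response_data_size
 * check in pma_get_opa_portstatus() below; the helper name is hypothetical.
 */
static inline size_t port_status_rsp_size(u32 vl_select_mask)
{
	return sizeof(struct opa_port_status_rsp) +
	       hweight32(vl_select_mask) * sizeof(struct _vls_pctrs);
}
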
2037enum counter_selects {
2038 CS_PORT_XMIT_DATA = (1 << 31),
2039 CS_PORT_RCV_DATA = (1 << 30),
2040 CS_PORT_XMIT_PKTS = (1 << 29),
2041 CS_PORT_RCV_PKTS = (1 << 28),
2042 CS_PORT_MCAST_XMIT_PKTS = (1 << 27),
2043 CS_PORT_MCAST_RCV_PKTS = (1 << 26),
2044 CS_PORT_XMIT_WAIT = (1 << 25),
2045 CS_SW_PORT_CONGESTION = (1 << 24),
2046 CS_PORT_RCV_FECN = (1 << 23),
2047 CS_PORT_RCV_BECN = (1 << 22),
2048 CS_PORT_XMIT_TIME_CONG = (1 << 21),
2049 CS_PORT_XMIT_WASTED_BW = (1 << 20),
2050 CS_PORT_XMIT_WAIT_DATA = (1 << 19),
2051 CS_PORT_RCV_BUBBLE = (1 << 18),
2052 CS_PORT_MARK_FECN = (1 << 17),
2053 CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16),
2054 CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15),
2055 CS_PORT_XMIT_DISCARDS = (1 << 14),
2056 CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13),
2057 CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12),
2058 CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11),
2059 CS_PORT_RCV_ERRORS = (1 << 10),
2060 CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9),
2061 CS_FM_CONFIG_ERRORS = (1 << 8),
2062 CS_LINK_ERROR_RECOVERY = (1 << 7),
2063 CS_LINK_DOWNED = (1 << 6),
2064 CS_UNCORRECTABLE_ERRORS = (1 << 5),
2065};
2066
2067struct opa_clear_port_status {
2068 __be64 port_select_mask[4];
2069 __be32 counter_select_mask;
2070};
2071
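/*
 * Illustrative sketch, not part of this file: how a requester might fill
 * struct opa_clear_port_status for a single port.  pma_set_opa_portstatus()
 * below insists that port_select_mask[3] has exactly the bit for the target
 * port set, and clears only the counters named in counter_select_mask.  The
 * helper name and the particular CS_* selection are illustrative only.
 */
static inline void build_clear_port_status(struct opa_clear_port_status *req,
					   u8 port)
{
	memset(req, 0, sizeof(*req));
	req->port_select_mask[3] = cpu_to_be64(1ULL << port);
	req->counter_select_mask = cpu_to_be32(CS_PORT_XMIT_DATA |
					       CS_PORT_RCV_DATA |
					       CS_PORT_XMIT_PKTS |
					       CS_PORT_RCV_PKTS);
}
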
2072struct opa_aggregate {
2073 __be16 attr_id;
2074 __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */
2075 __be32 attr_mod;
2076 u8 data[0];
2077};
2078
2079/* Request contains first two fields, response contains those plus the rest */
2080struct opa_port_data_counters_msg {
2081 __be64 port_select_mask[4];
2082 __be32 vl_select_mask;
2083
2084 /* Response fields follow */
2085 __be32 reserved1;
2086 struct _port_dctrs {
2087 u8 port_number;
2088 u8 reserved2[3];
2089 __be32 link_quality_indicator; /* 29res, 3bit */
2090
2091 /* Data counters */
2092 __be64 port_xmit_data;
2093 __be64 port_rcv_data;
2094 __be64 port_xmit_pkts;
2095 __be64 port_rcv_pkts;
2096 __be64 port_multicast_xmit_pkts;
2097 __be64 port_multicast_rcv_pkts;
2098 __be64 port_xmit_wait;
2099 __be64 sw_port_congestion;
2100 __be64 port_rcv_fecn;
2101 __be64 port_rcv_becn;
2102 __be64 port_xmit_time_cong;
2103 __be64 port_xmit_wasted_bw;
2104 __be64 port_xmit_wait_data;
2105 __be64 port_rcv_bubble;
2106 __be64 port_mark_fecn;
2107
2108 __be64 port_error_counter_summary;
2109 /* Sum of error counts/port */
2110
2111 struct _vls_dctrs {
2112 /* per-VL Data counters */
2113 __be64 port_vl_xmit_data;
2114 __be64 port_vl_rcv_data;
2115 __be64 port_vl_xmit_pkts;
2116 __be64 port_vl_rcv_pkts;
2117 __be64 port_vl_xmit_wait;
2118 __be64 sw_port_vl_congestion;
2119 __be64 port_vl_rcv_fecn;
2120 __be64 port_vl_rcv_becn;
2121 __be64 port_xmit_time_cong;
2122 __be64 port_vl_xmit_wasted_bw;
2123 __be64 port_vl_xmit_wait_data;
2124 __be64 port_vl_rcv_bubble;
2125 __be64 port_vl_mark_fecn;
2126 } vls[0];
2127 /* array size defined by #bits set in vl_select_mask*/
2128 } port[1]; /* array size defined by #ports in attribute modifier */
2129};
2130
2131struct opa_port_error_counters64_msg {
2132 /* Request contains first two fields, response contains the
2133 * whole message */
2134 __be64 port_select_mask[4];
2135 __be32 vl_select_mask;
2136
2137 /* Response-only fields follow */
2138 __be32 reserved1;
2139 struct _port_ectrs {
2140 u8 port_number;
2141 u8 reserved2[7];
2142 __be64 port_rcv_constraint_errors;
2143 __be64 port_rcv_switch_relay_errors;
2144 __be64 port_xmit_discards;
2145 __be64 port_xmit_constraint_errors;
2146 __be64 port_rcv_remote_physical_errors;
2147 __be64 local_link_integrity_errors;
2148 __be64 port_rcv_errors;
2149 __be64 excessive_buffer_overruns;
2150 __be64 fm_config_errors;
2151 __be32 link_error_recovery;
2152 __be32 link_downed;
2153 u8 uncorrectable_errors;
2154 u8 reserved3[7];
2155 struct _vls_ectrs {
2156 __be64 port_vl_xmit_discards;
2157 } vls[0];
2158 /* array size defined by #bits set in vl_select_mask */
2159 } port[1]; /* array size defined by #ports in attribute modifier */
2160};
2161
2162struct opa_port_error_info_msg {
2163 __be64 port_select_mask[4];
2164 __be32 error_info_select_mask;
2165 __be32 reserved1;
2166 struct _port_ei {
2167
2168 u8 port_number;
2169 u8 reserved2[7];
2170
2171 /* PortRcvErrorInfo */
2172 struct {
2173 u8 status_and_code;
2174 union {
2175 u8 raw[17];
2176 struct {
2177 /* EI1to12 format */
2178 u8 packet_flit1[8];
2179 u8 packet_flit2[8];
2180 u8 remaining_flit_bits12;
2181 } ei1to12;
2182 struct {
2183 u8 packet_bytes[8];
2184 u8 remaining_flit_bits;
2185 } ei13;
2186 } ei;
2187 u8 reserved3[6];
2188 } __packed port_rcv_ei;
2189
2190 /* ExcessiveBufferOverrunInfo */
2191 struct {
2192 u8 status_and_sc;
2193 u8 reserved4[7];
2194 } __packed excessive_buffer_overrun_ei;
2195
2196 /* PortXmitConstraintErrorInfo */
2197 struct {
2198 u8 status;
2199 u8 reserved5;
2200 __be16 pkey;
2201 __be32 slid;
2202 } __packed port_xmit_constraint_ei;
2203
2204 /* PortRcvConstraintErrorInfo */
2205 struct {
2206 u8 status;
2207 u8 reserved6;
2208 __be16 pkey;
2209 __be32 slid;
2210 } __packed port_rcv_constraint_ei;
2211
2212 /* PortRcvSwitchRelayErrorInfo */
2213 struct {
2214 u8 status_and_code;
2215 u8 reserved7[3];
2216 __u32 error_info;
2217 } __packed port_rcv_switch_relay_ei;
2218
2219 /* UncorrectableErrorInfo */
2220 struct {
2221 u8 status_and_code;
2222 u8 reserved8;
2223 } __packed uncorrectable_ei;
2224
2225 /* FMConfigErrorInfo */
2226 struct {
2227 u8 status_and_code;
2228 u8 error_info;
2229 } __packed fm_config_ei;
2230 __u32 reserved9;
2231 } port[1]; /* actual array size defined by #ports in attr modifier */
2232};
2233
2234/* opa_port_error_info_msg error_info_select_mask bit definitions */
2235enum error_info_selects {
2236 ES_PORT_RCV_ERROR_INFO = (1 << 31),
2237 ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30),
2238 ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29),
2239 ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28),
2240 ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27),
2241 ES_UNCORRECTABLE_ERROR_INFO = (1 << 26),
2242 ES_FM_CONFIG_ERROR_INFO = (1 << 25)
2243};
2244
2245static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
2246 struct ib_device *ibdev, u32 *resp_len)
2247{
2248 struct opa_class_port_info *p =
2249 (struct opa_class_port_info *)pmp->data;
2250
2251 memset(pmp->data, 0, sizeof(pmp->data));
2252
2253 if (pmp->mad_hdr.attr_mod != 0)
2254 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2255
2256 p->base_version = OPA_MGMT_BASE_VERSION;
2257 p->class_version = OPA_SMI_CLASS_VERSION;
2258 /*
2259 * Expected response time is 4.096 usec * 2^18 == 1.073741824 sec.
2260 */
2261 p->cap_mask2_resp_time = cpu_to_be32(18);
2262
2263 if (resp_len)
2264 *resp_len += sizeof(*p);
2265
2266 return reply((struct ib_mad_hdr *)pmp);
2267}
2268
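/*
 * Illustrative sketch, not part of this file: the encoded value above.
 * ClassPortInfo carries a response-time exponent N (assumed here to sit in
 * the low 5 bits of cap_mask2_resp_time), and the advertised response time
 * is 4.096 usec * 2^N, so N == 18 works out to roughly 1.07 seconds as the
 * comment above notes.  The helper name is hypothetical.
 */
static inline u64 class_resp_time_ns(u32 cap_mask2_resp_time)
{
	/* 4.096 usec == 4096 ns */
	return 4096ULL << (cap_mask2_resp_time & 0x1f);
}
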
2269static void a0_portstatus(struct hfi1_pportdata *ppd,
2270 struct opa_port_status_rsp *rsp, u32 vl_select_mask)
2271{
2272 if (!is_bx(ppd->dd)) {
2273 unsigned long vl;
2274 int vfi = 0;
2275 u64 max_vl_xmit_wait = 0, tmp;
2276 u32 vl_all_mask = VL_MASK_ALL;
2277 u64 rcv_data, rcv_bubble;
2278
2279 rcv_data = be64_to_cpu(rsp->port_rcv_data);
2280 rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
2281 /* In the measured time period, calculate the total number
2282 * of flits that were received. Subtract out one false
2283 * rcv_bubble increment for every 32 received flits but
2284 * don't let the number go negative.
2285 */
2286 if (rcv_bubble >= (rcv_data>>5)) {
2287 rcv_bubble -= (rcv_data>>5);
2288 rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
2289 }
2290 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2291 8 * sizeof(vl_select_mask)) {
2292 rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
2293 rcv_bubble =
2294 be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
2295 if (rcv_bubble >= (rcv_data>>5)) {
2296 rcv_bubble -= (rcv_data>>5);
2297 rsp->vls[vfi].port_vl_rcv_bubble =
2298 cpu_to_be64(rcv_bubble);
2299 }
2300 vfi++;
2301 }
2302
2303 for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
2304 8 * sizeof(vl_all_mask)) {
2305 tmp = read_port_cntr(ppd, C_TX_WAIT_VL,
2306 idx_from_vl(vl));
2307 if (tmp > max_vl_xmit_wait)
2308 max_vl_xmit_wait = tmp;
2309 }
2310 rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait);
2311 }
2312}
2313
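/*
 * Illustrative sketch, not part of this file: the bubble adjustment above in
 * plain numbers.  Every 32 received flits cause one false rcv_bubble
 * increment, so with 3200 flits received 3200 >> 5 == 100 counts are
 * subtracted; if the bubble counter is already smaller than that, it is left
 * untouched rather than driven negative.
 */
static inline u64 adjust_rcv_bubble(u64 rcv_bubble, u64 rcv_data)
{
	u64 false_increments = rcv_data >> 5;	/* one per 32 flits */

	return (rcv_bubble >= false_increments) ?
		rcv_bubble - false_increments : rcv_bubble;
}
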
2314
2315static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
2316 struct ib_device *ibdev, u8 port, u32 *resp_len)
2317{
2318 struct opa_port_status_req *req =
2319 (struct opa_port_status_req *)pmp->data;
2320 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2321 struct opa_port_status_rsp *rsp;
2322 u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
2323 unsigned long vl;
2324 size_t response_data_size;
2325 u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
2326 u8 port_num = req->port_num;
2327 u8 num_vls = hweight32(vl_select_mask);
2328 struct _vls_pctrs *vlinfo;
2329 struct hfi1_ibport *ibp = to_iport(ibdev, port);
2330 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
2331 int vfi;
2332 u64 tmp, tmp2;
2333
2334 response_data_size = sizeof(struct opa_port_status_rsp) +
2335 num_vls * sizeof(struct _vls_pctrs);
2336 if (response_data_size > sizeof(pmp->data)) {
2337 pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
2338 return reply((struct ib_mad_hdr *)pmp);
2339 }
2340
2341 if (nports != 1 || (port_num && port_num != port)
2342 || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
2343 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2344 return reply((struct ib_mad_hdr *)pmp);
2345 }
2346
2347 memset(pmp->data, 0, sizeof(pmp->data));
2348
2349 rsp = (struct opa_port_status_rsp *)pmp->data;
2350 if (port_num)
2351 rsp->port_num = port_num;
2352 else
2353 rsp->port_num = port;
2354
2355 rsp->port_rcv_constraint_errors =
2356 cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
2357 CNTR_INVALID_VL));
2358
2359 hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
2360
2361 rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
2362 rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
2363 CNTR_INVALID_VL));
2364 rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
2365 CNTR_INVALID_VL));
2366 rsp->port_rcv_bubble =
2367 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
2368 rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
2369 CNTR_INVALID_VL));
2370 rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
2371 CNTR_INVALID_VL));
2372 rsp->port_multicast_xmit_pkts =
2373 cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
2374 CNTR_INVALID_VL));
2375 rsp->port_multicast_rcv_pkts =
2376 cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
2377 CNTR_INVALID_VL));
2378 rsp->port_xmit_wait =
2379 cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
2380 rsp->port_rcv_fecn =
2381 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
2382 rsp->port_rcv_becn =
2383 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
2384 rsp->port_xmit_discards =
2385 cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
2386 CNTR_INVALID_VL));
2387 rsp->port_xmit_constraint_errors =
2388 cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
2389 CNTR_INVALID_VL));
2390 rsp->port_rcv_remote_physical_errors =
2391 cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
2392 CNTR_INVALID_VL));
2393 tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
2394 tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
2395 if (tmp2 < tmp) {
2396 /* overflow/wrapped */
2397 rsp->local_link_integrity_errors = cpu_to_be64(~0);
2398 } else {
2399 rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
2400 }
2401 tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
2402 tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
2403 CNTR_INVALID_VL);
2404 if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
2405 /* overflow/wrapped */
2406 rsp->link_error_recovery = cpu_to_be32(~0);
2407 } else {
2408 rsp->link_error_recovery = cpu_to_be32(tmp2);
2409 }
2410 rsp->port_rcv_errors =
2411 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
2412 rsp->excessive_buffer_overruns =
2413 cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
2414 rsp->fm_config_errors =
2415 cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
2416 CNTR_INVALID_VL));
2417 rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
2418 CNTR_INVALID_VL));
2419
2420 /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
2421 tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
2422 rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
2423
2424 vlinfo = &(rsp->vls[0]);
2425 vfi = 0;
2426 /* The vl_select_mask has been checked above, and we know
2427 * that it contains only entries which represent valid VLs.
2428 * So in the for_each_set_bit() loop below, we don't need
2429 * any additional checks for vl.
2430 */
2431 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2432 8 * sizeof(vl_select_mask)) {
2433 memset(vlinfo, 0, sizeof(*vlinfo));
2434
2435 tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
2436 rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
2437 rsp->vls[vfi].port_vl_rcv_bubble =
2438 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
2439 idx_from_vl(vl)));
2440
2441 rsp->vls[vfi].port_vl_rcv_pkts =
2442 cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
2443 idx_from_vl(vl)));
2444
2445 rsp->vls[vfi].port_vl_xmit_data =
2446 cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
2447 idx_from_vl(vl)));
2448
2449 rsp->vls[vfi].port_vl_xmit_pkts =
2450 cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
2451 idx_from_vl(vl)));
2452
2453 rsp->vls[vfi].port_vl_xmit_wait =
2454 cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
2455 idx_from_vl(vl)));
2456
2457 rsp->vls[vfi].port_vl_rcv_fecn =
2458 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
2459 idx_from_vl(vl)));
2460
2461 rsp->vls[vfi].port_vl_rcv_becn =
2462 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
2463 idx_from_vl(vl)));
2464
2465 vlinfo++;
2466 vfi++;
2467 }
2468
2469 a0_portstatus(ppd, rsp, vl_select_mask);
2470
2471 if (resp_len)
2472 *resp_len += response_data_size;
2473
2474 return reply((struct ib_mad_hdr *)pmp);
2475}
2476
2477static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port)
2478{
2479 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2480 struct hfi1_ibport *ibp = to_iport(ibdev, port);
2481 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
2482 u64 error_counter_summary = 0, tmp;
2483
2484 error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
2485 CNTR_INVALID_VL);
2486 /* port_rcv_switch_relay_errors is 0 for HFIs */
2487 error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
2488 CNTR_INVALID_VL);
2489 error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
2490 CNTR_INVALID_VL);
2491 error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
2492 CNTR_INVALID_VL);
2493 error_counter_summary += read_dev_cntr(dd, C_DC_TX_REPLAY,
2494 CNTR_INVALID_VL);
2495 error_counter_summary += read_dev_cntr(dd, C_DC_RX_REPLAY,
2496 CNTR_INVALID_VL);
2497 error_counter_summary += read_dev_cntr(dd, C_DC_SEQ_CRC_CNT,
2498 CNTR_INVALID_VL);
2499 error_counter_summary += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
2500 CNTR_INVALID_VL);
2501 error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
2502 CNTR_INVALID_VL);
2503 error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
2504 error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
2505 CNTR_INVALID_VL);
2506 /* ppd->link_downed is a 32-bit value */
2507 error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
2508 CNTR_INVALID_VL);
2509 tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
2510 /* this is an 8-bit quantity */
2511 error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
2512
2513 return error_counter_summary;
2514}
2515
2516static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp,
2517 u32 vl_select_mask)
2518{
2519 if (!is_bx(dd)) {
2520 unsigned long vl;
2521 int vfi = 0;
2522 u64 rcv_data, rcv_bubble, sum_vl_xmit_wait = 0;
2523
2524 rcv_data = be64_to_cpu(rsp->port_rcv_data);
2525 rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
2526 /* In the measured time period, calculate the total number
2527 * of flits that were received. Subtract out one false
2528 * rcv_bubble increment for every 32 received flits but
2529 * don't let the number go negative.
2530 */
2531 if (rcv_bubble >= (rcv_data>>5)) {
2532 rcv_bubble -= (rcv_data>>5);
2533 rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
2534 }
2535 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2536 8 * sizeof(vl_select_mask)) {
2537 rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
2538 rcv_bubble =
2539 be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
2540 if (rcv_bubble >= (rcv_data>>5)) {
2541 rcv_bubble -= (rcv_data>>5);
2542 rsp->vls[vfi].port_vl_rcv_bubble =
2543 cpu_to_be64(rcv_bubble);
2544 }
2545 vfi++;
2546 }
2547 vfi = 0;
2548 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2549 8 * sizeof(vl_select_mask)) {
2550 u64 tmp = sum_vl_xmit_wait +
2551 be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait);
2552 if (tmp < sum_vl_xmit_wait) {
2553 /* we wrapped */
2554 sum_vl_xmit_wait = (u64) ~0;
2555 break;
2556 }
2557 sum_vl_xmit_wait = tmp;
2558 }
2559 if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
2560 rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
2561 }
2562}
2563
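/*
 * Illustrative sketch, not part of this file: the per-VL xmit_wait summation
 * above saturates at ~0 instead of wrapping.  The same wrap test (the new
 * sum comes out smaller than the old sum) as a standalone helper.
 */
static inline u64 sat_add64(u64 sum, u64 addend)
{
	u64 tmp = sum + addend;

	return (tmp < sum) ? (u64)~0 : tmp;
}
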
2564static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
2565 struct ib_device *ibdev, u8 port, u32 *resp_len)
2566{
2567 struct opa_port_data_counters_msg *req =
2568 (struct opa_port_data_counters_msg *)pmp->data;
2569 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2570 struct hfi1_ibport *ibp = to_iport(ibdev, port);
2571 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
2572 struct _port_dctrs *rsp;
2573 struct _vls_dctrs *vlinfo;
2574 size_t response_data_size;
2575 u32 num_ports;
2576 u8 num_pslm;
2577 u8 lq, num_vls;
2578 u64 port_mask;
2579 unsigned long port_num;
2580 unsigned long vl;
2581 u32 vl_select_mask;
2582 int vfi;
2583
2584 num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
2585 num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
2586 num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
2587 vl_select_mask = be32_to_cpu(req->vl_select_mask);
2588
2589 if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
2590 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2591 return reply((struct ib_mad_hdr *)pmp);
2592 }
2593
2594 /* Sanity check */
2595 response_data_size = sizeof(struct opa_port_data_counters_msg) +
2596 num_vls * sizeof(struct _vls_dctrs);
2597
2598 if (response_data_size > sizeof(pmp->data)) {
2599 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2600 return reply((struct ib_mad_hdr *)pmp);
2601 }
2602
2603 /*
2604 * The bit set in the mask needs to be consistent with the
2605 * port the request came in on.
2606 */
2607 port_mask = be64_to_cpu(req->port_select_mask[3]);
2608 port_num = find_first_bit((unsigned long *)&port_mask,
2609 sizeof(port_mask) * 8);
2610
2611 if ((u8)port_num != port) {
2612 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2613 return reply((struct ib_mad_hdr *)pmp);
2614 }
2615
2616 rsp = (struct _port_dctrs *)&(req->port[0]);
2617 memset(rsp, 0, sizeof(*rsp));
2618
2619 rsp->port_number = port;
2620 /*
2621 * Note that link_quality_indicator is a 32 bit quantity in
2622 * 'datacounters' queries (as opposed to 'portinfo' queries,
2623 * where it's a byte).
2624 */
2625 hfi1_read_link_quality(dd, &lq);
2626 rsp->link_quality_indicator = cpu_to_be32((u32)lq);
2627
2628 /* rsp->sw_port_congestion is 0 for HFIs */
2629 /* rsp->port_xmit_time_cong is 0 for HFIs */
2630 /* rsp->port_xmit_wasted_bw ??? */
2631 /* rsp->port_xmit_wait_data ??? */
2632 /* rsp->port_mark_fecn is 0 for HFIs */
2633
2634 rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
2635 CNTR_INVALID_VL));
2636 rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
2637 CNTR_INVALID_VL));
2638 rsp->port_rcv_bubble =
2639 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
2640 rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
2641 CNTR_INVALID_VL));
2642 rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
2643 CNTR_INVALID_VL));
2644 rsp->port_multicast_xmit_pkts =
2645 cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
2646 CNTR_INVALID_VL));
2647 rsp->port_multicast_rcv_pkts =
2648 cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
2649 CNTR_INVALID_VL));
2650 rsp->port_xmit_wait =
2651 cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
2652 rsp->port_rcv_fecn =
2653 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
2654 rsp->port_rcv_becn =
2655 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
2656
2657 rsp->port_error_counter_summary =
2658 cpu_to_be64(get_error_counter_summary(ibdev, port));
2659
2660 vlinfo = &(rsp->vls[0]);
2661 vfi = 0;
2662 /* The vl_select_mask has been checked above, and we know
2663 * that it contains only entries which represent valid VLs.
2664 * So in the for_each_set_bit() loop below, we don't need
2665 * any additional checks for vl.
2666 */
2667 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2668 8 * sizeof(req->vl_select_mask)) {
2669 memset(vlinfo, 0, sizeof(*vlinfo));
2670
2671 rsp->vls[vfi].port_vl_xmit_data =
2672 cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
2673 idx_from_vl(vl)));
2674
2675 rsp->vls[vfi].port_vl_rcv_data =
2676 cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
2677 idx_from_vl(vl)));
2678 rsp->vls[vfi].port_vl_rcv_bubble =
2679 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
2680 idx_from_vl(vl)));
2681
2682 rsp->vls[vfi].port_vl_xmit_pkts =
2683 cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
2684 idx_from_vl(vl)));
2685
2686 rsp->vls[vfi].port_vl_rcv_pkts =
2687 cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
2688 idx_from_vl(vl)));
2689
2690 rsp->vls[vfi].port_vl_xmit_wait =
2691 cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
2692 idx_from_vl(vl)));
2693
2694 rsp->vls[vfi].port_vl_rcv_fecn =
2695 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
2696 idx_from_vl(vl)));
2697 rsp->vls[vfi].port_vl_rcv_becn =
2698 cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
2699 idx_from_vl(vl)));
2700
2701 /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
2702 /* rsp->port_vl_xmit_wasted_bw ??? */
2703 /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
2704 * does this differ from rsp->vls[vfi].port_vl_xmit_wait */
2705 /*rsp->vls[vfi].port_vl_mark_fecn =
2706 cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
2707 + offset));
2708 */
2709 vlinfo++;
2710 vfi++;
2711 }
2712
2713 a0_datacounters(dd, rsp, vl_select_mask);
2714
2715 if (resp_len)
2716 *resp_len += response_data_size;
2717
2718 return reply((struct ib_mad_hdr *)pmp);
2719}
2720
2721static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
2722 struct ib_device *ibdev, u8 port, u32 *resp_len)
2723{
2724 size_t response_data_size;
2725 struct _port_ectrs *rsp;
2726 unsigned long port_num;
2727 struct opa_port_error_counters64_msg *req;
2728 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2729 u32 num_ports;
2730 u8 num_pslm;
2731 u8 num_vls;
2732 struct hfi1_ibport *ibp;
2733 struct hfi1_pportdata *ppd;
2734 struct _vls_ectrs *vlinfo;
2735 unsigned long vl;
2736 u64 port_mask, tmp, tmp2;
2737 u32 vl_select_mask;
2738 int vfi;
2739
2740 req = (struct opa_port_error_counters64_msg *)pmp->data;
2741
2742 num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
2743
2744 num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
2745 num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
2746
2747 if (num_ports != 1 || num_ports != num_pslm) {
2748 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2749 return reply((struct ib_mad_hdr *)pmp);
2750 }
2751
2752 response_data_size = sizeof(struct opa_port_error_counters64_msg) +
2753 num_vls * sizeof(struct _vls_ectrs);
2754
2755 if (response_data_size > sizeof(pmp->data)) {
2756 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2757 return reply((struct ib_mad_hdr *)pmp);
2758 }
2759 /*
2760 * The bit set in the mask needs to be consistent with the
2761 * port the request came in on.
2762 */
2763 port_mask = be64_to_cpu(req->port_select_mask[3]);
2764 port_num = find_first_bit((unsigned long *)&port_mask,
2765 sizeof(port_mask) * 8);
2766
2767 if ((u8)port_num != port) {
2768 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2769 return reply((struct ib_mad_hdr *)pmp);
2770 }
2771
2772 rsp = (struct _port_ectrs *)&(req->port[0]);
2773
2774 ibp = to_iport(ibdev, port_num);
2775 ppd = ppd_from_ibp(ibp);
2776
2777 memset(rsp, 0, sizeof(*rsp));
2778 rsp->port_number = (u8)port_num;
2779
2780 rsp->port_rcv_constraint_errors =
2781 cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
2782 CNTR_INVALID_VL));
2783 /* port_rcv_switch_relay_errors is 0 for HFIs */
2784 rsp->port_xmit_discards =
2785 cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
2786 CNTR_INVALID_VL));
2787 rsp->port_rcv_remote_physical_errors =
2788 cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
2789 CNTR_INVALID_VL));
2790 tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
2791 tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
2792 if (tmp2 < tmp) {
2793 /* overflow/wrapped */
2794 rsp->local_link_integrity_errors = cpu_to_be64(~0);
2795 } else {
2796 rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
2797 }
2798 tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
2799 tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
2800 CNTR_INVALID_VL);
2801 if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
2802 /* overflow/wrapped */
2803 rsp->link_error_recovery = cpu_to_be32(~0);
2804 } else {
2805 rsp->link_error_recovery = cpu_to_be32(tmp2);
2806 }
2807 rsp->port_xmit_constraint_errors =
2808 cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
2809 CNTR_INVALID_VL));
2810 rsp->excessive_buffer_overruns =
2811 cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
2812 rsp->fm_config_errors =
2813 cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
2814 CNTR_INVALID_VL));
2815 rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
2816 CNTR_INVALID_VL));
2817 tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
2818 rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
2819
2820 vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]);
2821 vfi = 0;
2822 vl_select_mask = be32_to_cpu(req->vl_select_mask);
2823 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
2824 8 * sizeof(req->vl_select_mask)) {
2825 memset(vlinfo, 0, sizeof(*vlinfo));
2826 /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
2827 vlinfo += 1;
2828 vfi++;
2829 }
2830
2831 if (resp_len)
2832 *resp_len += response_data_size;
2833
2834 return reply((struct ib_mad_hdr *)pmp);
2835}
2836
2837static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
2838 struct ib_device *ibdev, u8 port, u32 *resp_len)
2839{
2840 size_t response_data_size;
2841 struct _port_ei *rsp;
2842 struct opa_port_error_info_msg *req;
2843 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2844 u64 port_mask;
2845 u32 num_ports;
2846 unsigned long port_num;
2847 u8 num_pslm;
2848 u64 reg;
2849
2850 req = (struct opa_port_error_info_msg *)pmp->data;
2851 rsp = (struct _port_ei *)&(req->port[0]);
2852
2853 num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
2854 num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
2855
2856 memset(rsp, 0, sizeof(*rsp));
2857
2858 if (num_ports != 1 || num_ports != num_pslm) {
2859 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2860 return reply((struct ib_mad_hdr *)pmp);
2861 }
2862
2863 /* Sanity check */
2864 response_data_size = sizeof(struct opa_port_error_info_msg);
2865
2866 if (response_data_size > sizeof(pmp->data)) {
2867 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2868 return reply((struct ib_mad_hdr *)pmp);
2869 }
2870
2871 /*
2872 * The bit set in the mask needs to be consistent with the port
2873 * the request came in on.
2874 */
2875 port_mask = be64_to_cpu(req->port_select_mask[3]);
2876 port_num = find_first_bit((unsigned long *)&port_mask,
2877 sizeof(port_mask) * 8);
2878
2879 if ((u8)port_num != port) {
2880 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2881 return reply((struct ib_mad_hdr *)pmp);
2882 }
2883
2884 /* PortRcvErrorInfo */
2885 rsp->port_rcv_ei.status_and_code =
2886 dd->err_info_rcvport.status_and_code;
2887 memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
2888 &dd->err_info_rcvport.packet_flit1, sizeof(u64));
2889 memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
2890 &dd->err_info_rcvport.packet_flit2, sizeof(u64));
2891
2892 /* ExcessiveBufferOverrunInfo */
2893 reg = read_csr(dd, RCV_ERR_INFO);
2894 if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
2895 /* if the RcvExcessBufferOverrun bit is set, save SC of
2896 * first pkt that encountered an excess buffer overrun */
2897 u8 tmp = (u8)reg;
2898
2899 tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
2900 tmp <<= 2;
2901 rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
2902 /* set the status bit */
2903 rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
2904 }
2905
2906 rsp->port_xmit_constraint_ei.status =
2907 dd->err_info_xmit_constraint.status;
2908 rsp->port_xmit_constraint_ei.pkey =
2909 cpu_to_be16(dd->err_info_xmit_constraint.pkey);
2910 rsp->port_xmit_constraint_ei.slid =
2911 cpu_to_be32(dd->err_info_xmit_constraint.slid);
2912
2913 rsp->port_rcv_constraint_ei.status =
2914 dd->err_info_rcv_constraint.status;
2915 rsp->port_rcv_constraint_ei.pkey =
2916 cpu_to_be16(dd->err_info_rcv_constraint.pkey);
2917 rsp->port_rcv_constraint_ei.slid =
2918 cpu_to_be32(dd->err_info_rcv_constraint.slid);
2919
2920 /* UncorrectableErrorInfo */
2921 rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
2922
2923 /* FMConfigErrorInfo */
2924 rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
2925
2926 if (resp_len)
2927 *resp_len += response_data_size;
2928
2929 return reply((struct ib_mad_hdr *)pmp);
2930}
2931
2932static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
2933 struct ib_device *ibdev, u8 port, u32 *resp_len)
2934{
2935 struct opa_clear_port_status *req =
2936 (struct opa_clear_port_status *)pmp->data;
2937 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
2938 struct hfi1_ibport *ibp = to_iport(ibdev, port);
2939 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
2940 u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
2941 u64 portn = be64_to_cpu(req->port_select_mask[3]);
2942 u32 counter_select = be32_to_cpu(req->counter_select_mask);
2943 u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
2944 unsigned long vl;
2945
2946 if ((nports != 1) || (portn != 1 << port)) {
2947 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
2948 return reply((struct ib_mad_hdr *)pmp);
2949 }
2950 /*
2951 * only counters returned by pma_get_opa_portstatus() are
2952 * handled, so when pma_get_opa_portstatus() gets a fix,
2953 * the corresponding change should be made here as well.
2954 */
2955
2956 if (counter_select & CS_PORT_XMIT_DATA)
2957 write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
2958
2959 if (counter_select & CS_PORT_RCV_DATA)
2960 write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
2961
2962 if (counter_select & CS_PORT_XMIT_PKTS)
2963 write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
2964
2965 if (counter_select & CS_PORT_RCV_PKTS)
2966 write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
2967
2968 if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
2969 write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
2970
2971 if (counter_select & CS_PORT_MCAST_RCV_PKTS)
2972 write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
2973
2974 if (counter_select & CS_PORT_XMIT_WAIT)
2975 write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
2976
2977 /* ignore cs_sw_portCongestion for HFIs */
2978
2979 if (counter_select & CS_PORT_RCV_FECN)
2980 write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
2981
2982 if (counter_select & CS_PORT_RCV_BECN)
2983 write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
2984
2985 /* ignore cs_port_xmit_time_cong for HFIs */
2986 /* ignore cs_port_xmit_wasted_bw for now */
2987 /* ignore cs_port_xmit_wait_data for now */
2988 if (counter_select & CS_PORT_RCV_BUBBLE)
2989 write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
2990
2991 /* Only applicable for switch */
2992 /*if (counter_select & CS_PORT_MARK_FECN)
2993 write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/
2994
2995 if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
2996 write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
2997
2998 /* ignore cs_port_rcv_switch_relay_errors for HFIs */
2999 if (counter_select & CS_PORT_XMIT_DISCARDS)
3000 write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
3001
3002 if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
3003 write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
3004
3005 if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
3006 write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
3007
3008 if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
3009 write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
3010 write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
3011 }
3012
3013 if (counter_select & CS_LINK_ERROR_RECOVERY) {
3014 write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
3015 write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
3016 CNTR_INVALID_VL, 0);
3017 }
3018
3019 if (counter_select & CS_PORT_RCV_ERRORS)
3020 write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
3021
3022 if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
3023 write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
3024 dd->rcv_ovfl_cnt = 0;
3025 }
3026
3027 if (counter_select & CS_FM_CONFIG_ERRORS)
3028 write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
3029
3030 if (counter_select & CS_LINK_DOWNED)
3031 write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
3032
3033 if (counter_select & CS_UNCORRECTABLE_ERRORS)
3034 write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
3035
3036 for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
3037 8 * sizeof(vl_select_mask)) {
3038
3039 if (counter_select & CS_PORT_XMIT_DATA)
3040 write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
3041
3042 if (counter_select & CS_PORT_RCV_DATA)
3043 write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
3044
3045 if (counter_select & CS_PORT_XMIT_PKTS)
3046 write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
3047
3048 if (counter_select & CS_PORT_RCV_PKTS)
3049 write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
3050
3051 if (counter_select & CS_PORT_XMIT_WAIT)
3052 write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
3053
3054 /* sw_port_vl_congestion is 0 for HFIs */
3055 if (counter_select & CS_PORT_RCV_FECN)
3056 write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
3057
3058 if (counter_select & CS_PORT_RCV_BECN)
3059 write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
3060
3061 /* port_vl_xmit_time_cong is 0 for HFIs */
3062 /* port_vl_xmit_wasted_bw ??? */
3063 /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
3064 if (counter_select & CS_PORT_RCV_BUBBLE)
3065 write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
3066
3067 /*if (counter_select & CS_PORT_MARK_FECN)
3068 write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
3069 */
3070 /* port_vl_xmit_discards ??? */
3071 }
3072
3073 if (resp_len)
3074 *resp_len += sizeof(*req);
3075
3076 return reply((struct ib_mad_hdr *)pmp);
3077}
3078
3079static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
3080 struct ib_device *ibdev, u8 port, u32 *resp_len)
3081{
3082 struct _port_ei *rsp;
3083 struct opa_port_error_info_msg *req;
3084 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
3085 u64 port_mask;
3086 u32 num_ports;
3087 unsigned long port_num;
3088 u8 num_pslm;
3089 u32 error_info_select;
3090
3091 req = (struct opa_port_error_info_msg *)pmp->data;
3092 rsp = (struct _port_ei *)&(req->port[0]);
3093
3094 num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
3095 num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
3096
3097 memset(rsp, 0, sizeof(*rsp));
3098
3099 if (num_ports != 1 || num_ports != num_pslm) {
3100 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
3101 return reply((struct ib_mad_hdr *)pmp);
3102 }
3103
3104 /*
3105 * The bit set in the mask needs to be consistent with the port
3106 * the request came in on.
3107 */
3108 port_mask = be64_to_cpu(req->port_select_mask[3]);
3109 port_num = find_first_bit((unsigned long *)&port_mask,
3110 sizeof(port_mask) * 8);
3111
3112 if ((u8)port_num != port) {
3113 pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
3114 return reply((struct ib_mad_hdr *)pmp);
3115 }
3116
3117 error_info_select = be32_to_cpu(req->error_info_select_mask);
3118
3119 /* PortRcvErrorInfo */
3120 if (error_info_select & ES_PORT_RCV_ERROR_INFO)
3121 /* turn off status bit */
3122 dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
3123
3124 /* ExcessiveBufferOverrunInfo */
3125 if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
3126 /* status bit is essentially kept in the h/w - bit 5 of
3127 * RCV_ERR_INFO */
3128 write_csr(dd, RCV_ERR_INFO,
3129 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
3130
3131 if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
3132 dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
3133
3134 if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
3135 dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
3136
3137 /* UncorrectableErrorInfo */
3138 if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
3139 /* turn off status bit */
3140 dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
3141
3142 /* FMConfigErrorInfo */
3143 if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
3144 /* turn off status bit */
3145 dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
3146
3147 if (resp_len)
3148 *resp_len += sizeof(*req);
3149
3150 return reply((struct ib_mad_hdr *)pmp);
3151}
3152
3153struct opa_congestion_info_attr {
3154 __be16 congestion_info;
3155 u8 control_table_cap; /* Multiple of 64 entry unit CCTs */
3156 u8 congestion_log_length;
3157} __packed;
3158
3159static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
3160 struct ib_device *ibdev, u8 port,
3161 u32 *resp_len)
3162{
3163 struct opa_congestion_info_attr *p =
3164 (struct opa_congestion_info_attr *)data;
3165 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3166 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3167
3168 p->congestion_info = 0;
3169 p->control_table_cap = ppd->cc_max_table_entries;
3170 p->congestion_log_length = OPA_CONG_LOG_ELEMS;
3171
3172 if (resp_len)
3173 *resp_len += sizeof(*p);
3174
3175 return reply((struct ib_mad_hdr *)smp);
3176}
3177
3178static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
3179 u8 *data,
3180 struct ib_device *ibdev,
3181 u8 port, u32 *resp_len)
3182{
3183 int i;
3184 struct opa_congestion_setting_attr *p =
3185 (struct opa_congestion_setting_attr *) data;
3186 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3187 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3188 struct opa_congestion_setting_entry_shadow *entries;
3189 struct cc_state *cc_state;
3190
3191 rcu_read_lock();
3192
3193 cc_state = get_cc_state(ppd);
3194
3195 if (cc_state == NULL) {
3196 rcu_read_unlock();
3197 return reply((struct ib_mad_hdr *)smp);
3198 }
3199
3200 entries = cc_state->cong_setting.entries;
3201 p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
3202 p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
3203 for (i = 0; i < OPA_MAX_SLS; i++) {
3204 p->entries[i].ccti_increase = entries[i].ccti_increase;
3205 p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
3206 p->entries[i].trigger_threshold =
3207 entries[i].trigger_threshold;
3208 p->entries[i].ccti_min = entries[i].ccti_min;
3209 }
3210
3211 rcu_read_unlock();
3212
3213 if (resp_len)
3214 *resp_len += sizeof(*p);
3215
3216 return reply((struct ib_mad_hdr *)smp);
3217}
3218
3219static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
3220 struct ib_device *ibdev, u8 port,
3221 u32 *resp_len)
3222{
3223 struct opa_congestion_setting_attr *p =
3224 (struct opa_congestion_setting_attr *) data;
3225 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3226 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3227 struct opa_congestion_setting_entry_shadow *entries;
3228 int i;
3229
3230 ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
3231
3232 entries = ppd->congestion_entries;
3233 for (i = 0; i < OPA_MAX_SLS; i++) {
3234 entries[i].ccti_increase = p->entries[i].ccti_increase;
3235 entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
3236 entries[i].trigger_threshold =
3237 p->entries[i].trigger_threshold;
3238 entries[i].ccti_min = p->entries[i].ccti_min;
3239 }
3240
3241 return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
3242 resp_len);
3243}
3244
3245static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
3246 u8 *data, struct ib_device *ibdev,
3247 u8 port, u32 *resp_len)
3248{
3249 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3250 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3251 struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
3252 s64 ts;
3253 int i;
3254
3255 if (am != 0) {
3256 smp->status |= IB_SMP_INVALID_FIELD;
3257 return reply((struct ib_mad_hdr *)smp);
3258 }
3259
3260 spin_lock(&ppd->cc_log_lock);
3261
3262 cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
3263 cong_log->congestion_flags = 0;
3264 cong_log->threshold_event_counter =
3265 cpu_to_be16(ppd->threshold_event_counter);
3266 memcpy(cong_log->threshold_cong_event_map,
3267 ppd->threshold_cong_event_map,
3268 sizeof(cong_log->threshold_cong_event_map));
3269 /* keep timestamp in units of 1.024 usec */
3270 ts = ktime_to_ns(ktime_get()) / 1024;
3271 cong_log->current_time_stamp = cpu_to_be32(ts);
3272 for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
3273 struct opa_hfi1_cong_log_event_internal *cce =
3274 &ppd->cc_events[ppd->cc_mad_idx++];
3275 if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
3276 ppd->cc_mad_idx = 0;
3277 /*
3278 * Entries which are older than twice the time
3279 * required to wrap the counter are supposed to
3280 * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
3281 */
3282 if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
3283 continue;
3284 memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
3285 memcpy(cong_log->events[i].remote_qp_number_cn_entry,
3286 &cce->rqpn, 3);
3287 cong_log->events[i].sl_svc_type_cn_entry =
3288 ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
3289 cong_log->events[i].remote_lid_cn_entry =
3290 cpu_to_be32(cce->rlid);
3291 cong_log->events[i].timestamp_cn_entry =
3292 cpu_to_be32(cce->timestamp);
3293 }
3294
3295 /*
3296 * Reset threshold_cong_event_map, and threshold_event_counter
3297 * to 0 when log is read.
3298 */
3299 memset(ppd->threshold_cong_event_map, 0x0,
3300 sizeof(ppd->threshold_cong_event_map));
3301 ppd->threshold_event_counter = 0;
3302
3303 spin_unlock(&ppd->cc_log_lock);
3304
3305 if (resp_len)
3306 *resp_len += sizeof(struct opa_hfi1_cong_log);
3307
3308 return reply((struct ib_mad_hdr *)smp);
3309}
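/*
 * Worked numbers for the staleness check above: timestamps are kept in
 * 1.024 usec units, so a 32-bit timestamp wraps after roughly
 * 2^32 * 1.024 usec ~= 4398 seconds (about 73 minutes).  The
 * (2 * UINT_MAX) comparison therefore treats any event older than
 * about 146 minutes as expired, i.e. twice the wrap time referenced in
 * the CA10-49 note inside the loop.
 */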
3310
3311static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
3312 struct ib_device *ibdev, u8 port,
3313 u32 *resp_len)
3314{
3315 struct ib_cc_table_attr *cc_table_attr =
3316 (struct ib_cc_table_attr *) data;
3317 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3318 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3319 u32 start_block = OPA_AM_START_BLK(am);
3320 u32 n_blocks = OPA_AM_NBLK(am);
3321 struct ib_cc_table_entry_shadow *entries;
3322 int i, j;
3323 u32 sentry, eentry;
3324 struct cc_state *cc_state;
3325
3326 /* sanity check n_blocks, start_block */
3327 if (n_blocks == 0 ||
3328 start_block + n_blocks > ppd->cc_max_table_entries) {
3329 smp->status |= IB_SMP_INVALID_FIELD;
3330 return reply((struct ib_mad_hdr *)smp);
3331 }
3332
3333 rcu_read_lock();
3334
3335 cc_state = get_cc_state(ppd);
3336
3337 if (cc_state == NULL) {
3338 rcu_read_unlock();
3339 return reply((struct ib_mad_hdr *)smp);
3340 }
3341
3342 sentry = start_block * IB_CCT_ENTRIES;
3343 eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
3344
3345 cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
3346
3347 entries = cc_state->cct.entries;
3348
3349 /* return n_blocks, though the last block may not be full */
3350 for (j = 0, i = sentry; i < eentry; j++, i++)
3351 cc_table_attr->ccti_entries[j].entry =
3352 cpu_to_be16(entries[i].entry);
3353
3354 rcu_read_unlock();
3355
3356 if (resp_len)
3357 *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1);
3358
3359 return reply((struct ib_mad_hdr *)smp);
3360}
3361
3362void cc_state_reclaim(struct rcu_head *rcu)
3363{
3364 struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
3365
3366 kfree(cc_state);
3367}
3368
3369static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
3370 struct ib_device *ibdev, u8 port,
3371 u32 *resp_len)
3372{
3373 struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data;
3374 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3375 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3376 u32 start_block = OPA_AM_START_BLK(am);
3377 u32 n_blocks = OPA_AM_NBLK(am);
3378 struct ib_cc_table_entry_shadow *entries;
3379 int i, j;
3380 u32 sentry, eentry;
3381 u16 ccti_limit;
3382 struct cc_state *old_cc_state, *new_cc_state;
3383
3384 /* sanity check n_blocks, start_block */
3385 if (n_blocks == 0 ||
3386 start_block + n_blocks > ppd->cc_max_table_entries) {
3387 smp->status |= IB_SMP_INVALID_FIELD;
3388 return reply((struct ib_mad_hdr *)smp);
3389 }
3390
3391 sentry = start_block * IB_CCT_ENTRIES;
3392 eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
3393 (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
3394
3395 /* sanity check ccti_limit */
3396 ccti_limit = be16_to_cpu(p->ccti_limit);
3397 if (ccti_limit + 1 > eentry) {
3398 smp->status |= IB_SMP_INVALID_FIELD;
3399 return reply((struct ib_mad_hdr *)smp);
3400 }
3401
3402 new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
3403 if (new_cc_state == NULL)
3404 goto getit;
3405
3406 spin_lock(&ppd->cc_state_lock);
3407
3408 old_cc_state = get_cc_state(ppd);
3409
3410 if (old_cc_state == NULL) {
3411 spin_unlock(&ppd->cc_state_lock);
3412 kfree(new_cc_state);
3413 return reply((struct ib_mad_hdr *)smp);
3414 }
3415
3416 *new_cc_state = *old_cc_state;
3417
3418 new_cc_state->cct.ccti_limit = ccti_limit;
3419
3420 entries = ppd->ccti_entries;
3421 ppd->total_cct_entry = ccti_limit + 1;
3422
3423 for (j = 0, i = sentry; i < eentry; j++, i++)
3424 entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
3425
3426 memcpy(new_cc_state->cct.entries, entries,
3427 eentry * sizeof(struct ib_cc_table_entry));
3428
3429 new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
3430 new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
3431 memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
3432 OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
3433
3434 rcu_assign_pointer(ppd->cc_state, new_cc_state);
3435
3436 spin_unlock(&ppd->cc_state_lock);
3437
3438 call_rcu(&old_cc_state->rcu, cc_state_reclaim);
3439
3440getit:
3441 return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
3442}
3443
3444struct opa_led_info {
3445 __be32 rsvd_led_mask;
3446 __be32 rsvd;
3447};
3448
3449#define OPA_LED_SHIFT 31
3450#define OPA_LED_MASK (1 << OPA_LED_SHIFT)
3451
3452static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
3453 struct ib_device *ibdev, u8 port,
3454 u32 *resp_len)
3455{
3456 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
3457 struct opa_led_info *p = (struct opa_led_info *) data;
3458 u32 nport = OPA_AM_NPORT(am);
3459 u64 reg;
3460
3461 if (nport != 1 || OPA_AM_PORTNUM(am)) {
3462 smp->status |= IB_SMP_INVALID_FIELD;
3463 return reply((struct ib_mad_hdr *)smp);
3464 }
3465
3466 reg = read_csr(dd, DCC_CFG_LED_CNTRL);
3467 if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) &&
3468 ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf))
3469 p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK);
3470
3471 if (resp_len)
3472 *resp_len += sizeof(struct opa_led_info);
3473
3474 return reply((struct ib_mad_hdr *)smp);
3475}
3476
3477static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
3478 struct ib_device *ibdev, u8 port,
3479 u32 *resp_len)
3480{
3481 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
3482 struct opa_led_info *p = (struct opa_led_info *) data;
3483 u32 nport = OPA_AM_NPORT(am);
3484 int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
3485
3486 if (nport != 1 || OPA_AM_PORTNUM(am)) {
3487 smp->status |= IB_SMP_INVALID_FIELD;
3488 return reply((struct ib_mad_hdr *)smp);
3489 }
3490
3491 setextled(dd, on);
3492
3493 return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
3494}
3495
3496static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
3497 u8 *data, struct ib_device *ibdev, u8 port,
3498 u32 *resp_len)
3499{
3500 int ret;
3501 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3502
3503 switch (attr_id) {
3504 case IB_SMP_ATTR_NODE_DESC:
3505 ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
3506 resp_len);
3507 break;
3508 case IB_SMP_ATTR_NODE_INFO:
3509 ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
3510 resp_len);
3511 break;
3512 case IB_SMP_ATTR_PORT_INFO:
3513 ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
3514 resp_len);
3515 break;
3516 case IB_SMP_ATTR_PKEY_TABLE:
3517 ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
3518 resp_len);
3519 break;
3520 case OPA_ATTRIB_ID_SL_TO_SC_MAP:
3521 ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
3522 resp_len);
3523 break;
3524 case OPA_ATTRIB_ID_SC_TO_SL_MAP:
3525 ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
3526 resp_len);
3527 break;
3528 case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
3529 ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
3530 resp_len);
3531 break;
3532 case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
3533 ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
3534 resp_len);
3535 break;
3536 case OPA_ATTRIB_ID_PORT_STATE_INFO:
3537 ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
3538 resp_len);
3539 break;
3540 case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
3541 ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
3542 resp_len);
3543 break;
3544 case OPA_ATTRIB_ID_CABLE_INFO:
3545 ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
3546 resp_len);
3547 break;
3548 case IB_SMP_ATTR_VL_ARB_TABLE:
3549 ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
3550 resp_len);
3551 break;
3552 case OPA_ATTRIB_ID_CONGESTION_INFO:
3553 ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
3554 resp_len);
3555 break;
3556 case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
3557 ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
3558 port, resp_len);
3559 break;
3560 case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
3561 ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
3562 port, resp_len);
3563 break;
3564 case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
3565 ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
3566 resp_len);
3567 break;
3568 case IB_SMP_ATTR_LED_INFO:
3569 ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
3570 resp_len);
3571 break;
3572 case IB_SMP_ATTR_SM_INFO:
3573 if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
3574 return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
3575 if (ibp->port_cap_flags & IB_PORT_SM)
3576 return IB_MAD_RESULT_SUCCESS;
3577 /* FALLTHROUGH */
3578 default:
3579 smp->status |= IB_SMP_UNSUP_METH_ATTR;
3580 ret = reply((struct ib_mad_hdr *)smp);
3581 break;
3582 }
3583 return ret;
3584}
3585
3586static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
3587 u8 *data, struct ib_device *ibdev, u8 port,
3588 u32 *resp_len)
3589{
3590 int ret;
3591 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3592
3593 switch (attr_id) {
3594 case IB_SMP_ATTR_PORT_INFO:
3595 ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
3596 resp_len);
3597 break;
3598 case IB_SMP_ATTR_PKEY_TABLE:
3599 ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
3600 resp_len);
3601 break;
3602 case OPA_ATTRIB_ID_SL_TO_SC_MAP:
3603 ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
3604 resp_len);
3605 break;
3606 case OPA_ATTRIB_ID_SC_TO_SL_MAP:
3607 ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
3608 resp_len);
3609 break;
3610 case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
3611 ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
3612 resp_len);
3613 break;
3614 case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
3615 ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
3616 resp_len);
3617 break;
3618 case OPA_ATTRIB_ID_PORT_STATE_INFO:
3619 ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
3620 resp_len);
3621 break;
3622 case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
3623 ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
3624 resp_len);
3625 break;
3626 case IB_SMP_ATTR_VL_ARB_TABLE:
3627 ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
3628 resp_len);
3629 break;
3630 case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
3631 ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
3632 port, resp_len);
3633 break;
3634 case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
3635 ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
3636 resp_len);
3637 break;
3638 case IB_SMP_ATTR_LED_INFO:
3639 ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
3640 resp_len);
3641 break;
3642 case IB_SMP_ATTR_SM_INFO:
3643 if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
3644 return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
3645 if (ibp->port_cap_flags & IB_PORT_SM)
3646 return IB_MAD_RESULT_SUCCESS;
3647 /* FALLTHROUGH */
3648 default:
3649 smp->status |= IB_SMP_UNSUP_METH_ATTR;
3650 ret = reply((struct ib_mad_hdr *)smp);
3651 break;
3652 }
3653 return ret;
3654}
3655
3656static inline void set_aggr_error(struct opa_aggregate *ag)
3657{
3658 ag->err_reqlength |= cpu_to_be16(0x8000);
3659}
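/*
 * As read from this code (not a spec quotation): bit 15 of
 * opa_aggregate.err_reqlength is the per-attribute error flag set
 * here, and the low 7 bits carry the attribute payload length in
 * 8-byte units, which is why the aggregate handlers below compute
 * (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8.
 */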
3660
3661static int subn_get_opa_aggregate(struct opa_smp *smp,
3662 struct ib_device *ibdev, u8 port,
3663 u32 *resp_len)
3664{
3665 int i;
3666 u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
3667 u8 *next_smp = opa_get_smp_data(smp);
3668
3669 if (num_attr < 1 || num_attr > 117) {
3670 smp->status |= IB_SMP_INVALID_FIELD;
3671 return reply((struct ib_mad_hdr *)smp);
3672 }
3673
3674 for (i = 0; i < num_attr; i++) {
3675 struct opa_aggregate *agg;
3676 size_t agg_data_len;
3677 size_t agg_size;
3678 u32 am;
3679
3680 agg = (struct opa_aggregate *)next_smp;
3681 agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
3682 agg_size = sizeof(*agg) + agg_data_len;
3683 am = be32_to_cpu(agg->attr_mod);
3684
3685 *resp_len += agg_size;
3686
3687 if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
3688 smp->status |= IB_SMP_INVALID_FIELD;
3689 return reply((struct ib_mad_hdr *)smp);
3690 }
3691
3692 /* zero the payload for this segment */
3693 memset(next_smp + sizeof(*agg), 0, agg_data_len);
3694
3695 (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
3696 ibdev, port, NULL);
3697 if (smp->status & ~IB_SMP_DIRECTION) {
3698 set_aggr_error(agg);
3699 return reply((struct ib_mad_hdr *)smp);
3700 }
3701 next_smp += agg_size;
3702
3703 }
3704
3705 return reply((struct ib_mad_hdr *)smp);
3706}
3707
3708static int subn_set_opa_aggregate(struct opa_smp *smp,
3709 struct ib_device *ibdev, u8 port,
3710 u32 *resp_len)
3711{
3712 int i;
3713 u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
3714 u8 *next_smp = opa_get_smp_data(smp);
3715
3716 if (num_attr < 1 || num_attr > 117) {
3717 smp->status |= IB_SMP_INVALID_FIELD;
3718 return reply((struct ib_mad_hdr *)smp);
3719 }
3720
3721 for (i = 0; i < num_attr; i++) {
3722 struct opa_aggregate *agg;
3723 size_t agg_data_len;
3724 size_t agg_size;
3725 u32 am;
3726
3727 agg = (struct opa_aggregate *)next_smp;
3728 agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
3729 agg_size = sizeof(*agg) + agg_data_len;
3730 am = be32_to_cpu(agg->attr_mod);
3731
3732 *resp_len += agg_size;
3733
3734 if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
3735 smp->status |= IB_SMP_INVALID_FIELD;
3736 return reply((struct ib_mad_hdr *)smp);
3737 }
3738
3739 (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
3740 ibdev, port, NULL);
3741 if (smp->status & ~IB_SMP_DIRECTION) {
3742 set_aggr_error(agg);
3743 return reply((struct ib_mad_hdr *)smp);
3744 }
3745 next_smp += agg_size;
3746
3747 }
3748
3749 return reply((struct ib_mad_hdr *)smp);
3750}
3751
3752/*
3753 * OPAv1 specifies that, on the transition to link up, these counters
3754 * are cleared:
3755 * PortRcvErrors [*]
3756 * LinkErrorRecovery
3757 * LocalLinkIntegrityErrors
3758 * ExcessiveBufferOverruns [*]
3759 *
3760 * [*] Error info associated with these counters is retained, but the
3761 * error info status is reset to 0.
3762 */
3763void clear_linkup_counters(struct hfi1_devdata *dd)
3764{
3765 /* PortRcvErrors */
3766 write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
3767 dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
3768 /* LinkErrorRecovery */
3769 write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
3770 write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
3771 /* LocalLinkIntegrityErrors */
3772 write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
3773 write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
3774 /* ExcessiveBufferOverruns */
3775 write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
3776 dd->rcv_ovfl_cnt = 0;
3777 dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
3778}
3779
3780/*
3781 * is_local_mad() returns 1 if 'mad' is sent from, and destined to,
3782 * the local node; 0 otherwise.
3783 */
3784static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
3785 const struct ib_wc *in_wc)
3786{
3787 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3788 const struct opa_smp *smp = (const struct opa_smp *)mad;
3789
3790 if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
3791 return (smp->hop_cnt == 0 &&
3792 smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
3793 smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
3794 }
3795
3796 return (in_wc->slid == ppd->lid);
3797}
3798
3799/*
3800 * opa_local_smp_check() should only be called on MADs for which
3801 * is_local_mad() returns true. It applies the SMP checks that are
3802 * specific to SMPs which are sent from, and destined to this node.
3803 * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
3804 * otherwise.
3805 *
3806 * SMPs which arrive from other nodes are instead checked by
3807 * opa_smp_check().
3808 */
3809static int opa_local_smp_check(struct hfi1_ibport *ibp,
3810 const struct ib_wc *in_wc)
3811{
3812 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
3813 u16 slid = in_wc->slid;
3814 u16 pkey;
3815
3816 if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
3817 return 1;
3818
3819 pkey = ppd->pkeys[in_wc->pkey_index];
3820 /*
3821 * We need to do the "node-local" checks specified in OPAv1,
3822 * rev 0.90, section 9.10.26, which are:
3823 * - pkey is 0x7fff, or 0xffff
3824 * - Source QPN == 0 || Destination QPN == 0
3825 * - the MAD header's management class is either
3826 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
3827 * IB_MGMT_CLASS_SUBN_LID_ROUTED
3828 * - SLID != 0
3829 *
3830 * However, we know (and so don't need to check again) that,
3831 * for local SMPs, the MAD stack passes MADs with:
3832 * - Source QPN of 0
3833 * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
3834 * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
3835 * our own port's lid
3836 *
3837 */
3838 if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
3839 return 0;
3840 ingress_pkey_table_fail(ppd, pkey, slid);
3841 return 1;
3842}
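/*
 * For reference: the two pkeys accepted above correspond to the values
 * named in the comment ("0x7fff, or 0xffff"), i.e. LIM_MGMT_P_KEY is
 * the limited management pkey and FULL_MGMT_P_KEY the full management
 * pkey; any other pkey on a local SMP is recorded as an ingress pkey
 * table failure and the MAD is rejected.
 */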
3843
3844static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
3845 u8 port, const struct opa_mad *in_mad,
3846 struct opa_mad *out_mad,
3847 u32 *resp_len)
3848{
3849 struct opa_smp *smp = (struct opa_smp *)out_mad;
3850 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3851 u8 *data;
3852 u32 am;
3853 __be16 attr_id;
3854 int ret;
3855
3856 *out_mad = *in_mad;
3857 data = opa_get_smp_data(smp);
3858
3859 am = be32_to_cpu(smp->attr_mod);
3860 attr_id = smp->attr_id;
3861 if (smp->class_version != OPA_SMI_CLASS_VERSION) {
3862 smp->status |= IB_SMP_UNSUP_VERSION;
3863 ret = reply((struct ib_mad_hdr *)smp);
3864 goto bail;
3865 }
3866 ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
3867 smp->route.dr.dr_slid, smp->route.dr.return_path,
3868 smp->hop_cnt);
3869 if (ret) {
3870 u32 port_num = be32_to_cpu(smp->attr_mod);
3871
3872 /*
3873 * If this is a get/set portinfo, we already check the
3874 * M_Key if the MAD is for another port and the M_Key
3875 * is OK on the receiving port. This check is needed
3876 * to increment the error counters when the M_Key
3877 * fails to match on *both* ports.
3878 */
3879 if (attr_id == IB_SMP_ATTR_PORT_INFO &&
3880 (smp->method == IB_MGMT_METHOD_GET ||
3881 smp->method == IB_MGMT_METHOD_SET) &&
3882 port_num && port_num <= ibdev->phys_port_cnt &&
3883 port != port_num)
3884 (void) check_mkey(to_iport(ibdev, port_num),
3885 (struct ib_mad_hdr *)smp, 0,
3886 smp->mkey, smp->route.dr.dr_slid,
3887 smp->route.dr.return_path,
3888 smp->hop_cnt);
3889 ret = IB_MAD_RESULT_FAILURE;
3890 goto bail;
3891 }
3892
3893 *resp_len = opa_get_smp_header_size(smp);
3894
3895 switch (smp->method) {
3896 case IB_MGMT_METHOD_GET:
3897 switch (attr_id) {
3898 default:
3899 clear_opa_smp_data(smp);
3900 ret = subn_get_opa_sma(attr_id, smp, am, data,
3901 ibdev, port, resp_len);
3902 goto bail;
3903 case OPA_ATTRIB_ID_AGGREGATE:
3904 ret = subn_get_opa_aggregate(smp, ibdev, port,
3905 resp_len);
3906 goto bail;
3907 }
3908 case IB_MGMT_METHOD_SET:
3909 switch (attr_id) {
3910 default:
3911 ret = subn_set_opa_sma(attr_id, smp, am, data,
3912 ibdev, port, resp_len);
3913 goto bail;
3914 case OPA_ATTRIB_ID_AGGREGATE:
3915 ret = subn_set_opa_aggregate(smp, ibdev, port,
3916 resp_len);
3917 goto bail;
3918 }
3919 case IB_MGMT_METHOD_TRAP:
3920 case IB_MGMT_METHOD_REPORT:
3921 case IB_MGMT_METHOD_REPORT_RESP:
3922 case IB_MGMT_METHOD_GET_RESP:
3923 /*
3924 * The ib_mad module will call us to process responses
3925 * before checking for other consumers.
3926 * Just tell the caller to process it normally.
3927 */
3928 ret = IB_MAD_RESULT_SUCCESS;
3929 goto bail;
3930 default:
3931 smp->status |= IB_SMP_UNSUP_METHOD;
3932 ret = reply((struct ib_mad_hdr *)smp);
3933 }
3934
3935bail:
3936 return ret;
3937}
3938
3939static int process_subn(struct ib_device *ibdev, int mad_flags,
3940 u8 port, const struct ib_mad *in_mad,
3941 struct ib_mad *out_mad)
3942{
3943 struct ib_smp *smp = (struct ib_smp *)out_mad;
3944 struct hfi1_ibport *ibp = to_iport(ibdev, port);
3945 int ret;
3946
3947 *out_mad = *in_mad;
3948 if (smp->class_version != 1) {
3949 smp->status |= IB_SMP_UNSUP_VERSION;
3950 ret = reply((struct ib_mad_hdr *)smp);
3951 goto bail;
3952 }
3953
3954 ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
3955 smp->mkey, (__force __be32)smp->dr_slid,
3956 smp->return_path, smp->hop_cnt);
3957 if (ret) {
3958 u32 port_num = be32_to_cpu(smp->attr_mod);
3959
3960 /*
3961 * If this is a get/set portinfo, we already check the
3962 * M_Key if the MAD is for another port and the M_Key
3963 * is OK on the receiving port. This check is needed
3964 * to increment the error counters when the M_Key
3965 * fails to match on *both* ports.
3966 */
3967 if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
3968 (smp->method == IB_MGMT_METHOD_GET ||
3969 smp->method == IB_MGMT_METHOD_SET) &&
3970 port_num && port_num <= ibdev->phys_port_cnt &&
3971 port != port_num)
3972 (void) check_mkey(to_iport(ibdev, port_num),
3973 (struct ib_mad_hdr *)smp, 0,
3974 smp->mkey,
3975 (__force __be32)smp->dr_slid,
3976 smp->return_path, smp->hop_cnt);
3977 ret = IB_MAD_RESULT_FAILURE;
3978 goto bail;
3979 }
3980
3981 switch (smp->method) {
3982 case IB_MGMT_METHOD_GET:
3983 switch (smp->attr_id) {
3984 case IB_SMP_ATTR_NODE_INFO:
3985 ret = subn_get_nodeinfo(smp, ibdev, port);
3986 goto bail;
3987 default:
3988 smp->status |= IB_SMP_UNSUP_METH_ATTR;
3989 ret = reply((struct ib_mad_hdr *)smp);
3990 goto bail;
3991 }
3992 }
3993
3994bail:
3995 return ret;
3996}
3997
3998static int process_perf_opa(struct ib_device *ibdev, u8 port,
3999 const struct opa_mad *in_mad,
4000 struct opa_mad *out_mad, u32 *resp_len)
4001{
4002 struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
4003 int ret;
4004
4005 *out_mad = *in_mad;
4006
4007 if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
4008 pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
4009 return reply((struct ib_mad_hdr *)pmp);
4010 }
4011
4012 *resp_len = sizeof(pmp->mad_hdr);
4013
4014 switch (pmp->mad_hdr.method) {
4015 case IB_MGMT_METHOD_GET:
4016 switch (pmp->mad_hdr.attr_id) {
4017 case IB_PMA_CLASS_PORT_INFO:
4018 ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
4019 goto bail;
4020 case OPA_PM_ATTRIB_ID_PORT_STATUS:
4021 ret = pma_get_opa_portstatus(pmp, ibdev, port,
4022 resp_len);
4023 goto bail;
4024 case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
4025 ret = pma_get_opa_datacounters(pmp, ibdev, port,
4026 resp_len);
4027 goto bail;
4028 case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
4029 ret = pma_get_opa_porterrors(pmp, ibdev, port,
4030 resp_len);
4031 goto bail;
4032 case OPA_PM_ATTRIB_ID_ERROR_INFO:
4033 ret = pma_get_opa_errorinfo(pmp, ibdev, port,
4034 resp_len);
4035 goto bail;
4036 default:
4037 pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
4038 ret = reply((struct ib_mad_hdr *)pmp);
4039 goto bail;
4040 }
4041
4042 case IB_MGMT_METHOD_SET:
4043 switch (pmp->mad_hdr.attr_id) {
4044 case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
4045 ret = pma_set_opa_portstatus(pmp, ibdev, port,
4046 resp_len);
4047 goto bail;
4048 case OPA_PM_ATTRIB_ID_ERROR_INFO:
4049 ret = pma_set_opa_errorinfo(pmp, ibdev, port,
4050 resp_len);
4051 goto bail;
4052 default:
4053 pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
4054 ret = reply((struct ib_mad_hdr *)pmp);
4055 goto bail;
4056 }
4057
4058 case IB_MGMT_METHOD_TRAP:
4059 case IB_MGMT_METHOD_GET_RESP:
4060 /*
4061 * The ib_mad module will call us to process responses
4062 * before checking for other consumers.
4063 * Just tell the caller to process it normally.
4064 */
4065 ret = IB_MAD_RESULT_SUCCESS;
4066 goto bail;
4067
4068 default:
4069 pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
4070 ret = reply((struct ib_mad_hdr *)pmp);
4071 }
4072
4073bail:
4074 return ret;
4075}
4076
4077static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
4078 u8 port, const struct ib_wc *in_wc,
4079 const struct ib_grh *in_grh,
4080 const struct opa_mad *in_mad,
4081 struct opa_mad *out_mad, size_t *out_mad_size,
4082 u16 *out_mad_pkey_index)
4083{
4084 int ret;
4085 int pkey_idx;
4086 u32 resp_len = 0;
4087 struct hfi1_ibport *ibp = to_iport(ibdev, port);
4088
4089 pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
4090 if (pkey_idx < 0) {
4091 pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
4092 hfi1_get_pkey(ibp, 1));
4093 pkey_idx = 1;
4094 }
4095 *out_mad_pkey_index = (u16)pkey_idx;
4096
4097 switch (in_mad->mad_hdr.mgmt_class) {
4098 case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
4099 case IB_MGMT_CLASS_SUBN_LID_ROUTED:
4100 if (is_local_mad(ibp, in_mad, in_wc)) {
4101 ret = opa_local_smp_check(ibp, in_wc);
4102 if (ret)
4103 return IB_MAD_RESULT_FAILURE;
4104 }
4105 ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
4106 out_mad, &resp_len);
4107 goto bail;
4108 case IB_MGMT_CLASS_PERF_MGMT:
4109 ret = process_perf_opa(ibdev, port, in_mad, out_mad,
4110 &resp_len);
4111 goto bail;
4112
4113 default:
4114 ret = IB_MAD_RESULT_SUCCESS;
4115 }
4116
4117bail:
4118 if (ret & IB_MAD_RESULT_REPLY)
4119 *out_mad_size = round_up(resp_len, 8);
4120 else if (ret & IB_MAD_RESULT_SUCCESS)
4121 *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
4122
4123 return ret;
4124}
4125
4126static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
4127 const struct ib_wc *in_wc,
4128 const struct ib_grh *in_grh,
4129 const struct ib_mad *in_mad,
4130 struct ib_mad *out_mad)
4131{
4132 int ret;
4133
4134 switch (in_mad->mad_hdr.mgmt_class) {
4135 case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
4136 case IB_MGMT_CLASS_SUBN_LID_ROUTED:
4137 ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
4138 goto bail;
4139 default:
4140 ret = IB_MAD_RESULT_SUCCESS;
4141 }
4142
4143bail:
4144 return ret;
4145}
4146
4147/**
4148 * hfi1_process_mad - process an incoming MAD packet
4149 * @ibdev: the infiniband device this packet came in on
4150 * @mad_flags: MAD flags
4151 * @port: the port number this packet came in on
4152 * @in_wc: the work completion entry for this packet
4153 * @in_grh: the global route header for this packet
4154 * @in_mad: the incoming MAD
4155 * @out_mad: any outgoing MAD reply
4156 *
4157 * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
4158 * interested in processing.
4159 *
4160 * Note that the verbs framework has already done the MAD sanity checks,
4161 * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
4162 * MADs.
4163 *
4164 * This is called by the ib_mad module.
4165 */
4166int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
4167 const struct ib_wc *in_wc, const struct ib_grh *in_grh,
4168 const struct ib_mad_hdr *in_mad, size_t in_mad_size,
4169 struct ib_mad_hdr *out_mad, size_t *out_mad_size,
4170 u16 *out_mad_pkey_index)
4171{
4172 switch (in_mad->base_version) {
4173 case OPA_MGMT_BASE_VERSION:
4174 if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
4175 dev_err(ibdev->dma_device, "invalid in_mad_size\n");
4176 return IB_MAD_RESULT_FAILURE;
4177 }
4178 return hfi1_process_opa_mad(ibdev, mad_flags, port,
4179 in_wc, in_grh,
4180 (struct opa_mad *)in_mad,
4181 (struct opa_mad *)out_mad,
4182 out_mad_size,
4183 out_mad_pkey_index);
4184 case IB_MGMT_BASE_VERSION:
4185 return hfi1_process_ib_mad(ibdev, mad_flags, port,
4186 in_wc, in_grh,
4187 (const struct ib_mad *)in_mad,
4188 (struct ib_mad *)out_mad);
4189 default:
4190 break;
4191 }
4192
4193 return IB_MAD_RESULT_FAILURE;
4194}
4195
4196static void send_handler(struct ib_mad_agent *agent,
4197 struct ib_mad_send_wc *mad_send_wc)
4198{
4199 ib_free_send_mad(mad_send_wc->send_buf);
4200}
4201
4202int hfi1_create_agents(struct hfi1_ibdev *dev)
4203{
4204 struct hfi1_devdata *dd = dd_from_dev(dev);
4205 struct ib_mad_agent *agent;
4206 struct hfi1_ibport *ibp;
4207 int p;
4208 int ret;
4209
4210 for (p = 0; p < dd->num_pports; p++) {
4211 ibp = &dd->pport[p].ibport_data;
4212 agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
4213 NULL, 0, send_handler,
4214 NULL, NULL, 0);
4215 if (IS_ERR(agent)) {
4216 ret = PTR_ERR(agent);
4217 goto err;
4218 }
4219
4220 ibp->send_agent = agent;
4221 }
4222
4223 return 0;
4224
4225err:
4226 for (p = 0; p < dd->num_pports; p++) {
4227 ibp = &dd->pport[p].ibport_data;
4228 if (ibp->send_agent) {
4229 agent = ibp->send_agent;
4230 ibp->send_agent = NULL;
4231 ib_unregister_mad_agent(agent);
4232 }
4233 }
4234
4235 return ret;
4236}
4237
4238void hfi1_free_agents(struct hfi1_ibdev *dev)
4239{
4240 struct hfi1_devdata *dd = dd_from_dev(dev);
4241 struct ib_mad_agent *agent;
4242 struct hfi1_ibport *ibp;
4243 int p;
4244
4245 for (p = 0; p < dd->num_pports; p++) {
4246 ibp = &dd->pport[p].ibport_data;
4247 if (ibp->send_agent) {
4248 agent = ibp->send_agent;
4249 ibp->send_agent = NULL;
4250 ib_unregister_mad_agent(agent);
4251 }
4252 if (ibp->sm_ah) {
4253 ib_destroy_ah(&ibp->sm_ah->ibah);
4254 ibp->sm_ah = NULL;
4255 }
4256 }
4257}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
new file mode 100644
index 000000000000..47457501c044
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.h
@@ -0,0 +1,325 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#ifndef _HFI1_MAD_H
51#define _HFI1_MAD_H
52
53#include <rdma/ib_pma.h>
54#define USE_PI_LED_ENABLE 1 /* use led enabled bit in struct
55 * opa_port_states, if available */
56#include <rdma/opa_smi.h>
57#include <rdma/opa_port_info.h>
58#ifndef PI_LED_ENABLE_SUP
59#define PI_LED_ENABLE_SUP 0
60#endif
61#include "opa_compat.h"
62
63
64
65#define IB_VLARB_LOWPRI_0_31 1
66#define IB_VLARB_LOWPRI_32_63 2
67#define IB_VLARB_HIGHPRI_0_31 3
68#define IB_VLARB_HIGHPRI_32_63 4
69
70#define OPA_MAX_PREEMPT_CAP 32
71#define OPA_VLARB_LOW_ELEMENTS 0
72#define OPA_VLARB_HIGH_ELEMENTS 1
73#define OPA_VLARB_PREEMPT_ELEMENTS 2
74#define OPA_VLARB_PREEMPT_MATRIX 3
75
76#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00)
77
78struct ib_pma_portcounters_cong {
79 u8 reserved;
80 u8 reserved1;
81 __be16 port_check_rate;
82 __be16 symbol_error_counter;
83 u8 link_error_recovery_counter;
84 u8 link_downed_counter;
85 __be16 port_rcv_errors;
86 __be16 port_rcv_remphys_errors;
87 __be16 port_rcv_switch_relay_errors;
88 __be16 port_xmit_discards;
89 u8 port_xmit_constraint_errors;
90 u8 port_rcv_constraint_errors;
91 u8 reserved2;
92 u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
93 __be16 reserved3;
94 __be16 vl15_dropped;
95 __be64 port_xmit_data;
96 __be64 port_rcv_data;
97 __be64 port_xmit_packets;
98 __be64 port_rcv_packets;
99 __be64 port_xmit_wait;
100 __be64 port_adr_events;
101} __packed;
102
103#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
104#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
105#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
106#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
107
108#define OPA_MAX_PREEMPT_CAP 32
109#define OPA_VLARB_LOW_ELEMENTS 0
110#define OPA_VLARB_HIGH_ELEMENTS 1
111#define OPA_VLARB_PREEMPT_ELEMENTS 2
112#define OPA_VLARB_PREEMPT_MATRIX 3
113
114#define HFI1_XMIT_RATE_UNSUPPORTED 0x0
115#define HFI1_XMIT_RATE_PICO 0x7
116/* number of 4nsec cycles equaling 2secs */
117#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC
118
119#define IB_CC_SVCTYPE_RC 0x0
120#define IB_CC_SVCTYPE_UC 0x1
121#define IB_CC_SVCTYPE_RD 0x2
122#define IB_CC_SVCTYPE_UD 0x3
123
124
125/*
126 * There should be an equivalent IB #define for the following, but
127 * I cannot find it.
128 */
129#define OPA_CC_LOG_TYPE_HFI 2
130
131struct opa_hfi1_cong_log_event_internal {
132 u32 lqpn;
133 u32 rqpn;
134 u8 sl;
135 u8 svc_type;
136 u32 rlid;
137 s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
138};
139
140struct opa_hfi1_cong_log_event {
141 u8 local_qp_cn_entry[3];
142 u8 remote_qp_number_cn_entry[3];
143 u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
144 u8 reserved;
145 __be32 remote_lid_cn_entry;
146 __be32 timestamp_cn_entry;
147} __packed;
148
149#define OPA_CONG_LOG_ELEMS 96
150
151struct opa_hfi1_cong_log {
152 u8 log_type;
153 u8 congestion_flags;
154 __be16 threshold_event_counter;
155 __be32 current_time_stamp;
156 u8 threshold_cong_event_map[OPA_MAX_SLS/8];
157 struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
158} __packed;
159
160#define IB_CC_TABLE_CAP_DEFAULT 31
161
162/* Port control flags */
163#define IB_CC_CCS_PC_SL_BASED 0x01
164
165struct opa_congestion_setting_entry {
166 u8 ccti_increase;
167 u8 reserved;
168 __be16 ccti_timer;
169 u8 trigger_threshold;
170 u8 ccti_min; /* min CCTI for cc table */
171} __packed;
172
173struct opa_congestion_setting_entry_shadow {
174 u8 ccti_increase;
175 u8 reserved;
176 u16 ccti_timer;
177 u8 trigger_threshold;
178 u8 ccti_min; /* min CCTI for cc table */
179} __packed;
180
181struct opa_congestion_setting_attr {
182 __be32 control_map;
183 __be16 port_control;
184 struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
185} __packed;
186
187struct opa_congestion_setting_attr_shadow {
188 u32 control_map;
189 u16 port_control;
190 struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
191} __packed;
192
193#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
194#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
195
196/* 64 Congestion Control table entries in a single MAD */
197#define IB_CCT_ENTRIES 64
198#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
199
200struct ib_cc_table_entry {
201 __be16 entry; /* shift:2, multiplier:14 */
202};
203
204struct ib_cc_table_entry_shadow {
205 u16 entry; /* shift:2, multiplier:14 */
206};
207
208struct ib_cc_table_attr {
209 __be16 ccti_limit; /* max CCTI for cc table */
210 struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
211} __packed;
212
213struct ib_cc_table_attr_shadow {
214 u16 ccti_limit; /* max CCTI for cc table */
215 struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
216} __packed;
217
218#define CC_TABLE_SHADOW_MAX \
219 (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
220
221struct cc_table_shadow {
222 u16 ccti_limit; /* max CCTI for cc table */
223 struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
224} __packed;
225
226/*
227 * struct cc_state combines the (active) per-port congestion control
228 * table, and the (active) per-SL congestion settings. cc_state data
229 * may need to be read in code paths that we want to be fast, so it
230 * is an RCU protected structure.
231 */
232struct cc_state {
233 struct rcu_head rcu;
234 struct cc_table_shadow cct;
235 struct opa_congestion_setting_attr_shadow cong_setting;
236};
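/*
 * Minimal usage sketch, mirroring the readers in mad.c (e.g.
 * __subn_get_opa_cong_setting()); only get_cc_state() and the fields
 * above are taken from the surrounding code, the rest is illustration:
 *
 *	rcu_read_lock();
 *	cc_state = get_cc_state(ppd);
 *	if (cc_state)
 *		... read cc_state->cct / cc_state->cong_setting ...
 *	rcu_read_unlock();
 *
 * Writers (see __subn_set_opa_cc_table() in mad.c) build a new
 * cc_state, publish it with rcu_assign_pointer(ppd->cc_state, new),
 * and free the old copy with call_rcu() once readers have drained.
 */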
237
238/*
239 * OPA BufferControl MAD
240 */
241
242/* attribute modifier macros */
243#define OPA_AM_NPORT_SHIFT 24
244#define OPA_AM_NPORT_MASK 0xff
245#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
246#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \
247 OPA_AM_NPORT_MASK)
248
249#define OPA_AM_NBLK_SHIFT 24
250#define OPA_AM_NBLK_MASK 0xff
251#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
252#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \
253 OPA_AM_NBLK_MASK)
254
255#define OPA_AM_START_BLK_SHIFT 0
256#define OPA_AM_START_BLK_MASK 0xff
257#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
258 OPA_AM_START_BLK_SHIFT)
259#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \
260 OPA_AM_START_BLK_MASK)
261
262#define OPA_AM_PORTNUM_SHIFT 0
263#define OPA_AM_PORTNUM_MASK 0xff
264#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
265#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \
266 OPA_AM_PORTNUM_MASK)
267
268#define OPA_AM_ASYNC_SHIFT 12
269#define OPA_AM_ASYNC_MASK 0x1
270#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
271#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \
272 OPA_AM_ASYNC_MASK)
273
274#define OPA_AM_START_SM_CFG_SHIFT 9
275#define OPA_AM_START_SM_CFG_MASK 0x1
276#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \
277 OPA_AM_START_SM_CFG_SHIFT)
278#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
279 & OPA_AM_START_SM_CFG_MASK)
280
281#define OPA_AM_CI_ADDR_SHIFT 19
282#define OPA_AM_CI_ADDR_MASK 0xfff
283#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_AM_CI_ADDR_SHIFT)
284#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
285 OPA_AM_CI_ADDR_MASK)
286
287#define OPA_AM_CI_LEN_SHIFT 13
288#define OPA_AM_CI_LEN_MASK 0x3f
289#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_AM_CI_LEN_SHIFT)
290#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \
291 OPA_AM_CI_LEN_MASK)
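/*
 * Worked example of the decoding above (illustrative value only): for
 * am = 0x01000003, OPA_AM_NPORT(am) = (am >> 24) & 0xff = 1 and
 * OPA_AM_PORTNUM(am) = am & 0xff = 3, i.e. "one port, port number 3".
 * The remaining accessors unpack their fields the same way at their
 * own shift/mask positions.
 */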
292
293/* error info macros */
294#define OPA_EI_STATUS_SMASK 0x80
295#define OPA_EI_CODE_SMASK 0x0f
296
297struct vl_limit {
298 __be16 dedicated;
299 __be16 shared;
300};
301
302struct buffer_control {
303 __be16 reserved;
304 __be16 overall_shared_limit;
305 struct vl_limit vl[OPA_MAX_VLS];
306};
307
308struct sc2vlnt {
309 u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
310};
311
312/*
313 * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
314 * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
315 * We support 5 counters which only count the mandatory quantities.
316 */
317#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
318#define COUNTER_MASK0_9 \
319 cpu_to_be32(COUNTER_MASK(1, 0) | \
320 COUNTER_MASK(1, 1) | \
321 COUNTER_MASK(1, 2) | \
322 COUNTER_MASK(1, 3) | \
323 COUNTER_MASK(1, 4))
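/*
 * For reference, COUNTER_MASK(1, n) is 1 << ((9 - n) * 3), so the
 * expansion above sets bits 27, 24, 21, 18 and 15 and COUNTER_MASK0_9
 * evaluates to cpu_to_be32(0x09248000): capability value 1 for
 * counters 0..4 and 0 for the remaining counter slots.
 */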
324
325#endif /* _HFI1_MAD_H */
diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/staging/rdma/hfi1/mmap.c
new file mode 100644
index 000000000000..5173b1c60b3d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mmap.c
@@ -0,0 +1,192 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/module.h>
52#include <linux/slab.h>
53#include <linux/vmalloc.h>
54#include <linux/mm.h>
55#include <linux/errno.h>
56#include <asm/pgtable.h>
57
58#include "verbs.h"
59
60/**
61 * hfi1_release_mmap_info - free mmap info structure
62 * @ref: a pointer to the kref within struct hfi1_mmap_info
63 */
64void hfi1_release_mmap_info(struct kref *ref)
65{
66 struct hfi1_mmap_info *ip =
67 container_of(ref, struct hfi1_mmap_info, ref);
68 struct hfi1_ibdev *dev = to_idev(ip->context->device);
69
70 spin_lock_irq(&dev->pending_lock);
71 list_del(&ip->pending_mmaps);
72 spin_unlock_irq(&dev->pending_lock);
73
74 vfree(ip->obj);
75 kfree(ip);
76}
77
78/*
79 * open and close keep track of how many times the object (CQ, QP,
80 * or SRQ) is mapped, to avoid releasing it while it is still mapped.
81 */
82static void hfi1_vma_open(struct vm_area_struct *vma)
83{
84 struct hfi1_mmap_info *ip = vma->vm_private_data;
85
86 kref_get(&ip->ref);
87}
88
89static void hfi1_vma_close(struct vm_area_struct *vma)
90{
91 struct hfi1_mmap_info *ip = vma->vm_private_data;
92
93 kref_put(&ip->ref, hfi1_release_mmap_info);
94}
95
96static struct vm_operations_struct hfi1_vm_ops = {
97 .open = hfi1_vma_open,
98 .close = hfi1_vma_close,
99};
100
101/**
102 * hfi1_mmap - create a new mmap region
103 * @context: the IB user context of the process making the mmap() call
104 * @vma: the VMA to be initialized
105 * Return zero if the mmap is OK. Otherwise, return an errno.
106 */
107int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
108{
109 struct hfi1_ibdev *dev = to_idev(context->device);
110 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
111 unsigned long size = vma->vm_end - vma->vm_start;
112 struct hfi1_mmap_info *ip, *pp;
113 int ret = -EINVAL;
114
115 /*
116 * Search the device's list of objects waiting for a mmap call.
117 * Normally, this list is very short since a call to create a
118 * CQ, QP, or SRQ is soon followed by a call to mmap().
119 */
120 spin_lock_irq(&dev->pending_lock);
121 list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
122 pending_mmaps) {
123 /* Only the creator is allowed to mmap the object */
124 if (context != ip->context || (__u64) offset != ip->offset)
125 continue;
126 /* Don't allow a mmap larger than the object. */
127 if (size > ip->size)
128 break;
129
130 list_del_init(&ip->pending_mmaps);
131 spin_unlock_irq(&dev->pending_lock);
132
133 ret = remap_vmalloc_range(vma, ip->obj, 0);
134 if (ret)
135 goto done;
136 vma->vm_ops = &hfi1_vm_ops;
137 vma->vm_private_data = ip;
138 hfi1_vma_open(vma);
139 goto done;
140 }
141 spin_unlock_irq(&dev->pending_lock);
142done:
143 return ret;
144}
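/*
 * Rough sketch of the expected life cycle, inferred from the search
 * loop above (the creation side lives in the CQ/QP/SRQ code, which is
 * not part of this file, so the queueing step is an assumption):
 *
 *	ip = hfi1_create_mmap_info(dev, size, context, obj);
 *	// creation path presumably queues the entry for the mmap():
 *	//	list_add(&ip->pending_mmaps, &dev->pending_mmaps);
 *	// and hands ip->offset back to user space, which then calls
 *	// mmap(..., fd, ip->offset); hfi1_mmap() matches the entry on
 *	// (context, offset), removes it from the pending list and maps
 *	// ip->obj into the VMA via remap_vmalloc_range().
 */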
145
146/*
147 * Allocate information for hfi1_mmap
148 */
149struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev,
150 u32 size,
151 struct ib_ucontext *context,
152 void *obj) {
153 struct hfi1_mmap_info *ip;
154
155 ip = kmalloc(sizeof(*ip), GFP_KERNEL);
156 if (!ip)
157 goto bail;
158
159 size = PAGE_ALIGN(size);
160
161 spin_lock_irq(&dev->mmap_offset_lock);
162 if (dev->mmap_offset == 0)
163 dev->mmap_offset = PAGE_SIZE;
164 ip->offset = dev->mmap_offset;
165 dev->mmap_offset += size;
166 spin_unlock_irq(&dev->mmap_offset_lock);
167
168 INIT_LIST_HEAD(&ip->pending_mmaps);
169 ip->size = size;
170 ip->context = context;
171 ip->obj = obj;
172 kref_init(&ip->ref);
173
174bail:
175 return ip;
176}
177
178void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
179 u32 size, void *obj)
180{
181 size = PAGE_ALIGN(size);
182
183 spin_lock_irq(&dev->mmap_offset_lock);
184 if (dev->mmap_offset == 0)
185 dev->mmap_offset = PAGE_SIZE;
186 ip->offset = dev->mmap_offset;
187 dev->mmap_offset += size;
188 spin_unlock_irq(&dev->mmap_offset_lock);
189
190 ip->size = size;
191 ip->obj = obj;
192}
diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
new file mode 100644
index 000000000000..bd64e4f986f9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mr.c
@@ -0,0 +1,551 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <rdma/ib_umem.h>
52#include <rdma/ib_smi.h>
53
54#include "hfi.h"
55
56/* Fast memory region */
57struct hfi1_fmr {
58 struct ib_fmr ibfmr;
59 struct hfi1_mregion mr; /* must be last */
60};
61
62static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr)
63{
64 return container_of(ibfmr, struct hfi1_fmr, ibfmr);
65}
66
67static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd,
68 int count)
69{
70 int m, i = 0;
71 int rval = 0;
72
73 m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
74 for (; i < m; i++) {
75 mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
76 if (!mr->map[i])
77 goto bail;
78 }
79 mr->mapsz = m;
80 init_completion(&mr->comp);
81 /* count returning the ptr to user */
82 atomic_set(&mr->refcount, 1);
83 mr->pd = pd;
84 mr->max_segs = count;
85out:
86 return rval;
87bail:
88 while (i)
89 kfree(mr->map[--i]);
90 rval = -ENOMEM;
91 goto out;
92}
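/*
 * Layout note: an mregion is described by a two-level table.  Each
 * mr->map[] entry holds HFI1_SEGSZ segments, so init_mregion()
 * allocates m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ chunks.  Purely
 * as an illustration (HFI1_SEGSZ is defined in hfi.h, not here): if
 * HFI1_SEGSZ were 16, a 40-segment region would need 3 chunks.
 */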
93
94static void deinit_mregion(struct hfi1_mregion *mr)
95{
96 int i = mr->mapsz;
97
98 mr->mapsz = 0;
99 while (i)
100 kfree(mr->map[--i]);
101}
102
103
104/**
105 * hfi1_get_dma_mr - get a DMA memory region
106 * @pd: protection domain for this memory region
107 * @acc: access flags
108 *
109 * Returns the memory region on success, otherwise returns an errno.
110 * Note that all DMA addresses should be created via the
111 * struct ib_dma_mapping_ops functions (see dma.c).
112 */
113struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc)
114{
115 struct hfi1_mr *mr = NULL;
116 struct ib_mr *ret;
117 int rval;
118
119 if (to_ipd(pd)->user) {
120 ret = ERR_PTR(-EPERM);
121 goto bail;
122 }
123
124 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
125 if (!mr) {
126 ret = ERR_PTR(-ENOMEM);
127 goto bail;
128 }
129
130 rval = init_mregion(&mr->mr, pd, 0);
131 if (rval) {
132 ret = ERR_PTR(rval);
133 goto bail;
134 }
135
136
137 rval = hfi1_alloc_lkey(&mr->mr, 1);
138 if (rval) {
139 ret = ERR_PTR(rval);
140 goto bail_mregion;
141 }
142
143 mr->mr.access_flags = acc;
144 ret = &mr->ibmr;
145done:
146 return ret;
147
148bail_mregion:
149 deinit_mregion(&mr->mr);
150bail:
151 kfree(mr);
152 goto done;
153}
154
155static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
156{
157 struct hfi1_mr *mr;
158 int rval = -ENOMEM;
159 int m;
160
161 /* Allocate struct plus pointers to first level page tables. */
162 m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
163 mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
164 if (!mr)
165 goto bail;
166
167 rval = init_mregion(&mr->mr, pd, count);
168 if (rval)
169 goto bail;
170 /*
171 * ib_reg_phys_mr() will initialize mr->ibmr except for
172 * lkey and rkey.
173 */
174 rval = hfi1_alloc_lkey(&mr->mr, 0);
175 if (rval)
176 goto bail_mregion;
177 mr->ibmr.lkey = mr->mr.lkey;
178 mr->ibmr.rkey = mr->mr.lkey;
179done:
180 return mr;
181
182bail_mregion:
183 deinit_mregion(&mr->mr);
184bail:
185 kfree(mr);
186 mr = ERR_PTR(rval);
187 goto done;
188}
189
190/**
191 * hfi1_reg_phys_mr - register a physical memory region
192 * @pd: protection domain for this memory region
193 * @buffer_list: pointer to the list of physical buffers to register
194 * @num_phys_buf: the number of physical buffers to register
195 * @iova_start: the starting address passed over IB which maps to this MR
196 *
197 * Returns the memory region on success, otherwise returns an errno.
198 */
199struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
200 struct ib_phys_buf *buffer_list,
201 int num_phys_buf, int acc, u64 *iova_start)
202{
203 struct hfi1_mr *mr;
204 int n, m, i;
205 struct ib_mr *ret;
206
207 mr = alloc_mr(num_phys_buf, pd);
208 if (IS_ERR(mr)) {
209 ret = (struct ib_mr *)mr;
210 goto bail;
211 }
212
213 mr->mr.user_base = *iova_start;
214 mr->mr.iova = *iova_start;
215 mr->mr.access_flags = acc;
216
217 m = 0;
218 n = 0;
219 for (i = 0; i < num_phys_buf; i++) {
220 mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
221 mr->mr.map[m]->segs[n].length = buffer_list[i].size;
222 mr->mr.length += buffer_list[i].size;
223 n++;
224 if (n == HFI1_SEGSZ) {
225 m++;
226 n = 0;
227 }
228 }
229
230 ret = &mr->ibmr;
231
232bail:
233 return ret;
234}
235
236/**
237 * hfi1_reg_user_mr - register a userspace memory region
238 * @pd: protection domain for this memory region
239 * @start: starting userspace address
240 * @length: length of region to register
241 * @mr_access_flags: access flags for this memory region
242 * @udata: unused by the driver
243 *
244 * Returns the memory region on success, otherwise returns an errno.
245 */
246struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
247 u64 virt_addr, int mr_access_flags,
248 struct ib_udata *udata)
249{
250 struct hfi1_mr *mr;
251 struct ib_umem *umem;
252 struct scatterlist *sg;
253 int n, m, entry;
254 struct ib_mr *ret;
255
256 if (length == 0) {
257 ret = ERR_PTR(-EINVAL);
258 goto bail;
259 }
260
261 umem = ib_umem_get(pd->uobject->context, start, length,
262 mr_access_flags, 0);
263 if (IS_ERR(umem))
264 return (void *) umem;
265
266 n = umem->nmap;
267
268 mr = alloc_mr(n, pd);
269 if (IS_ERR(mr)) {
270 ret = (struct ib_mr *)mr;
271 ib_umem_release(umem);
272 goto bail;
273 }
274
275 mr->mr.user_base = start;
276 mr->mr.iova = virt_addr;
277 mr->mr.length = length;
278 mr->mr.offset = ib_umem_offset(umem);
279 mr->mr.access_flags = mr_access_flags;
280 mr->umem = umem;
281
282 if (is_power_of_2(umem->page_size))
283 mr->mr.page_shift = ilog2(umem->page_size);
284 m = 0;
285 n = 0;
286 for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
287 void *vaddr;
288
289 vaddr = page_address(sg_page(sg));
290 if (!vaddr) {
291 ret = ERR_PTR(-EINVAL);
292 goto bail;
293 }
294 mr->mr.map[m]->segs[n].vaddr = vaddr;
295 mr->mr.map[m]->segs[n].length = umem->page_size;
296 n++;
297 if (n == HFI1_SEGSZ) {
298 m++;
299 n = 0;
300 }
301 }
302 ret = &mr->ibmr;
303
304bail:
305 return ret;
306}
307
308/**
309 * hfi1_dereg_mr - unregister and free a memory region
310 * @ibmr: the memory region to free
311 *
312 * Returns 0 on success.
313 *
314 * Note that this is called to free MRs created by hfi1_get_dma_mr()
315 * or hfi1_reg_user_mr().
316 */
317int hfi1_dereg_mr(struct ib_mr *ibmr)
318{
319 struct hfi1_mr *mr = to_imr(ibmr);
320 int ret = 0;
321 unsigned long timeout;
322
323 hfi1_free_lkey(&mr->mr);
324
325 hfi1_put_mr(&mr->mr); /* will set completion if last */
326 timeout = wait_for_completion_timeout(&mr->mr.comp,
327 5 * HZ);
328 if (!timeout) {
329 dd_dev_err(
330 dd_from_ibdev(mr->mr.pd->device),
331 "hfi1_dereg_mr timeout mr %p pd %p refcount %u\n",
332 mr, mr->mr.pd, atomic_read(&mr->mr.refcount));
333 hfi1_get_mr(&mr->mr);
334 ret = -EBUSY;
335 goto out;
336 }
337 deinit_mregion(&mr->mr);
338 if (mr->umem)
339 ib_umem_release(mr->umem);
340 kfree(mr);
341out:
342 return ret;
343}
344
345/*
346 * Allocate a memory region usable with the
347 * IB_WR_FAST_REG_MR send work request.
348 *
349 * Return the memory region on success, otherwise return an errno.
350 */
351struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
352 enum ib_mr_type mr_type,
353 u32 max_num_sg)
354{
355 struct hfi1_mr *mr;
356
357 if (mr_type != IB_MR_TYPE_MEM_REG)
358 return ERR_PTR(-EINVAL);
359
360 mr = alloc_mr(max_num_sg, pd);
361 if (IS_ERR(mr))
362 return (struct ib_mr *)mr;
363
364 return &mr->ibmr;
365}
366
367struct ib_fast_reg_page_list *
368hfi1_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
369{
370 unsigned size = page_list_len * sizeof(u64);
371 struct ib_fast_reg_page_list *pl;
372
373 if (size > PAGE_SIZE)
374 return ERR_PTR(-EINVAL);
375
376 pl = kzalloc(sizeof(*pl), GFP_KERNEL);
377 if (!pl)
378 return ERR_PTR(-ENOMEM);
379
380 pl->page_list = kzalloc(size, GFP_KERNEL);
381 if (!pl->page_list)
382 goto err_free;
383
384 return pl;
385
386err_free:
387 kfree(pl);
388 return ERR_PTR(-ENOMEM);
389}
390
391void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
392{
393 kfree(pl->page_list);
394 kfree(pl);
395}
396
397/**
398 * hfi1_alloc_fmr - allocate a fast memory region
399 * @pd: the protection domain for this memory region
400 * @mr_access_flags: access flags for this memory region
401 * @fmr_attr: fast memory region attributes
402 *
403 * Returns the memory region on success, otherwise returns an errno.
404 */
405struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
406 struct ib_fmr_attr *fmr_attr)
407{
408 struct hfi1_fmr *fmr;
409 int m;
410 struct ib_fmr *ret;
411 int rval = -ENOMEM;
412
413 /* Allocate struct plus pointers to first level page tables. */
414 m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
415 fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
416 if (!fmr)
417 goto bail;
418
419 rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages);
420 if (rval)
421 goto bail;
422
423 /*
424 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
425 * rkey.
426 */
427 rval = hfi1_alloc_lkey(&fmr->mr, 0);
428 if (rval)
429 goto bail_mregion;
430 fmr->ibfmr.rkey = fmr->mr.lkey;
431 fmr->ibfmr.lkey = fmr->mr.lkey;
432 /*
433 * Resources are allocated but no valid mapping (RKEY can't be
434 * used).
435 */
436 fmr->mr.access_flags = mr_access_flags;
437 fmr->mr.max_segs = fmr_attr->max_pages;
438 fmr->mr.page_shift = fmr_attr->page_shift;
439
440 ret = &fmr->ibfmr;
441done:
442 return ret;
443
444bail_mregion:
445 deinit_mregion(&fmr->mr);
446bail:
447 kfree(fmr);
448 ret = ERR_PTR(rval);
449 goto done;
450}
451
452/**
453 * hfi1_map_phys_fmr - set up a fast memory region
454 * @ibmfr: the fast memory region to set up
455 * @page_list: the list of pages to associate with the fast memory region
456 * @list_len: the number of pages to associate with the fast memory region
457 * @iova: the virtual address of the start of the fast memory region
458 *
459 * This may be called from interrupt context.
460 */
461
462int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
463 int list_len, u64 iova)
464{
465 struct hfi1_fmr *fmr = to_ifmr(ibfmr);
466 struct hfi1_lkey_table *rkt;
467 unsigned long flags;
468 int m, n, i;
469 u32 ps;
470 int ret;
471
472 i = atomic_read(&fmr->mr.refcount);
473 if (i > 2)
474 return -EBUSY;
475
476 if (list_len > fmr->mr.max_segs) {
477 ret = -EINVAL;
478 goto bail;
479 }
480 rkt = &to_idev(ibfmr->device)->lk_table;
481 spin_lock_irqsave(&rkt->lock, flags);
482 fmr->mr.user_base = iova;
483 fmr->mr.iova = iova;
484 ps = 1 << fmr->mr.page_shift;
485 fmr->mr.length = list_len * ps;
486 m = 0;
487 n = 0;
488 for (i = 0; i < list_len; i++) {
489 fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
490 fmr->mr.map[m]->segs[n].length = ps;
491 if (++n == HFI1_SEGSZ) {
492 m++;
493 n = 0;
494 }
495 }
496 spin_unlock_irqrestore(&rkt->lock, flags);
497 ret = 0;
498
499bail:
500 return ret;
501}
502
503/**
504 * hfi1_unmap_fmr - unmap fast memory regions
505 * @fmr_list: the list of fast memory regions to unmap
506 *
507 * Returns 0 on success.
508 */
509int hfi1_unmap_fmr(struct list_head *fmr_list)
510{
511 struct hfi1_fmr *fmr;
512 struct hfi1_lkey_table *rkt;
513 unsigned long flags;
514
515 list_for_each_entry(fmr, fmr_list, ibfmr.list) {
516 rkt = &to_idev(fmr->ibfmr.device)->lk_table;
517 spin_lock_irqsave(&rkt->lock, flags);
518 fmr->mr.user_base = 0;
519 fmr->mr.iova = 0;
520 fmr->mr.length = 0;
521 spin_unlock_irqrestore(&rkt->lock, flags);
522 }
523 return 0;
524}
525
526/**
527 * hfi1_dealloc_fmr - deallocate a fast memory region
528 * @ibfmr: the fast memory region to deallocate
529 *
530 * Returns 0 on success.
531 */
532int hfi1_dealloc_fmr(struct ib_fmr *ibfmr)
533{
534 struct hfi1_fmr *fmr = to_ifmr(ibfmr);
535 int ret = 0;
536 unsigned long timeout;
537
538 hfi1_free_lkey(&fmr->mr);
539 hfi1_put_mr(&fmr->mr); /* will set completion if last */
540 timeout = wait_for_completion_timeout(&fmr->mr.comp,
541 5 * HZ);
542 if (!timeout) {
543 hfi1_get_mr(&fmr->mr);
544 ret = -EBUSY;
545 goto out;
546 }
547 deinit_mregion(&fmr->mr);
548 kfree(fmr);
549out:
550 return ret;
551}
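/*
 * Teardown pattern shared by hfi1_dereg_mr() and hfi1_dealloc_fmr():
 * free the lkey, drop the base reference taken in init_mregion(), and
 * wait up to 5 seconds for remaining users to finish (the completion
 * fires when the refcount reaches zero).  On timeout the reference is
 * re-taken and -EBUSY is returned so the region is not freed while
 * still in use.
 */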
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
new file mode 100644
index 000000000000..f64eec1c2951
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/opa_compat.h
@@ -0,0 +1,129 @@
1#ifndef _OPA_COMPAT_H
2#define _OPA_COMPAT_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53/*
54 * This header file is for OPA-specific definitions which are
55 * required by the HFI driver, and which aren't yet in the Linux
56 * IB core. We'll collect these all here, then merge them into
57 * the kernel when that's convenient.
58 */
59
60/* OPA SMA attribute IDs */
61#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b)
62#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f)
63#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090)
64#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
65
66/* OPA PMA attribute IDs */
67#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040)
68#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041)
69#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042)
70#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043)
71#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044)
72
73/* OPA status codes */
74#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100)
75
76static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
77{
78 return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
79}
80
81static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
82{
83 return ((ps->portphysstate_portstate &
84 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
85}
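/*
 * portphysstate_portstate packs both fields into a single byte: the
 * helpers above take the logical port state from the low bits (via
 * OPA_PI_MASK_PORT_STATE) and the physical state from bits 7:4.
 * Illustrative decode, assuming the logical mask covers the low
 * nibble: a value of 0x52 would mean physical state 5 (LinkUp in the
 * enum below) and logical state 2.
 */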
86
87/*
88 * OPA port physical states
89 * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
90 * values.
91 *
92 * When writing, only values 0-3 are valid, other values are ignored.
93 * When reading, 0 is reserved.
94 *
95 * Returned by the ibphys_portstate() routine.
96 */
97enum opa_port_phys_state {
98 IB_PORTPHYSSTATE_NOP = 0,
99 /* 1 is reserved */
100 IB_PORTPHYSSTATE_POLLING = 2,
101 IB_PORTPHYSSTATE_DISABLED = 3,
102 IB_PORTPHYSSTATE_TRAINING = 4,
103 IB_PORTPHYSSTATE_LINKUP = 5,
104 IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
105 IB_PORTPHYSSTATE_PHY_TEST = 7,
106 /* 8 is reserved */
107 OPA_PORTPHYSSTATE_OFFLINE = 9,
108 OPA_PORTPHYSSTATE_GANGED = 10,
109 OPA_PORTPHYSSTATE_TEST = 11,
110 OPA_PORTPHYSSTATE_MAX = 11,
111 /* values 12-15 are reserved/ignored */
112};
113
114/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */
115#define OPA_PORT_TYPE_UNKNOWN 0
116#define OPA_PORT_TYPE_DISCONNECTED 1
117/* port is not currently usable, CableInfo not available */
118#define OPA_PORT_TYPE_FIXED 2
119/* A fixed backplane port in a director class switch. All OPA ASICs */
120#define OPA_PORT_TYPE_VARIABLE 3
121/* A backplane port in a blade system, possibly mixed configuration */
122#define OPA_PORT_TYPE_STANDARD 4
123/* implies a SFF-8636 defined format for CableInfo (QSFP) */
124#define OPA_PORT_TYPE_SI_PHOTONICS 5
125/* A silicon photonics module implies TBD defined format for CableInfo
126 * as defined by Intel SFO group */
127/* 6 - 15 are reserved */
128
129#endif /* _OPA_COMPAT_H */
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
new file mode 100644
index 000000000000..ac5653c0f65e
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pcie.c
@@ -0,0 +1,1253 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/pci.h>
52#include <linux/io.h>
53#include <linux/delay.h>
54#include <linux/vmalloc.h>
55#include <linux/aer.h>
56#include <linux/module.h>
57
58#include "hfi.h"
59#include "chip_registers.h"
60
61/* link speed vector for Gen3 speed - not in Linux headers */
62#define GEN1_SPEED_VECTOR 0x1
63#define GEN2_SPEED_VECTOR 0x2
64#define GEN3_SPEED_VECTOR 0x3
65
66/*
67 * This file contains PCIe utility routines.
68 */
69
70/*
71 * Code to adjust PCIe capabilities.
72 */
73static void tune_pcie_caps(struct hfi1_devdata *);
74
75/*
76 * Do all the common PCIe setup and initialization.
77 * devdata is not yet allocated, and is not allocated until after this
78 * routine returns success. Therefore dd_dev_err() can't be used for error
79 * printing.
80 */
81int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
82{
83 int ret;
84
85 ret = pci_enable_device(pdev);
86 if (ret) {
87 /*
88 * This can happen (in theory) iff:
89 * We did a chip reset, and then failed to reprogram the
90 * BAR, or the chip reset due to an internal error. We then
91 * unloaded the driver and reloaded it.
92 *
93 * Both reset cases set the BAR back to initial state. For
94 * the latter case, the AER sticky error bit at offset 0x718
95 * should be set, but the Linux kernel doesn't yet know
96 * about that, it appears. If the original BAR was retained
97 * in the kernel data structures, this may be OK.
98 */
99 hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
100 -ret);
101 goto done;
102 }
103
104 ret = pci_request_regions(pdev, DRIVER_NAME);
105 if (ret) {
106 hfi1_early_err(&pdev->dev,
107 "pci_request_regions fails: err %d\n", -ret);
108 goto bail;
109 }
110
111 ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
112 if (ret) {
113 /*
114 * If the 64 bit setup fails, try 32 bit. Some systems
115 * do not set up 64 bit maps on systems with 2GB or less
116 * memory installed.
117 */
118 ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
119 if (ret) {
120 hfi1_early_err(&pdev->dev,
121 "Unable to set DMA mask: %d\n", ret);
122 goto bail;
123 }
124 ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
125 } else
126 ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
127 if (ret) {
128 hfi1_early_err(&pdev->dev,
129 "Unable to set DMA consistent mask: %d\n", ret);
130 goto bail;
131 }
132
133 pci_set_master(pdev);
134 ret = pci_enable_pcie_error_reporting(pdev);
135 if (ret) {
136 hfi1_early_err(&pdev->dev,
137 "Unable to enable pcie error reporting: %d\n",
138 ret);
139 ret = 0;
140 }
141 goto done;
142
143bail:
144 hfi1_pcie_cleanup(pdev);
145done:
146 return ret;
147}
148
149/*
150 * Clean what was done in hfi1_pcie_init()
151 */
152void hfi1_pcie_cleanup(struct pci_dev *pdev)
153{
154 pci_disable_device(pdev);
155 /*
156 * Release regions should be called after the disable. OK to
157 * call if request regions has not been called or failed.
158 */
159 pci_release_regions(pdev);
160}
161
162/*
163 * Do remaining PCIe setup, once dd is allocated, and save away
164 * fields required to re-initialize after a chip reset, or for
165 * various other purposes
166 */
167int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
168 const struct pci_device_id *ent)
169{
170 unsigned long len;
171 resource_size_t addr;
172
173 dd->pcidev = pdev;
174 pci_set_drvdata(pdev, dd);
175
176 addr = pci_resource_start(pdev, 0);
177 len = pci_resource_len(pdev, 0);
178
179 /*
180 * The TXE PIO buffers are at the tail end of the chip space.
181 * Cut them off and map them separately.
182 */
183
184 /* sanity check vs expectations */
185 if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
186 dd_dev_err(dd, "chip PIO range does not match\n");
187 return -EINVAL;
188 }
189
190 dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
191 if (!dd->kregbase)
192 return -ENOMEM;
193
194 dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
195 if (!dd->piobase) {
196 iounmap(dd->kregbase);
197 return -ENOMEM;
198 }
199
200 dd->flags |= HFI1_PRESENT; /* now register routines work */
201
202 dd->kregend = dd->kregbase + TXE_PIO_SEND;
203 dd->physaddr = addr; /* used for io_remap, etc. */
204
205 /*
206 * Re-map the chip's RcvArray as write-combining to allow us
207 * to write an entire cacheline worth of entries in one shot.
208 * If this re-map fails, just continue - the RcvArray programming
209 * function will handle both cases.
210 */
211 dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
212 dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
213 dd->chip_rcv_array_count * 8);
214 dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
215 /*
216 * Save BARs and command to rewrite after device reset.
217 */
218 dd->pcibar0 = addr;
219 dd->pcibar1 = addr >> 32;
220 pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
221 pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
222 pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
223 pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
224 pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
225 &dd->pcie_devctl2);
226 pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
227 pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
228 &dd->pci_lnkctl3);
229 pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
230
231 return 0;
232}
233
234/*
235 * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior
236 * to releasing the dd memory.
237 * Void because all of the core pcie cleanup functions are void.
238 */
239void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
240{
241 u64 __iomem *base = (void __iomem *) dd->kregbase;
242
243 dd->flags &= ~HFI1_PRESENT;
244 dd->kregbase = NULL;
245 iounmap(base);
246 if (dd->rcvarray_wc)
247 iounmap(dd->rcvarray_wc);
248 if (dd->piobase)
249 iounmap(dd->piobase);
250
251 pci_set_drvdata(dd->pcidev, NULL);
252}
253
254/*
255 * Do a Function Level Reset (FLR) on the device.
256 * Based on static function drivers/pci/pci.c:pcie_flr().
257 */
258void hfi1_pcie_flr(struct hfi1_devdata *dd)
259{
260 int i;
261 u16 status;
262
263 /* no need to check for the capability - we know the device has it */
264
265 /* wait for Transaction Pending bit to clear, at most a few ms */
266 for (i = 0; i < 4; i++) {
267 if (i)
268 msleep((1 << (i - 1)) * 100);
269
270 pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
271 if (!(status & PCI_EXP_DEVSTA_TRPND))
272 goto clear;
273 }
274
275 dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
276
277clear:
278 pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
279 PCI_EXP_DEVCTL_BCR_FLR);
280 /* PCIe spec requires the function to be back within 100ms */
281 msleep(100);
282}
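/*
 * Polling arithmetic for the loop above: the Transaction Pending bit
 * is checked immediately, then after sleeps of 100, 200 and 400 ms
 * (msleep((1 << (i - 1)) * 100) for i = 1..3), i.e. roughly 700 ms
 * total before giving up and issuing the FLR anyway.
 */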
283
284static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
285 struct hfi1_msix_entry *hfi1_msix_entry)
286{
287 int ret;
288 int nvec = *msixcnt;
289 struct msix_entry *msix_entry;
290 int i;
291
292 /* We can't pass the hfi1_msix_entry array to pci_enable_msix_range()
293 * directly, so use a temporary msix_entry array and copy the
294 * allocated irqs back into the hfi1_msix_entry array. */
295 msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
296 if (!msix_entry) {
297 ret = -ENOMEM;
298 goto do_intx;
299 }
300
301 for (i = 0; i < nvec; i++)
302 msix_entry[i] = hfi1_msix_entry[i].msix;
303
304 ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
305 if (ret < 0)
306 goto free_msix_entry;
307 nvec = ret;
308
309 for (i = 0; i < nvec; i++)
310 hfi1_msix_entry[i].msix = msix_entry[i];
311
312 kfree(msix_entry);
313 *msixcnt = nvec;
314 return;
315
316free_msix_entry:
317 kfree(msix_entry);
318
319do_intx:
320 dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
321 nvec, ret);
322 *msixcnt = 0;
323 hfi1_enable_intx(dd->pcidev);
324
325}
326
327/* return the PCIe link speed from the given link status */
328static u32 extract_speed(u16 linkstat)
329{
330 u32 speed;
331
332 switch (linkstat & PCI_EXP_LNKSTA_CLS) {
333 default: /* not defined, assume Gen1 */
334 case PCI_EXP_LNKSTA_CLS_2_5GB:
335 speed = 2500; /* Gen 1, 2.5GHz */
336 break;
337 case PCI_EXP_LNKSTA_CLS_5_0GB:
338 speed = 5000; /* Gen 2, 5GHz */
339 break;
340 case GEN3_SPEED_VECTOR:
341 speed = 8000; /* Gen 3, 8GHz */
342 break;
343 }
344 return speed;
345}
346
347/* return the PCIe link width from the given link status */
348static u32 extract_width(u16 linkstat)
349{
350 return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
351}
352
353/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
354static void update_lbus_info(struct hfi1_devdata *dd)
355{
356 u16 linkstat;
357
358 pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
359 dd->lbus_width = extract_width(linkstat);
360 dd->lbus_speed = extract_speed(linkstat);
361 snprintf(dd->lbus_info, sizeof(dd->lbus_info),
362 "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
363}
364
365/*
366 * Read in the current PCIe link width and speed. Find if the link is
367 * Gen3 capable.
368 */
369int pcie_speeds(struct hfi1_devdata *dd)
370{
371 u32 linkcap;
372
373 if (!pci_is_pcie(dd->pcidev)) {
374 dd_dev_err(dd, "Can't find PCI Express capability!\n");
375 return -EINVAL;
376 }
377
378 /* find if our max speed is Gen3 and parent supports Gen3 speeds */
379 dd->link_gen3_capable = 1;
380
381 pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
382 if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
383 dd_dev_info(dd,
384 "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
385 linkcap & PCI_EXP_LNKCAP_SLS);
386 dd->link_gen3_capable = 0;
387 }
388
389 /*
390 * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
391 */
392 if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
393 dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
394 dd->link_gen3_capable = 0;
395 }
396
397 /* obtain the link width and current speed */
398 update_lbus_info(dd);
399
400 /* check against expected pcie width and complain if "wrong" */
401 if (dd->lbus_width < 16)
402 dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width);
403
404 return 0;
405}
406
407/*
408 * Returns in *nent:
409 * - actual number of interrupts allocated
410 * - 0 if fell back to INTx.
411 */
412void request_msix(struct hfi1_devdata *dd, u32 *nent,
413 struct hfi1_msix_entry *entry)
414{
415 int pos;
416
417 pos = dd->pcidev->msix_cap;
418 if (*nent && pos) {
419 msix_setup(dd, pos, nent, entry);
420 /* did it, either MSI-X or INTx */
421 } else {
422 *nent = 0;
423 hfi1_enable_intx(dd->pcidev);
424 }
425
426 tune_pcie_caps(dd);
427}
428
429/*
430 * Disable MSI-X.
431 */
432void hfi1_nomsix(struct hfi1_devdata *dd)
433{
434 pci_disable_msix(dd->pcidev);
435}
436
437void hfi1_enable_intx(struct pci_dev *pdev)
438{
439 /* first, turn on INTx */
440 pci_intx(pdev, 1);
441 /* then turn off MSI-X */
442 pci_disable_msix(pdev);
443}
444
445/* restore command and BARs after a reset has wiped them out */
446void restore_pci_variables(struct hfi1_devdata *dd)
447{
448 pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
449 pci_write_config_dword(dd->pcidev,
450 PCI_BASE_ADDRESS_0, dd->pcibar0);
451 pci_write_config_dword(dd->pcidev,
452 PCI_BASE_ADDRESS_1, dd->pcibar1);
453 pci_write_config_dword(dd->pcidev,
454 PCI_ROM_ADDRESS, dd->pci_rom);
455 pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
456 pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
457 pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
458 dd->pcie_devctl2);
459 pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
460 pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
461 dd->pci_lnkctl3);
462 pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
463}
464
465
466/*
467 * BIOS may not set PCIe bus-utilization parameters for best performance.
468 * Check and optionally adjust them to maximize our throughput.
469 */
470static int hfi1_pcie_caps;
471module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
472MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
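/*
 * Parameter encoding used by tune_pcie_caps() below: bits 0..2 of
 * pcie_caps cap the max payload size code and bits 4..6 cap the max
 * read request size code, where code n corresponds to 128 << n bytes.
 * As an example (hypothetical value, not a recommendation),
 * pcie_caps=0x51 would allow a 256 byte payload (code 1) and a 4096
 * byte read request (code 5).
 */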
473
474static void tune_pcie_caps(struct hfi1_devdata *dd)
475{
476 struct pci_dev *parent;
477 u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
478 u16 rc_mrrs, ep_mrrs, max_mrrs;
479
480 /* Find out supported and configured values for parent (root) */
481 parent = dd->pcidev->bus->self;
482 if (!pci_is_root_bus(parent->bus)) {
483 dd_dev_info(dd, "Parent not root\n");
484 return;
485 }
486
487 if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
488 return;
489 rc_mpss = parent->pcie_mpss;
490 rc_mps = ffs(pcie_get_mps(parent)) - 8;
491 /* Find out supported and configured values for endpoint (us) */
492 ep_mpss = dd->pcidev->pcie_mpss;
493 ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
494
495 /* Find max payload supported by root, endpoint */
496 if (rc_mpss > ep_mpss)
497 rc_mpss = ep_mpss;
498
499 /* If the supported value is greater than the module param limit, limit it */
500 if (rc_mpss > (hfi1_pcie_caps & 7))
501 rc_mpss = hfi1_pcie_caps & 7;
502 /* If less than (allowed, supported), bump root payload */
503 if (rc_mpss > rc_mps) {
504 rc_mps = rc_mpss;
505 pcie_set_mps(parent, 128 << rc_mps);
506 }
507 /* If less than (allowed, supported), bump endpoint payload */
508 if (rc_mpss > ep_mps) {
509 ep_mps = rc_mpss;
510 pcie_set_mps(dd->pcidev, 128 << ep_mps);
511 }
512
513 /*
514 * Now the Read Request size.
515 * No field for max supported, but PCIe spec limits it to 4096,
516 * which is code '5' (log2(4096) - 7)
517 */
518 max_mrrs = 5;
519 if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
520 max_mrrs = (hfi1_pcie_caps >> 4) & 7;
521
522 max_mrrs = 128 << max_mrrs;
523 rc_mrrs = pcie_get_readrq(parent);
524 ep_mrrs = pcie_get_readrq(dd->pcidev);
525
526 if (max_mrrs > rc_mrrs) {
527 rc_mrrs = max_mrrs;
528 pcie_set_readrq(parent, rc_mrrs);
529 }
530 if (max_mrrs > ep_mrrs) {
531 ep_mrrs = max_mrrs;
532 pcie_set_readrq(dd->pcidev, ep_mrrs);
533 }
534}
535/* End of PCIe capability tuning */
536
537/*
538 * From here through hfi1_pci_err_handler definition is invoked via
539 * PCI error infrastructure, registered via pci
540 */
541static pci_ers_result_t
542pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
543{
544 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
545 pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
546
547 switch (state) {
548 case pci_channel_io_normal:
549 dd_dev_info(dd, "State Normal, ignoring\n");
550 break;
551
552 case pci_channel_io_frozen:
553 dd_dev_info(dd, "State Frozen, requesting reset\n");
554 pci_disable_device(pdev);
555 ret = PCI_ERS_RESULT_NEED_RESET;
556 break;
557
558 case pci_channel_io_perm_failure:
559 if (dd) {
560 dd_dev_info(dd, "State Permanent Failure, disabling\n");
561 /* no more register accesses! */
562 dd->flags &= ~HFI1_PRESENT;
563 hfi1_disable_after_error(dd);
564 }
565 /* else early, or other problem */
566 ret = PCI_ERS_RESULT_DISCONNECT;
567 break;
568
569 default: /* shouldn't happen */
570 dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
571 state);
572 break;
573 }
574 return ret;
575}
576
577static pci_ers_result_t
578pci_mmio_enabled(struct pci_dev *pdev)
579{
580 u64 words = 0U;
581 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
582 pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
583
584 if (dd && dd->pport) {
585 words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
586 if (words == ~0ULL)
587 ret = PCI_ERS_RESULT_NEED_RESET;
588 dd_dev_info(dd,
589 "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
590 words, ret);
591 }
592 return ret;
593}
594
595static pci_ers_result_t
596pci_slot_reset(struct pci_dev *pdev)
597{
598 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
599
600 dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
601 return PCI_ERS_RESULT_CAN_RECOVER;
602}
603
604static pci_ers_result_t
605pci_link_reset(struct pci_dev *pdev)
606{
607 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
608
609 dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
610 return PCI_ERS_RESULT_CAN_RECOVER;
611}
612
613static void
614pci_resume(struct pci_dev *pdev)
615{
616 struct hfi1_devdata *dd = pci_get_drvdata(pdev);
617
618 dd_dev_info(dd, "HFI1 resume function called\n");
619 pci_cleanup_aer_uncorrect_error_status(pdev);
620 /*
621 * Running jobs will fail, since it's asynchronous
622 * unlike sysfs-requested reset. Better than
623 * doing nothing.
624 */
625 hfi1_init(dd, 1); /* same as re-init after reset */
626}
627
628const struct pci_error_handlers hfi1_pci_err_handler = {
629 .error_detected = pci_error_detected,
630 .mmio_enabled = pci_mmio_enabled,
631 .link_reset = pci_link_reset,
632 .slot_reset = pci_slot_reset,
633 .resume = pci_resume,
634};
635
636/*============================================================================*/
637/* PCIe Gen3 support */
638
639/*
640 * This code is separated out because it is expected to be removed in the
641 * final shipping product. If not, then it will be revisited and items
642 * will be moved to more standard locations.
643 */
644
645/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
646#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */
647#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */
648#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */
649
650/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
651#define DL_ERR_NONE 0x0 /* no error */
652#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */
653 /* or response data */
654#define DL_ERR_DISABLED 0x2 /* hfi disabled */
655#define DL_ERR_SECURITY 0x3 /* security check failed */
656#define DL_ERR_SBUS 0x4 /* SBus status error */
657#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer */
658
659/* gasket block secondary bus reset delay */
660#define SBR_DELAY_US 200000 /* 200ms */
661
662/* mask for PCIe capability register lnkctl2 target link speed */
663#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
664
665static uint pcie_target = 3;
666module_param(pcie_target, uint, S_IRUGO);
667MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
668
669static uint pcie_force;
670module_param(pcie_force, uint, S_IRUGO);
671MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
672
673static uint pcie_retry = 5;
674module_param(pcie_retry, uint, S_IRUGO);
675MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
676
677#define UNSET_PSET 255
678#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */
679#define DEFAULT_MCP_PSET 4 /* MCP HFI */
680static uint pcie_pset = UNSET_PSET;
681module_param(pcie_pset, uint, S_IRUGO);
682MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
683
684/* equalization columns */
685#define PREC 0
686#define ATTN 1
687#define POST 2
688
689/* discrete silicon preliminary equalization values */
690static const u8 discrete_preliminary_eq[11][3] = {
691 /* prec attn post */
692 { 0x00, 0x00, 0x12 }, /* p0 */
693 { 0x00, 0x00, 0x0c }, /* p1 */
694 { 0x00, 0x00, 0x0f }, /* p2 */
695 { 0x00, 0x00, 0x09 }, /* p3 */
696 { 0x00, 0x00, 0x00 }, /* p4 */
697 { 0x06, 0x00, 0x00 }, /* p5 */
698 { 0x09, 0x00, 0x00 }, /* p6 */
699 { 0x06, 0x00, 0x0f }, /* p7 */
700 { 0x09, 0x00, 0x09 }, /* p8 */
701 { 0x0c, 0x00, 0x00 }, /* p9 */
702 { 0x00, 0x00, 0x18 }, /* p10 */
703};
704
705/* integrated silicon preliminary equalization values */
706static const u8 integrated_preliminary_eq[11][3] = {
707 /* prec attn post */
708 { 0x00, 0x1e, 0x07 }, /* p0 */
709 { 0x00, 0x1e, 0x05 }, /* p1 */
710 { 0x00, 0x1e, 0x06 }, /* p2 */
711 { 0x00, 0x1e, 0x04 }, /* p3 */
712 { 0x00, 0x1e, 0x00 }, /* p4 */
713 { 0x03, 0x1e, 0x00 }, /* p5 */
714 { 0x04, 0x1e, 0x00 }, /* p6 */
715 { 0x03, 0x1e, 0x06 }, /* p7 */
716 { 0x03, 0x1e, 0x04 }, /* p8 */
717 { 0x05, 0x1e, 0x00 }, /* p9 */
718 { 0x00, 0x1e, 0x0a }, /* p10 */
719};
720
721/* helper to format the value to write to hardware */
722#define eq_value(pre, curr, post) \
723 ((((u32)(pre)) << \
724 PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
725 | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
726 | (((u32)(post)) << \
727 PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
728
729/*
730 * Load the given EQ preset table into the PCIe hardware.
731 */
732static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
733 u8 div)
734{
735 struct pci_dev *pdev = dd->pcidev;
736 u32 hit_error = 0;
737 u32 violation;
738 u32 i;
739 u8 c_minus1, c0, c_plus1;
740
741 for (i = 0; i < 11; i++) {
742 /* set index */
743 pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
744 /* write the value */
745 c_minus1 = eq[i][PREC] / div;
746 c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
747 c_plus1 = eq[i][POST] / div;
748 pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
749 eq_value(c_minus1, c0, c_plus1));
750 /* check if these coefficients violate EQ rules */
751 pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
752 &violation);
753 if (violation
754 & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
755 if (hit_error == 0) {
756 dd_dev_err(dd,
757 "Gen3 EQ Table Coefficient rule violations\n");
758 dd_dev_err(dd, " prec attn post\n");
759 }
760 dd_dev_err(dd, " p%02d: %02x %02x %02x\n",
761 i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]);
762 dd_dev_err(dd, " %02x %02x %02x\n",
763 (u32)c_minus1, (u32)c0, (u32)c_plus1);
764 hit_error = 1;
765 }
766 }
767 if (hit_error)
768 return -EINVAL;
769 return 0;
770}
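/*
 * Coefficient arithmetic above, worked through with values that
 * appear later in this file (discrete silicon: fs = 24, div = 3):
 * preset p0 = { 0x00, 0x00, 0x12 } gives c_minus1 = 0, c_plus1 =
 * 0x12 / 3 = 6 and c0 = 24 - 0 - 6 = 18, i.e. the cursor takes
 * whatever full swing remains after the pre- and post-cursor taps.
 */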
771
772/*
773 * Steps to be done after the PCIe firmware is downloaded and
774 * before the SBR for the PCIe Gen3 transition.
775 * The hardware mutex is already being held.
776 */
777static void pcie_post_steps(struct hfi1_devdata *dd)
778{
779 int i;
780
781 set_sbus_fast_mode(dd);
782 /*
783 * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
784 * This avoids a spurious framing error that can otherwise be
785 * generated by the MAC layer.
786 *
787 * Use individual addresses since no broadcast is set up.
788 */
789 for (i = 0; i < NUM_PCIE_SERDES; i++) {
790 sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
791 0x03, WRITE_SBUS_RECEIVER, 0x00022132);
792 }
793
794 clear_sbus_fast_mode(dd);
795}
796
797/*
798 * Trigger a secondary bus reset (SBR) on ourselves using our parent.
799 *
800 * Based on pci_parent_bus_reset() which is not exported by the
801 * kernel core.
802 */
803static int trigger_sbr(struct hfi1_devdata *dd)
804{
805 struct pci_dev *dev = dd->pcidev;
806 struct pci_dev *pdev;
807
808 /* need a parent */
809 if (!dev->bus->self) {
810 dd_dev_err(dd, "%s: no parent device\n", __func__);
811 return -ENOTTY;
812 }
813
814 /* should not be anyone else on the bus */
815 list_for_each_entry(pdev, &dev->bus->devices, bus_list)
816 if (pdev != dev) {
817 dd_dev_err(dd,
818 "%s: another device is on the same bus\n",
819 __func__);
820 return -ENOTTY;
821 }
822
823 /*
824 * A secondary bus reset (SBR) issues a hot reset to our device.
825 * The following routine does a 1s wait after the reset is dropped
826 * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 -
827 * Conventional Reset, paragraph 3, line 35 also says that a 1s
828 * delay after a reset is required. Per spec requirements,
829 * the link is either working or not after that point.
830 */
831 pci_reset_bridge_secondary_bus(dev->bus->self);
832
833 return 0;
834}
835
836/*
837 * Write the given gasket interrupt register.
838 */
839static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
840 u16 code, u16 data)
841{
842 write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
843 (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT)
844 |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
845}
846
847/*
848 * Tell the gasket logic how to react to the reset.
849 */
850static void arm_gasket_logic(struct hfi1_devdata *dd)
851{
852 u64 reg;
853
854 reg = (((u64)1 << dd->hfi1_id)
855 << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT)
856 | ((u64)pcie_serdes_broadcast[dd->hfi1_id]
857 << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT
858 | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK
859 | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK)
860 << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT
861 );
862 write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
863 /* read back to push the write */
864 read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
865}
866
867/*
868 * Do all the steps needed to transition the PCIe link to Gen3 speed.
869 */
870int do_pcie_gen3_transition(struct hfi1_devdata *dd)
871{
872 struct pci_dev *parent;
873 u64 fw_ctrl;
874 u64 reg, therm;
875 u32 reg32, fs, lf;
876 u32 status, err;
877 int ret;
878 int do_retry, retry_count = 0;
879 uint default_pset;
880 u16 target_vector, target_speed;
881 u16 lnkctl, lnkctl2, vendor;
882 u8 nsbr = 1;
883 u8 div;
884 const u8 (*eq)[3];
885 int return_error = 0;
886
887 /* PCIe Gen3 is for the ASIC only */
888 if (dd->icode != ICODE_RTL_SILICON)
889 return 0;
890
891 if (pcie_target == 1) { /* target Gen1 */
892 target_vector = GEN1_SPEED_VECTOR;
893 target_speed = 2500;
894 } else if (pcie_target == 2) { /* target Gen2 */
895 target_vector = GEN2_SPEED_VECTOR;
896 target_speed = 5000;
897 } else if (pcie_target == 3) { /* target Gen3 */
898 target_vector = GEN3_SPEED_VECTOR;
899 target_speed = 8000;
900 } else {
901 /* off or invalid target - skip */
902 dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
903 return 0;
904 }
905
906 /* if already at target speed, done (unless forced) */
907 if (dd->lbus_speed == target_speed) {
908 dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
909 pcie_target,
910 pcie_force ? "re-doing anyway" : "skipping");
911 if (!pcie_force)
912 return 0;
913 }
914
915 /*
916 * A0 needs an additional SBR
917 */
918 if (is_a0(dd))
919 nsbr++;
920
921 /*
922 * Do the Gen3 transition. Steps are those of the PCIe Gen3
923 * recipe.
924 */
925
926 /* step 1: pcie link working in gen1/gen2 */
927
928 /* step 2: if either side is not capable of Gen3, done */
929 if (pcie_target == 3 && !dd->link_gen3_capable) {
930 dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
931 ret = -ENOSYS;
932 goto done_no_mutex;
933 }
934
935 /* hold the HW mutex across the firmware download and SBR */
936 ret = acquire_hw_mutex(dd);
937 if (ret)
938 return ret;
939
940 /* make sure thermal polling is not causing interrupts */
941 therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
942 if (therm) {
943 write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
944 msleep(100);
945 dd_dev_info(dd, "%s: Disabled therm polling\n",
946 __func__);
947 }
948
949 /* step 3: download SBus Master firmware */
950 /* step 4: download PCIe Gen3 SerDes firmware */
951retry:
952 dd_dev_info(dd, "%s: downloading firmware\n", __func__);
953 ret = load_pcie_firmware(dd);
954 if (ret)
955 goto done;
956
957 /* step 5: set up device parameter settings */
958 dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
959
960 /*
961 * PcieCfgSpcie1 - Link Control 3
962 * Leave at reset value. No need to set PerfEq - link equalization
963 * will be performed automatically after the SBR when the target
964 * speed is 8GT/s.
965 */
966
967 /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
968 pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
969
970 /* step 5a: Set Synopsys Port Logic registers */
971
972 /*
973 * PcieCfgRegPl2 - Port Force Link
974 *
975 * Set the low power field to 0x10 to avoid unnecessary power
976 * management messages. All other fields are zero.
977 */
978 reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
979 pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
980
981 /*
982 * PcieCfgRegPl100 - Gen3 Control
983 *
984 * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
985 * turn on PcieCfgRegPl100.EqEieosCnt (erratum)
986 * Everything else zero.
987 */
988 reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
989 pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
990
991 /*
992 * PcieCfgRegPl101 - Gen3 EQ FS and LF
993 * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
994 * PcieCfgRegPl103 - Gen3 EQ Preset Index
995 * PcieCfgRegPl105 - Gen3 EQ Status
996 *
997 * Give initial EQ settings.
998 */
999 if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
1000 /* 1000mV, FS=24, LF = 8 */
1001 fs = 24;
1002 lf = 8;
1003 div = 3;
1004 eq = discrete_preliminary_eq;
1005 default_pset = DEFAULT_DISCRETE_PSET;
1006 } else {
1007 /* 400mV, FS=29, LF = 9 */
1008 fs = 29;
1009 lf = 9;
1010 div = 1;
1011 eq = integrated_preliminary_eq;
1012 default_pset = DEFAULT_MCP_PSET;
1013 }
1014 pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
1015 (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT)
1016 | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
1017 ret = load_eq_table(dd, eq, fs, div);
1018 if (ret)
1019 goto done;
1020
1021 /*
1022 * PcieCfgRegPl106 - Gen3 EQ Control
1023 *
1024 * Set Gen3EqPsetReqVec, leave other fields 0.
1025 */
1026 if (pcie_pset == UNSET_PSET)
1027 pcie_pset = default_pset;
1028 if (pcie_pset > 10) { /* valid range is 0-10, inclusive */
1029 dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
1030 __func__, pcie_pset, default_pset);
1031 pcie_pset = default_pset;
1032 }
1033 dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
1034 pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
1035 ((1 << pcie_pset)
1036 << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT)
1037 | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK
1038 | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
1039
1040 /*
1041 * step 5b: Do post firmware download steps via SBus
1042 */
1043 dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
1044 pcie_post_steps(dd);
1045
1046 /*
1047 * step 5c: Program gasket interrupts
1048 */
1049 /* set the Rx Bit Rate to REFCLK ratio */
1050 write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
1051 /* disable pCal for PCIe Gen3 RX equalization */
1052 write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
1053 /*
1054 * Enable iCal for PCIe Gen3 RX equalization, and set which
1055 * evaluation of RX_EQ_EVAL will launch the iCal procedure.
1056 */
1057 write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
1058 /* terminate list */
1059 write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
1060
1061 /*
1062 * step 5d: program XMT margin
1063 * Right now, leave the default alone. To change, do a
1064 * read-modify-write of:
1065 * CcePcieCtrl.XmtMargin
1066 * CcePcieCtrl.XmitMarginOverwriteEnable
1067 */
1068
1069 /* step 5e: disable active state power management (ASPM) */
1070 dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
1071 pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl);
1072 lnkctl &= ~PCI_EXP_LNKCTL_ASPMC;
1073 pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl);
1074
1075 /*
1076 * step 5f: clear DirectSpeedChange
1077 * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
1078 * change in the speed target from starting before we are ready.
1079 * This field defaults to 0 and we are not changing it, so nothing
1080 * needs to be done.
1081 */
1082
1083 /* step 5g: Set target link speed */
1084 /*
1085 * Set target link speed to be target on both device and parent.
1086 * On setting the parent: Some system BIOSs "helpfully" set the
1087 * parent target speed to Gen2 to match the ASIC's initial speed.
1088 * We can set the target to Gen3 because we have already checked
1089 * earlier that the link is Gen3 capable.
1090 */
1091 dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
1092 parent = dd->pcidev->bus->self;
1093 pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
1094 dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
1095 (u32)lnkctl2);
1096 /* only write to parent if target is not as high as ours */
1097 if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
1098 lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
1099 lnkctl2 |= target_vector;
1100 dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
1101 (u32)lnkctl2);
1102 pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
1103 } else {
1104 dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
1105 }
1106
1107 dd_dev_info(dd, "%s: setting target link speed\n", __func__);
1108 pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
1109 dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
1110 (u32)lnkctl2);
1111 lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
1112 lnkctl2 |= target_vector;
1113 dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
1114 (u32)lnkctl2);
1115 pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
1116
1117 /* step 5h: arm gasket logic */
1118 /* hold DC in reset across the SBR */
1119 write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
1120 (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
1121 /* save firmware control across the SBR */
1122 fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
1123
1124 dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
1125 arm_gasket_logic(dd);
1126
1127 /*
1128 * step 6: quiesce PCIe link
1129 * The chip has already been reset, so there will be no traffic
1130 * from the chip. Linux has no easy way to enforce that it will
1131 * not try to access the device, so we just need to hope it doesn't
1132 * do it while we are doing the reset.
1133 */
1134
1135 /*
1136 * step 7: initiate the secondary bus reset (SBR)
1137 * step 8: hardware brings the links back up
1138 * step 9: wait for link speed transition to be complete
1139 */
1140 dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
1141 ret = trigger_sbr(dd);
1142 if (ret)
1143 goto done;
1144
1145 /* step 10: decide what to do next */
1146
1147 /* check if we can read PCI space */
1148 ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
1149 if (ret) {
1150 dd_dev_info(dd,
1151 "%s: read of VendorID failed after SBR, err %d\n",
1152 __func__, ret);
1153 return_error = 1;
1154 goto done;
1155 }
1156 if (vendor == 0xffff) {
1157 dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
1158 return_error = 1;
1159 ret = -EIO;
1160 goto done;
1161 }
1162
1163 /* restore PCI space registers we know were reset */
1164 dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
1165 restore_pci_variables(dd);
1166 /* restore firmware control */
1167 write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
1168
1169 /*
1170 * Check the gasket block status.
1171 *
1172 * This is the first CSR read after the SBR. If the read returns
1173 * all 1s (fails), the link did not make it back.
1174 *
1175 * Once we're sure we can read and write, clear the DC reset after
1176 * the SBR. Then check for any per-lane errors. Then look over
1177 * the status.
1178 */
1179 reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
1180 dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
1181 if (reg == ~0ull) { /* PCIe read failed/timeout */
1182 dd_dev_err(dd, "SBR failed - unable to read from device\n");
1183 return_error = 1;
1184 ret = -ENOSYS;
1185 goto done;
1186 }
1187
1188 /* clear the DC reset */
1189 write_csr(dd, CCE_DC_CTRL, 0);
1190 /* Set the LED off */
1191 if (is_a0(dd))
1192 setextled(dd, 0);
1193
1194 /* check for any per-lane errors */
1195 pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
1196 dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
1197
1198 /* extract status, look for our HFI */
1199 status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
1200 & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
1201 if ((status & (1 << dd->hfi1_id)) == 0) {
1202 dd_dev_err(dd,
1203 "%s: gasket status 0x%x, expecting 0x%x\n",
1204 __func__, status, 1 << dd->hfi1_id);
1205 ret = -EIO;
1206 goto done;
1207 }
1208
1209 /* extract error */
1210 err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
1211 & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
1212 if (err) {
1213 dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
1214 ret = -EIO;
1215 goto done;
1216 }
1217
1218 /* update our link information cache */
1219 update_lbus_info(dd);
1220 dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
1221 dd->lbus_info);
1222
1223 if (dd->lbus_speed != target_speed) { /* not target */
1224 /* maybe retry */
1225 do_retry = retry_count < pcie_retry;
1226 dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
1227 pcie_target, do_retry ? ", retrying" : "");
1228 retry_count++;
1229 if (do_retry) {
1230 msleep(100); /* allow time to settle */
1231 goto retry;
1232 }
1233 ret = -EIO;
1234 }
1235
1236done:
1237 if (therm) {
1238 write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
1239 msleep(100);
1240 dd_dev_info(dd, "%s: Re-enable therm polling\n",
1241 __func__);
1242 }
1243 release_hw_mutex(dd);
1244done_no_mutex:
1245 /* return no error if it is OK to be at current speed */
1246 if (ret && !return_error) {
1247 dd_dev_err(dd, "Proceeding at current PCIe speed\n");
1248 ret = 0;
1249 }
1250
1251 dd_dev_info(dd, "%s: done\n", __func__);
1252 return ret;
1253}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
new file mode 100644
index 000000000000..9991814a8f05
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.c
@@ -0,0 +1,1771 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/delay.h>
52#include "hfi.h"
53#include "qp.h"
54#include "trace.h"
55
56#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
57
58#define SC(name) SEND_CTXT_##name
59/*
60 * Send Context functions
61 */
62static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
63
64/*
65 * Set the CM reset bit and wait for it to clear. Use the provided
66 * sendctrl register. This routine has no locking.
67 */
68void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
69{
70 write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
71 while (1) {
72 udelay(1);
73 sendctrl = read_csr(dd, SEND_CTRL);
74 if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
75 break;
76 }
77}
78
79/* defined in header release 48 and higher */
80#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
81#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
82#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
83#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
84 << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
85#endif
86
87/* global control of PIO send */
88void pio_send_control(struct hfi1_devdata *dd, int op)
89{
90 u64 reg, mask;
91 unsigned long flags;
92 int write = 1; /* write sendctrl back */
93 int flush = 0; /* re-read sendctrl to make sure it is flushed */
94
95 spin_lock_irqsave(&dd->sendctrl_lock, flags);
96
97 reg = read_csr(dd, SEND_CTRL);
98 switch (op) {
99 case PSC_GLOBAL_ENABLE:
100 reg |= SEND_CTRL_SEND_ENABLE_SMASK;
101 /* Fall through */
102 case PSC_DATA_VL_ENABLE:
103 /* Disallow sending on VLs not enabled */
104 mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
105 SEND_CTRL_UNSUPPORTED_VL_SHIFT;
106 reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
107 break;
108 case PSC_GLOBAL_DISABLE:
109 reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
110 break;
111 case PSC_GLOBAL_VLARB_ENABLE:
112 reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
113 break;
114 case PSC_GLOBAL_VLARB_DISABLE:
115 reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
116 break;
117 case PSC_CM_RESET:
118 __cm_reset(dd, reg);
119 write = 0; /* CSR already written (and flushed) */
120 break;
121 case PSC_DATA_VL_DISABLE:
122 reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
123 flush = 1;
124 break;
125 default:
126 dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
127 break;
128 }
129
130 if (write) {
131 write_csr(dd, SEND_CTRL, reg);
132 if (flush)
133 (void) read_csr(dd, SEND_CTRL); /* flush write */
134 }
135
136 spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
137}
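
/*
 * Illustrative sketch, added for exposition only (not from the original
 * patch): how a hypothetical caller might drive pio_send_control(). The
 * PSC_* op codes and pio_send_control() are the driver's own; only the
 * example function below is made up.
 */
static void __maybe_unused example_pio_global_toggle(struct hfi1_devdata *dd,
						     int enable)
{
	if (enable) {
		/* also re-applies the unsupported-VL mask (fall-through) */
		pio_send_control(dd, PSC_GLOBAL_ENABLE);
		pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
	} else {
		pio_send_control(dd, PSC_GLOBAL_VLARB_DISABLE);
		pio_send_control(dd, PSC_GLOBAL_DISABLE);
	}
}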
138
139/* number of send context memory pools */
140#define NUM_SC_POOLS 2
141
142/* Send Context Size (SCS) wildcards */
143#define SCS_POOL_0 -1
144#define SCS_POOL_1 -2
145/* Send Context Count (SCC) wildcards */
146#define SCC_PER_VL -1
147#define SCC_PER_CPU -2
148
149#define SCC_PER_KRCVQ -3
150#define SCC_ACK_CREDITS 32
151
152#define PIO_WAIT_BATCH_SIZE 5
153
154/* default send context sizes */
155static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
156 [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */
157 .count = SCC_PER_VL }, /* one per VL */
158 [SC_ACK] = { .size = SCC_ACK_CREDITS,
159 .count = SCC_PER_KRCVQ },
160 [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */
161 .count = SCC_PER_CPU }, /* one per CPU */
162
163};
164
165/* send context memory pool configuration */
166struct mem_pool_config {
167 int centipercent; /* % of memory, in 100ths of 1% */
168 int absolute_blocks; /* absolute block count */
169};
170
171/* default memory pool configuration: 100% in pool 0 */
172static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
173 /* centi%, abs blocks */
174 { 10000, -1 }, /* pool 0 */
175 { 0, -1 }, /* pool 1 */
176};
177
178/* memory pool information, used when calculating final sizes */
179struct mem_pool_info {
180 int centipercent; /* 100th of 1% of memory to use, -1 if blocks
181 already set */
182 int count; /* count of contexts in the pool */
183 int blocks; /* block size of the pool */
184 int size; /* context size, in blocks */
185};
186
187/*
188 * Convert a pool wildcard to a valid pool index. The wildcards
189 * start at -1 and decrease from there. Map them as:
190 * -1 => 0
191 * -2 => 1
192 * etc.
193 *
194 * Return -1 on non-wildcard input, otherwise convert to a pool number.
195 */
196static int wildcard_to_pool(int wc)
197{
198 if (wc >= 0)
199 return -1; /* non-wildcard */
200 return -wc - 1;
201}
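
/*
 * Illustrative examples (exposition only, not from the original patch):
 *	wildcard_to_pool(SCS_POOL_0) == wildcard_to_pool(-1) == 0
 *	wildcard_to_pool(SCS_POOL_1) == wildcard_to_pool(-2) == 1
 *	wildcard_to_pool(64)         == -1  (a fixed size, not a wildcard)
 */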
202
203static const char *sc_type_names[SC_MAX] = {
204 "kernel",
205 "ack",
206 "user"
207};
208
209static const char *sc_type_name(int index)
210{
211 if (index < 0 || index >= SC_MAX)
212 return "unknown";
213 return sc_type_names[index];
214}
215
216/*
217 * Read the send context memory pool configuration and send context
218 * size configuration. Replace any wildcards and come up with final
219 * counts and sizes for the send context types.
220 */
221int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
222{
223 struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
224 int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
225 int total_contexts = 0;
226 int fixed_blocks;
227 int pool_blocks;
228 int used_blocks;
229 int cp_total; /* centipercent total */
230 int ab_total; /* absolute block total */
231 int extra;
232 int i;
233
234 /*
235 * Step 0:
236 * - copy the centipercents/absolute sizes from the pool config
237 * - sanity check these values
238 * - add up centipercents, then later check for full value
239 * - add up absolute blocks, then later check for over-commit
240 */
241 cp_total = 0;
242 ab_total = 0;
243 for (i = 0; i < NUM_SC_POOLS; i++) {
244 int cp = sc_mem_pool_config[i].centipercent;
245 int ab = sc_mem_pool_config[i].absolute_blocks;
246
247 /*
248 * A negative value is "unused" or "invalid". Both *can*
249 * be valid, but centipercent wins, so check that first
250 */
251 if (cp >= 0) { /* centipercent valid */
252 cp_total += cp;
253 } else if (ab >= 0) { /* absolute blocks valid */
254 ab_total += ab;
255 } else { /* neither valid */
256 dd_dev_err(
257 dd,
258 "Send context memory pool %d: both the block count and centipercent are invalid\n",
259 i);
260 return -EINVAL;
261 }
262
263 mem_pool_info[i].centipercent = cp;
264 mem_pool_info[i].blocks = ab;
265 }
266
267 /* do not use both % and absolute blocks for different pools */
268 if (cp_total != 0 && ab_total != 0) {
269 dd_dev_err(
270 dd,
271 "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
272 return -EINVAL;
273 }
274
275 /* if any percentages are present, they must add up to 100% x 100 */
276 if (cp_total != 0 && cp_total != 10000) {
277 dd_dev_err(
278 dd,
279 "Send context memory pool centipercent is %d, expecting 10000\n",
280 cp_total);
281 return -EINVAL;
282 }
283
284 /* the absolute pool total cannot be more than the mem total */
285 if (ab_total > total_blocks) {
286 dd_dev_err(
287 dd,
288 "Send context memory pool absolute block count %d is larger than the memory size %d\n",
289 ab_total, total_blocks);
290 return -EINVAL;
291 }
292
293 /*
294 * Step 2:
295 * - copy from the context size config
296 * - replace context type wildcard counts with real values
297 * - add up non-memory pool block sizes
298 * - add up memory pool user counts
299 */
300 fixed_blocks = 0;
301 for (i = 0; i < SC_MAX; i++) {
302 int count = sc_config_sizes[i].count;
303 int size = sc_config_sizes[i].size;
304 int pool;
305
306 /*
307 * Sanity check count: Either a positive value or
308 * one of the expected wildcards is valid. The positive
309 * value is checked later when we compare against total
310 * memory available.
311 */
312 if (i == SC_ACK) {
313 count = dd->n_krcv_queues;
314 } else if (i == SC_KERNEL) {
315 count = num_vls + 1 /* VL15 */;
316 } else if (count == SCC_PER_CPU) {
317 count = dd->num_rcv_contexts - dd->n_krcv_queues;
318 } else if (count < 0) {
319 dd_dev_err(
320 dd,
321 "%s send context invalid count wildcard %d\n",
322 sc_type_name(i), count);
323 return -EINVAL;
324 }
325 if (total_contexts + count > dd->chip_send_contexts)
326 count = dd->chip_send_contexts - total_contexts;
327
328 total_contexts += count;
329
330 /*
331 * Sanity check pool: the conversion returns a pool number,
332 * or -1 for a fixed (non-negative) value. The fixed
333 * value is checked later when we compare against
334 * total memory available.
335 */
336 pool = wildcard_to_pool(size);
337 if (pool == -1) { /* non-wildcard */
338 fixed_blocks += size * count;
339 } else if (pool < NUM_SC_POOLS) { /* valid wildcard */
340 mem_pool_info[pool].count += count;
341 } else { /* invalid wildcard */
342 dd_dev_err(
343 dd,
344 "%s send context invalid pool wildcard %d\n",
345 sc_type_name(i), size);
346 return -EINVAL;
347 }
348
349 dd->sc_sizes[i].count = count;
350 dd->sc_sizes[i].size = size;
351 }
352 if (fixed_blocks > total_blocks) {
353 dd_dev_err(
354 dd,
355 "Send context fixed block count, %u, larger than total block count %u\n",
356 fixed_blocks, total_blocks);
357 return -EINVAL;
358 }
359
360 /* step 3: calculate the blocks in the pools, and pool context sizes */
361 pool_blocks = total_blocks - fixed_blocks;
362 if (ab_total > pool_blocks) {
363 dd_dev_err(
364 dd,
365 "Send context fixed pool sizes, %u, larger than pool block count %u\n",
366 ab_total, pool_blocks);
367 return -EINVAL;
368 }
369 /* subtract off the fixed pool blocks */
370 pool_blocks -= ab_total;
371
372 for (i = 0; i < NUM_SC_POOLS; i++) {
373 struct mem_pool_info *pi = &mem_pool_info[i];
374
375 /* % beats absolute blocks */
376 if (pi->centipercent >= 0)
377 pi->blocks = (pool_blocks * pi->centipercent) / 10000;
378
379 if (pi->blocks == 0 && pi->count != 0) {
380 dd_dev_err(
381 dd,
382 "Send context memory pool %d has %u contexts, but no blocks\n",
383 i, pi->count);
384 return -EINVAL;
385 }
386 if (pi->count == 0) {
387 /* warn about wasted blocks */
388 if (pi->blocks != 0)
389 dd_dev_err(
390 dd,
391 "Send context memory pool %d has %u blocks, but zero contexts\n",
392 i, pi->blocks);
393 pi->size = 0;
394 } else {
395 pi->size = pi->blocks / pi->count;
396 }
397 }
398
399 /* step 4: fill in the context type sizes from the pool sizes */
400 used_blocks = 0;
401 for (i = 0; i < SC_MAX; i++) {
402 if (dd->sc_sizes[i].size < 0) {
403 unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
404
405 WARN_ON_ONCE(pool >= NUM_SC_POOLS);
406 dd->sc_sizes[i].size = mem_pool_info[pool].size;
407 }
408 /* make sure we are not larger than what is allowed by the HW */
409#define PIO_MAX_BLOCKS 1024
410 if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
411 dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
412
413 /* calculate our total usage */
414 used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
415 }
416 extra = total_blocks - used_blocks;
417 if (extra != 0)
418 dd_dev_info(dd, "unused send context blocks: %d\n", extra);
419
420 return total_contexts;
421}
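
/*
 * Worked example (exposition only; all numbers are hypothetical): with
 * 8192 total blocks, the default 100%-in-pool-0 configuration, 16 ack
 * contexts of 32 blocks each (512 fixed blocks) and 24 pool 0 contexts,
 * the pool receives 8192 - 512 = 7680 blocks and each pool 0 context
 * ends up with 7680 / 24 = 320 blocks.
 */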
422
423int init_send_contexts(struct hfi1_devdata *dd)
424{
425 u16 base;
426 int ret, i, j, context;
427
428 ret = init_credit_return(dd);
429 if (ret)
430 return ret;
431
432 dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
433 GFP_KERNEL);
434 dd->send_contexts = kcalloc(dd->num_send_contexts,
435 sizeof(struct send_context_info),
436 GFP_KERNEL);
437 if (!dd->send_contexts || !dd->hw_to_sw) {
438 dd_dev_err(dd, "Unable to allocate send context arrays\n");
439 kfree(dd->hw_to_sw);
440 kfree(dd->send_contexts);
441 free_credit_return(dd);
442 return -ENOMEM;
443 }
444
445 /* hardware context map starts with invalid send context indices */
446 for (i = 0; i < TXE_NUM_CONTEXTS; i++)
447 dd->hw_to_sw[i] = INVALID_SCI;
448
449 /*
450 * All send contexts have their credit sizes. Allocate credits
451 * for each context one after another from the global space.
452 */
453 context = 0;
454 base = 1;
455 for (i = 0; i < SC_MAX; i++) {
456 struct sc_config_sizes *scs = &dd->sc_sizes[i];
457
458 for (j = 0; j < scs->count; j++) {
459 struct send_context_info *sci =
460 &dd->send_contexts[context];
461 sci->type = i;
462 sci->base = base;
463 sci->credits = scs->size;
464
465 context++;
466 base += scs->size;
467 }
468 }
469
470 return 0;
471}
472
473/*
474 * Allocate a software index and hardware context of the given type.
475 *
476 * Must be called with dd->sc_lock held.
477 */
478static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
479 u32 *hw_context)
480{
481 struct send_context_info *sci;
482 u32 index;
483 u32 context;
484
485 for (index = 0, sci = &dd->send_contexts[0];
486 index < dd->num_send_contexts; index++, sci++) {
487 if (sci->type == type && sci->allocated == 0) {
488 sci->allocated = 1;
489 /* use a 1:1 mapping, but make them non-equal */
490 context = dd->chip_send_contexts - index - 1;
491 dd->hw_to_sw[context] = index;
492 *sw_index = index;
493 *hw_context = context;
494 return 0; /* success */
495 }
496 }
497 dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
498 return -ENOSPC;
499}
500
501/*
502 * Free the send context given by its software index.
503 *
504 * Must be called with dd->sc_lock held.
505 */
506static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
507{
508 struct send_context_info *sci;
509
510 sci = &dd->send_contexts[sw_index];
511 if (!sci->allocated) {
512 dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
513 __func__, sw_index, hw_context);
514 }
515 sci->allocated = 0;
516 dd->hw_to_sw[hw_context] = INVALID_SCI;
517}
518
519/* return the base context of a context in a group */
520static inline u32 group_context(u32 context, u32 group)
521{
522 return (context >> group) << group;
523}
524
525/* return the size of a group */
526static inline u32 group_size(u32 group)
527{
528 return 1 << group;
529}
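
/*
 * Illustrative example (exposition only): with group == 2 there are
 * group_size(2) == 4 contexts per group, and contexts 8..11 all share
 * the base context group_context(10, 2) == 8.
 */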
530
531/*
532 * Obtain the credit return addresses, kernel virtual and physical, for the
533 * given sc.
534 *
535 * To understand this routine:
536 * o va and pa are arrays of struct credit_return. One for each physical
537 * send context, per NUMA.
538 * o Each send context always looks in its relative location in a struct
539 * credit_return for its credit return.
540 * o Each send context in a group must have its return address CSR programmed
541 * with the same value. Use the address of the first send context in the
542 * group.
543 */
544static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
545{
546 u32 gc = group_context(sc->hw_context, sc->group);
547 u32 index = sc->hw_context & 0x7;
548
549 sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
550 *pa = (unsigned long)
551 &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
552}
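
/*
 * Illustrative example (exposition only): with group == 3 (eight contexts
 * per group), hw_context 21 has gc = group_context(21, 3) = 16, so its
 * credit return lands in cr_base[node].va[16].cr[21 & 0x7], i.e. slot 5
 * of the struct credit_return shared by contexts 16..23.
 */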
553
554/*
555 * Work queue function triggered in error interrupt routine for
556 * kernel contexts.
557 */
558static void sc_halted(struct work_struct *work)
559{
560 struct send_context *sc;
561
562 sc = container_of(work, struct send_context, halt_work);
563 sc_restart(sc);
564}
565
566/*
567 * Calculate PIO block threshold for this send context using the given MTU.
568 * Trigger a return when one MTU plus an optional header's worth of credits remain.
569 *
570 * Parameter mtu is in bytes.
571 * Parameter hdrqentsize is in DWORDs.
572 *
573 * Return value is what to write into the CSR: trigger return when
574 * unreturned credits pass this count.
575 */
576u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
577{
578 u32 release_credits;
579 u32 threshold;
580
581 /* add in the header size, then divide by the PIO block size */
582 mtu += hdrqentsize << 2;
583 release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
584
585 /* check against this context's credits */
586 if (sc->credits <= release_credits)
587 threshold = 1;
588 else
589 threshold = sc->credits - release_credits;
590
591 return threshold;
592}
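
/*
 * Worked example (exposition only, assuming a 64 byte PIO_BLOCK_SIZE):
 * with a 4096 byte MTU and hdrqentsize of 32 dwords (128 bytes),
 * release_credits = DIV_ROUND_UP(4224, 64) = 66. A context with 160
 * credits gets a threshold of 160 - 66 = 94, while a context with 66 or
 * fewer credits gets the minimum threshold of 1.
 */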
593
594/*
595 * Calculate credit threshold in terms of percent of the allocated credits.
596 * Trigger when unreturned credits equal or exceed the percentage of the whole.
597 *
598 * Return value is what to write into the CSR: trigger return when
599 * unreturned credits pass this count.
600 */
601static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
602{
603 return (sc->credits * percent) / 100;
604}
605
606/*
607 * Set the credit return threshold.
608 */
609void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
610{
611 unsigned long flags;
612 u32 old_threshold;
613 int force_return = 0;
614
615 spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
616
617 old_threshold = (sc->credit_ctrl >>
618 SC(CREDIT_CTRL_THRESHOLD_SHIFT))
619 & SC(CREDIT_CTRL_THRESHOLD_MASK);
620
621 if (new_threshold != old_threshold) {
622 sc->credit_ctrl =
623 (sc->credit_ctrl
624 & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
625 | ((new_threshold
626 & SC(CREDIT_CTRL_THRESHOLD_MASK))
627 << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
628 write_kctxt_csr(sc->dd, sc->hw_context,
629 SC(CREDIT_CTRL), sc->credit_ctrl);
630
631 /* force a credit return on change to avoid a possible stall */
632 force_return = 1;
633 }
634
635 spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
636
637 if (force_return)
638 sc_return_credits(sc);
639}
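
/*
 * Illustrative sketch, added for exposition only: a hypothetical caller
 * reacting to an MTU change could recompute and apply the threshold as
 * below. sc_mtu_to_threshold() and sc_set_cr_threshold() are the driver
 * routines defined above; only the example function is made up.
 */
static void __maybe_unused example_update_threshold(struct send_context *sc,
						    u32 new_mtu,
						    u32 hdrqentsize)
{
	u32 thresh = sc_mtu_to_threshold(sc, new_mtu, hdrqentsize);

	/* forces a credit return if the threshold actually changed */
	sc_set_cr_threshold(sc, thresh);
}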
640
641/*
642 * set_pio_integrity
643 *
644 * Set the CHECK_ENABLE register for the send context 'sc'.
645 */
646void set_pio_integrity(struct send_context *sc)
647{
648 struct hfi1_devdata *dd = sc->dd;
649 u64 reg = 0;
650 u32 hw_context = sc->hw_context;
651 int type = sc->type;
652
653 /*
654 * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
655 * we're snooping.
656 */
657 if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
658 dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
659 reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
660
661 write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
662}
663
664/*
665 * Allocate a NUMA relative send context structure of the given type along
666 * with a HW context.
667 */
668struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
669 uint hdrqentsize, int numa)
670{
671 struct send_context_info *sci;
672 struct send_context *sc;
673 dma_addr_t pa;
674 unsigned long flags;
675 u64 reg;
676 u32 thresh;
677 u32 sw_index;
678 u32 hw_context;
679 int ret;
680 u8 opval, opmask;
681
682 /* do not allocate while frozen */
683 if (dd->flags & HFI1_FROZEN)
684 return NULL;
685
686 sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa);
687 if (!sc) {
688 dd_dev_err(dd, "Cannot allocate send context structure\n");
689 return NULL;
690 }
691
692 spin_lock_irqsave(&dd->sc_lock, flags);
693 ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
694 if (ret) {
695 spin_unlock_irqrestore(&dd->sc_lock, flags);
696 kfree(sc);
697 return NULL;
698 }
699
700 sci = &dd->send_contexts[sw_index];
701 sci->sc = sc;
702
703 sc->dd = dd;
704 sc->node = numa;
705 sc->type = type;
706 spin_lock_init(&sc->alloc_lock);
707 spin_lock_init(&sc->release_lock);
708 spin_lock_init(&sc->credit_ctrl_lock);
709 INIT_LIST_HEAD(&sc->piowait);
710 INIT_WORK(&sc->halt_work, sc_halted);
711 atomic_set(&sc->buffers_allocated, 0);
712 init_waitqueue_head(&sc->halt_wait);
713
714 /* grouping is always single context for now */
715 sc->group = 0;
716
717 sc->sw_index = sw_index;
718 sc->hw_context = hw_context;
719 cr_group_addresses(sc, &pa);
720 sc->credits = sci->credits;
721
722/* PIO Send Memory Address details */
723#define PIO_ADDR_CONTEXT_MASK 0xfful
724#define PIO_ADDR_CONTEXT_SHIFT 16
725 sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
726 << PIO_ADDR_CONTEXT_SHIFT);
727
728 /* set base and credits */
729 reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
730 << SC(CTRL_CTXT_DEPTH_SHIFT))
731 | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
732 << SC(CTRL_CTXT_BASE_SHIFT));
733 write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
734
735 set_pio_integrity(sc);
736
737 /* unmask all errors */
738 write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
739
740 /* set the default partition key */
741 write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
742 (DEFAULT_PKEY &
743 SC(CHECK_PARTITION_KEY_VALUE_MASK))
744 << SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
745
746 /* per context type checks */
747 if (type == SC_USER) {
748 opval = USER_OPCODE_CHECK_VAL;
749 opmask = USER_OPCODE_CHECK_MASK;
750 } else {
751 opval = OPCODE_CHECK_VAL_DISABLED;
752 opmask = OPCODE_CHECK_MASK_DISABLED;
753 }
754
755 /* set the send context check opcode mask and value */
756 write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
757 ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
758 ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
759
760 /* set up credit return */
761 reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
762 write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
763
764 /*
765 * Calculate the initial credit return threshold.
766 *
767 * For Ack contexts, set a threshold for half the credits.
768 * For User contexts use the given percentage. This has been
769 * sanitized on driver start-up.
770 * For Kernel contexts, use the default MTU plus a header.
771 */
772 if (type == SC_ACK) {
773 thresh = sc_percent_to_threshold(sc, 50);
774 } else if (type == SC_USER) {
775 thresh = sc_percent_to_threshold(sc,
776 user_credit_return_threshold);
777 } else { /* kernel */
778 thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
779 }
780 reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
781 /* add in early return */
782 if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
783 reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
784 else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
785 reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
786
787 /* set up write-through credit_ctrl */
788 sc->credit_ctrl = reg;
789 write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
790
791 /* User send contexts should not allow sending on VL15 */
792 if (type == SC_USER) {
793 reg = 1ULL << 15;
794 write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
795 }
796
797 spin_unlock_irqrestore(&dd->sc_lock, flags);
798
799 /*
800 * Allocate shadow ring to track outstanding PIO buffers _after_
801 * unlocking. We don't know the size until the lock is held and
802 * we can't allocate while the lock is held. No one is using
803 * the context yet, so allocate it now.
804 *
805 * User contexts do not get a shadow ring.
806 */
807 if (type != SC_USER) {
808 /*
809 * Size the shadow ring 1 larger than the number of credits
810 * so head == tail can mean empty.
811 */
812 sc->sr_size = sci->credits + 1;
813 sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
814 sc->sr_size, GFP_KERNEL, numa);
815 if (!sc->sr) {
816 dd_dev_err(dd,
817 "Cannot allocate send context shadow ring structure\n");
818 sc_free(sc);
819 return NULL;
820 }
821 }
822
823 dd_dev_info(dd,
824 "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
825 sw_index,
826 hw_context,
827 sc_type_name(type),
828 sc->group,
829 sc->credits,
830 sc->credit_ctrl,
831 thresh);
832
833 return sc;
834}
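
/*
 * Illustrative sketch, added for exposition only: the expected lifecycle
 * of a kernel send context built from the routines in this file. The
 * example function is hypothetical and skips the per-VL bookkeeping that
 * real callers such as init_pervl_scs() perform.
 */
static int __maybe_unused example_kernel_sc_lifecycle(struct hfi1_devdata *dd,
						      uint hdrqentsize)
{
	struct send_context *sc;
	int ret;

	sc = sc_alloc(dd, SC_KERNEL, hdrqentsize, dd->node);
	if (!sc)
		return -ENOMEM;

	ret = sc_enable(sc);	/* runs the PIO init engine, turns ctxt on */
	if (ret) {
		sc_free(sc);
		return ret;
	}

	/* ... use the context via sc_buffer_alloc() ... */

	sc_free(sc);	/* disables the HW context before freeing */
	return 0;
}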
835
836/* free a per-NUMA send context structure */
837void sc_free(struct send_context *sc)
838{
839 struct hfi1_devdata *dd;
840 unsigned long flags;
841 u32 sw_index;
842 u32 hw_context;
843
844 if (!sc)
845 return;
846
847 sc->flags |= SCF_IN_FREE; /* ensure no restarts */
848 dd = sc->dd;
849 if (!list_empty(&sc->piowait))
850 dd_dev_err(dd, "piowait list not empty!\n");
851 sw_index = sc->sw_index;
852 hw_context = sc->hw_context;
853 sc_disable(sc); /* make sure the HW is disabled */
854 flush_work(&sc->halt_work);
855
856 spin_lock_irqsave(&dd->sc_lock, flags);
857 dd->send_contexts[sw_index].sc = NULL;
858
859 /* clear/disable all registers set in sc_alloc */
860 write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
861 write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
862 write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
863 write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
864 write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
865 write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
866 write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
867
868 /* release the index and context for re-use */
869 sc_hw_free(dd, sw_index, hw_context);
870 spin_unlock_irqrestore(&dd->sc_lock, flags);
871
872 kfree(sc->sr);
873 kfree(sc);
874}
875
876/* disable the context */
877void sc_disable(struct send_context *sc)
878{
879 u64 reg;
880 unsigned long flags;
881 struct pio_buf *pbuf;
882
883 if (!sc)
884 return;
885
886 /* do all steps, even if already disabled */
887 spin_lock_irqsave(&sc->alloc_lock, flags);
888 reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
889 reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
890 sc->flags &= ~SCF_ENABLED;
891 sc_wait_for_packet_egress(sc, 1);
892 write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
893 spin_unlock_irqrestore(&sc->alloc_lock, flags);
894
895 /*
896 * Flush any waiters. Once the context is disabled,
897 * credit return interrupts are stopped (although there
898 * could be one in-process when the context is disabled).
899 * Wait one microsecond for any lingering interrupts, then
900 * proceed with the flush.
901 */
902 udelay(1);
903 spin_lock_irqsave(&sc->release_lock, flags);
904 if (sc->sr) { /* this context has a shadow ring */
905 while (sc->sr_tail != sc->sr_head) {
906 pbuf = &sc->sr[sc->sr_tail].pbuf;
907 if (pbuf->cb)
908 (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
909 sc->sr_tail++;
910 if (sc->sr_tail >= sc->sr_size)
911 sc->sr_tail = 0;
912 }
913 }
914 spin_unlock_irqrestore(&sc->release_lock, flags);
915}
916
917/* return SendEgressCtxtStatus.PacketOccupancy */
918#define packet_occupancy(r) \
919 (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
920 >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
921
922/* is egress halted on the context? */
923#define egress_halted(r) \
924 ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
925
926/* wait for packet egress, optionally pause for credit return */
927static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
928{
929 struct hfi1_devdata *dd = sc->dd;
930 u64 reg;
931 u32 loop = 0;
932
933 while (1) {
934 reg = read_csr(dd, sc->hw_context * 8 +
935 SEND_EGRESS_CTXT_STATUS);
936 /* done if egress is stopped */
937 if (egress_halted(reg))
938 break;
939 reg = packet_occupancy(reg);
940 if (reg == 0)
941 break;
942 if (loop > 100) {
943 dd_dev_err(dd,
944 "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n",
945 __func__, sc->sw_index,
946 sc->hw_context, (u32)reg);
947 break;
948 }
949 loop++;
950 udelay(1);
951 }
952
953 if (pause)
954 /* Add additional delay to ensure chip returns all credits */
955 pause_for_credit_return(dd);
956}
957
958void sc_wait(struct hfi1_devdata *dd)
959{
960 int i;
961
962 for (i = 0; i < dd->num_send_contexts; i++) {
963 struct send_context *sc = dd->send_contexts[i].sc;
964
965 if (!sc)
966 continue;
967 sc_wait_for_packet_egress(sc, 0);
968 }
969}
970
971/*
972 * Restart a context after it has been halted due to error.
973 *
974 * If the first step, waiting for the halt to be asserted, fails, then
975 * return early. For later steps, complain about timeouts but keep going.
976 *
977 * It is expected that allocations (enabled flag bit) have been shut off
978 * already (only applies to kernel contexts).
979 */
980int sc_restart(struct send_context *sc)
981{
982 struct hfi1_devdata *dd = sc->dd;
983 u64 reg;
984 u32 loop;
985 int count;
986
987 /* bounce off if not halted, or being free'd */
988 if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
989 return -EINVAL;
990
991 dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
992 sc->hw_context);
993
994 /*
995 * Step 1: Wait for the context to actually halt.
996 *
997 * The error interrupt is asynchronous to actually setting halt
998 * on the context.
999 */
1000 loop = 0;
1001 while (1) {
1002 reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
1003 if (reg & SC(STATUS_CTXT_HALTED_SMASK))
1004 break;
1005 if (loop > 100) {
1006 dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
1007 __func__, sc->sw_index, sc->hw_context);
1008 return -ETIME;
1009 }
1010 loop++;
1011 udelay(1);
1012 }
1013
1014 /*
1015 * Step 2: Ensure no users are still trying to write to PIO.
1016 *
1017 * For kernel contexts, we have already turned off buffer allocation.
1018 * Now wait for the buffer count to go to zero.
1019 *
1020 * For user contexts, the user handling code has cut off write access
1021 * to the context's PIO pages before calling this routine and will
1022 * restore write access after this routine returns.
1023 */
1024 if (sc->type != SC_USER) {
1025 /* kernel context */
1026 loop = 0;
1027 while (1) {
1028 count = atomic_read(&sc->buffers_allocated);
1029 if (count == 0)
1030 break;
1031 if (loop > 100) {
1032 dd_dev_err(dd,
1033 "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
1034 __func__, sc->sw_index,
1035 sc->hw_context, count);
1036 break;
1037 }
1037 loop++;
1038 udelay(1);
1039 }
1040 }
1041
1042 /*
1043 * Step 3: Wait for all packets to egress.
1044 * This is done while disabling the send context
1045 *
1046 * Step 4: Disable the context
1047 *
1048 * This is a superset of the halt. After the disable, the
1049 * errors can be cleared.
1050 */
1051 sc_disable(sc);
1052
1053 /*
1054 * Step 5: Enable the context
1055 *
1056 * This enable will clear the halted flag and per-send context
1057 * error flags.
1058 */
1059 return sc_enable(sc);
1060}
1061
1062/*
1063 * PIO freeze processing. To be called after the TXE block is fully frozen.
1064 * Go through all frozen send contexts and disable them. The contexts are
1065 * already stopped by the freeze.
1066 */
1067void pio_freeze(struct hfi1_devdata *dd)
1068{
1069 struct send_context *sc;
1070 int i;
1071
1072 for (i = 0; i < dd->num_send_contexts; i++) {
1073 sc = dd->send_contexts[i].sc;
1074 /*
1075 * Don't disable unallocated, unfrozen, or user send contexts.
1076 * User send contexts will be disabled when the process
1077 * calls into the driver to reset its context.
1078 */
1079 if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1080 continue;
1081
1082 /* only need to disable, the context is already stopped */
1083 sc_disable(sc);
1084 }
1085}
1086
1087/*
1088 * Unfreeze PIO for kernel send contexts. The precondition for calling this
1089 * is that all PIO send contexts have been disabled and the SPC freeze has
1090 * been cleared. Now perform the last step and re-enable each kernel context.
1091 * User (PSM) processing will occur when PSM calls into the kernel to
1092 * acknowledge the freeze.
1093 */
1094void pio_kernel_unfreeze(struct hfi1_devdata *dd)
1095{
1096 struct send_context *sc;
1097 int i;
1098
1099 for (i = 0; i < dd->num_send_contexts; i++) {
1100 sc = dd->send_contexts[i].sc;
1101 if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1102 continue;
1103
1104 sc_enable(sc); /* will clear the sc frozen flag */
1105 }
1106}
1107
1108/*
1109 * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
1110 * Returns:
1111 * -ETIMEDOUT - if we wait too long
1112 * -EIO - if there was an error
1113 */
1114static int pio_init_wait_progress(struct hfi1_devdata *dd)
1115{
1116 u64 reg;
1117 int max, count = 0;
1118
1119 /* max is the longest possible HW init time / delay */
1120 max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
1121 while (1) {
1122 reg = read_csr(dd, SEND_PIO_INIT_CTXT);
1123 if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
1124 break;
1125 if (count >= max)
1126 return -ETIMEDOUT;
1127 udelay(5);
1128 count++;
1129 }
1130
1131 return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
1132}
1133
1134/*
1135 * Reset all of the send contexts to their power-on state. Used
1136 * only during manual init - no lock against sc_enable needed.
1137 */
1138void pio_reset_all(struct hfi1_devdata *dd)
1139{
1140 int ret;
1141
1142 /* make sure the init engine is not busy */
1143 ret = pio_init_wait_progress(dd);
1144 /* ignore any timeout */
1145 if (ret == -EIO) {
1146 /* clear the error */
1147 write_csr(dd, SEND_PIO_ERR_CLEAR,
1148 SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
1149 }
1150
1151 /* reset init all */
1152 write_csr(dd, SEND_PIO_INIT_CTXT,
1153 SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
1154 udelay(2);
1155 ret = pio_init_wait_progress(dd);
1156 if (ret < 0) {
1157 dd_dev_err(dd,
1158 "PIO send context init %s while initializing all PIO blocks\n",
1159 ret == -ETIMEDOUT ? "is stuck" : "had an error");
1160 }
1161}
1162
1163/* enable the context */
1164int sc_enable(struct send_context *sc)
1165{
1166 u64 sc_ctrl, reg, pio;
1167 struct hfi1_devdata *dd;
1168 unsigned long flags;
1169 int ret = 0;
1170
1171 if (!sc)
1172 return -EINVAL;
1173 dd = sc->dd;
1174
1175 /*
1176 * Obtain the allocator lock to guard against any allocation
1177 * attempts (which should not happen prior to context being
1178 * enabled). On the release/disable side we don't need to
1179 * worry about locking since the releaser will not do anything
1180 * if the context accounting values have not changed.
1181 */
1182 spin_lock_irqsave(&sc->alloc_lock, flags);
1183 sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1184 if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
1185 goto unlock; /* already enabled */
1186
1187 /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
1188
1189 *sc->hw_free = 0;
1190 sc->free = 0;
1191 sc->alloc_free = 0;
1192 sc->fill = 0;
1193 sc->sr_head = 0;
1194 sc->sr_tail = 0;
1195 sc->flags = 0;
1196 atomic_set(&sc->buffers_allocated, 0);
1197
1198 /*
1199 * Clear all per-context errors. Some of these will be set when
1200 * we are re-enabling after a context halt. Now that the context
1201 * is disabled, the halt will not clear until after the PIO init
1202 * engine runs below.
1203 */
1204 reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
1205 if (reg)
1206 write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR),
1207 reg);
1208
1209 /*
1210 * The HW PIO initialization engine can handle only one init
1211 * request at a time. Serialize access to each device's engine.
1212 */
1213 spin_lock(&dd->sc_init_lock);
1214 /*
1215 * Since access to this code block is serialized and
1216 * each access waits for the initialization to complete
1217 * before releasing the lock, the PIO initialization engine
1218 * should not be in use, so we don't have to wait for the
1219 * InProgress bit to go down.
1220 */
1221 pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
1222 SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
1223 SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
1224 write_csr(dd, SEND_PIO_INIT_CTXT, pio);
1225 /*
1226 * Wait until the engine is done. Give the chip the required time
1227 * so, hopefully, we read the register just once.
1228 */
1229 udelay(2);
1230 ret = pio_init_wait_progress(dd);
1231 spin_unlock(&dd->sc_init_lock);
1232 if (ret) {
1233 dd_dev_err(dd,
1234 "sctxt%u(%u): Context not enabled due to init failure %d\n",
1235 sc->sw_index, sc->hw_context, ret);
1236 goto unlock;
1237 }
1238
1239 /*
1240 * All is well. Enable the context.
1241 */
1242 sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
1243 write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
1244 /*
1245 * Read SendCtxtCtrl to force the write out and prevent a timing
1246 * hazard where a PIO write may reach the context before the enable.
1247 */
1248 read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1249 sc->flags |= SCF_ENABLED;
1250
1251unlock:
1252 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1253
1254 return ret;
1255}
1256
1257/* force a credit return on the context */
1258void sc_return_credits(struct send_context *sc)
1259{
1260 if (!sc)
1261 return;
1262
1263 /* a 0->1 transition schedules a credit return */
1264 write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
1265 SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
1266 /*
1267 * Ensure that the write is flushed and the credit return is
1268 * scheduled. We care more about the 0 -> 1 transition.
1269 */
1270 read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
1271 /* set back to 0 for next time */
1272 write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
1273}
1274
1275/* allow all in-flight packets to drain on the context */
1276void sc_flush(struct send_context *sc)
1277{
1278 if (!sc)
1279 return;
1280
1281 sc_wait_for_packet_egress(sc, 1);
1282}
1283
1284/* drop all packets on the context, no waiting until they are sent */
1285void sc_drop(struct send_context *sc)
1286{
1287 if (!sc)
1288 return;
1289
1290 dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
1291 __func__, sc->sw_index, sc->hw_context);
1292}
1293
1294/*
1295 * Start the software reaction to a context halt or SPC freeze:
1296 * - mark the context as halted or frozen
1297 * - stop buffer allocations
1298 *
1299 * Called from the error interrupt. Other work is deferred until
1300 * out of the interrupt.
1301 */
1302void sc_stop(struct send_context *sc, int flag)
1303{
1304 unsigned long flags;
1305
1306 /* mark the context */
1307 sc->flags |= flag;
1308
1309 /* stop buffer allocations */
1310 spin_lock_irqsave(&sc->alloc_lock, flags);
1311 sc->flags &= ~SCF_ENABLED;
1312 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1313 wake_up(&sc->halt_wait);
1314}
1315
1316#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32))
1317#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
1318
1319/*
1320 * The send context buffer "allocator".
1321 *
1322 * @sc: the PIO send context we are allocating from
1323 * @dw_len: length of the whole packet - including PBC - in dwords
1324 * @cb: optional callback to call when the buffer is finished sending
1325 * @arg: argument for cb
1326 *
1327 * Return a pointer to a PIO buffer if successful, NULL if not enough room.
1328 */
1329struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
1330 pio_release_cb cb, void *arg)
1331{
1332 struct pio_buf *pbuf = NULL;
1333 unsigned long flags;
1334 unsigned long avail;
1335 unsigned long blocks = dwords_to_blocks(dw_len);
1336 unsigned long start_fill;
1337 int trycount = 0;
1338 u32 head, next;
1339
1340 spin_lock_irqsave(&sc->alloc_lock, flags);
1341 if (!(sc->flags & SCF_ENABLED)) {
1342 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1343 goto done;
1344 }
1345
1346retry:
1347 avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
1348 if (blocks > avail) {
1349 /* not enough room */
1350 if (unlikely(trycount)) { /* already tried to get more room */
1351 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1352 goto done;
1353 }
1354 /* copy from receiver cache line and recalculate */
1355 sc->alloc_free = ACCESS_ONCE(sc->free);
1356 avail =
1357 (unsigned long)sc->credits -
1358 (sc->fill - sc->alloc_free);
1359 if (blocks > avail) {
1360 /* still no room, actively update */
1361 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1362 sc_release_update(sc);
1363 spin_lock_irqsave(&sc->alloc_lock, flags);
1364 sc->alloc_free = ACCESS_ONCE(sc->free);
1365 trycount++;
1366 goto retry;
1367 }
1368 }
1369
1370 /* there is enough room */
1371
1372 atomic_inc(&sc->buffers_allocated);
1373
1374 /* read this once */
1375 head = sc->sr_head;
1376
1377 /* "allocate" the buffer */
1378 start_fill = sc->fill;
1379 sc->fill += blocks;
1380
1381 /*
1382 * Fill the parts that the releaser looks at before moving the head.
1383 * The only necessary piece is the sent_at field. The credits
1384 * we have just allocated cannot have been returned yet, so the
1385 * cb and arg will not be looked at for a "while". Put them
1386 * on this side of the memory barrier anyway.
1387 */
1388 pbuf = &sc->sr[head].pbuf;
1389 pbuf->sent_at = sc->fill;
1390 pbuf->cb = cb;
1391 pbuf->arg = arg;
1392 pbuf->sc = sc; /* could be filled in at sc->sr init time */
1393 /* make sure this is in memory before updating the head */
1394
1395 /* calculate next head index, do not store */
1396 next = head + 1;
1397 if (next >= sc->sr_size)
1398 next = 0;
1399 /* update the head - must be last! - the releaser can look at fields
1400 in pbuf once we move the head */
1401 smp_wmb();
1402 sc->sr_head = next;
1403 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1404
1405 /* finish filling in the buffer outside the lock */
1406 pbuf->start = sc->base_addr + ((start_fill % sc->credits)
1407 * PIO_BLOCK_SIZE);
1408 pbuf->size = sc->credits * PIO_BLOCK_SIZE;
1409 pbuf->end = sc->base_addr + pbuf->size;
1410 pbuf->block_count = blocks;
1411 pbuf->qw_written = 0;
1412 pbuf->carry_bytes = 0;
1413 pbuf->carry.val64 = 0;
1414done:
1415 return pbuf;
1416}
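
/*
 * Illustrative sketch, added for exposition only: allocating a PIO buffer
 * with a release callback. Both example functions are hypothetical;
 * sc_buffer_alloc() and the PRC_* codes are the driver's own.
 */
static void __maybe_unused example_release_cb(void *arg, int code)
{
	if (code & PRC_SC_DISABLE)
		pr_info("PIO buffer flushed by a context disable\n");
}

static int __maybe_unused example_send(struct send_context *sc, u32 dwords)
{
	struct pio_buf *pbuf;

	pbuf = sc_buffer_alloc(sc, dwords, example_release_cb, NULL);
	if (!pbuf)
		return -EBUSY;	/* not enough credits right now */

	/* ... copy the PBC and payload into pbuf, then it will be sent ... */
	return 0;
}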
1417
1418/*
1419 * There are at least two entities that can turn on credit return
1420 * interrupts and they can overlap. Avoid problems by implementing
1421 * a count scheme that is enforced by a lock. The lock is needed because
1422 * the count and CSR write must be paired.
1423 */
1424
1425/*
1426 * Start credit return interrupts. This is managed by a count. If already
1427 * on, just increment the count.
1428 */
1429void sc_add_credit_return_intr(struct send_context *sc)
1430{
1431 unsigned long flags;
1432
1433 /* lock must surround both the count change and the CSR update */
1434 spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1435 if (sc->credit_intr_count == 0) {
1436 sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1437 write_kctxt_csr(sc->dd, sc->hw_context,
1438 SC(CREDIT_CTRL), sc->credit_ctrl);
1439 }
1440 sc->credit_intr_count++;
1441 spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1442}
1443
1444/*
1445 * Stop credit return interrupts. This is managed by a count. Decrement the
1446 * count; if this was the last user, turn the credit return interrupt off.
1447 */
1448void sc_del_credit_return_intr(struct send_context *sc)
1449{
1450 unsigned long flags;
1451
1452 WARN_ON(sc->credit_intr_count == 0);
1453
1454 /* lock must surround both the count change and the CSR update */
1455 spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1456 sc->credit_intr_count--;
1457 if (sc->credit_intr_count == 0) {
1458 sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1459 write_kctxt_csr(sc->dd, sc->hw_context,
1460 SC(CREDIT_CTRL), sc->credit_ctrl);
1461 }
1462 spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1463}
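
/*
 * Illustrative sketch, added for exposition only: the count scheme in
 * use. Every sc_add_credit_return_intr() must be balanced by a later
 * sc_del_credit_return_intr(); the hypothetical function below turns the
 * interrupt on only while its caller waits for credits.
 */
static void __maybe_unused example_wait_for_credits(struct send_context *sc)
{
	sc_add_credit_return_intr(sc);	/* count 0 -> 1 enables the intr */

	/* ... sleep until the credit return interrupt fires ... */

	sc_del_credit_return_intr(sc);	/* count 1 -> 0 disables the intr */
}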
1464
1465/*
1466 * The caller must be careful when calling this: every call made with
1467 * needint set must later be paired with a call with needint clear.
1468 */
1469void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
1470{
1471 if (needint)
1472 sc_add_credit_return_intr(sc);
1473 else
1474 sc_del_credit_return_intr(sc);
1475 trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
1476 if (needint) {
1477 mmiowb();
1478 sc_return_credits(sc);
1479 }
1480}
1481
1482/**
1483 * sc_piobufavail - callback when a PIO buffer is available
1484 * @sc: the send context
1485 *
1486 * This is called from the interrupt handler when a PIO buffer is
1487 * available after hfi1_verbs_send() returned an error that no buffers were
1488 * available. Disable the interrupt if there are no more QPs waiting.
1489 */
1490static void sc_piobufavail(struct send_context *sc)
1491{
1492 struct hfi1_devdata *dd = sc->dd;
1493 struct hfi1_ibdev *dev = &dd->verbs_dev;
1494 struct list_head *list;
1495 struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE];
1496 struct hfi1_qp *qp;
1497 unsigned long flags;
1498 unsigned i, n = 0;
1499
1500 if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
1501 return;
1502 list = &sc->piowait;
1503 /*
1504 * Note: checking that the piowait list is empty and clearing
1505 * the buffer available interrupt needs to be atomic or we
1506 * could end up with QPs on the wait list with the interrupt
1507 * disabled.
1508 */
1509 write_seqlock_irqsave(&dev->iowait_lock, flags);
1510 while (!list_empty(list)) {
1511 struct iowait *wait;
1512
1513 if (n == ARRAY_SIZE(qps))
1514 goto full;
1515 wait = list_first_entry(list, struct iowait, list);
1516 qp = container_of(wait, struct hfi1_qp, s_iowait);
1517 list_del_init(&qp->s_iowait.list);
1518 /* refcount held until actual wake up */
1519 qps[n++] = qp;
1520 }
1521 /*
1522 * Counting: only call wantpiobuf_intr() if there were waiters and they
1523 * are now all gone.
1524 */
1525 if (n)
1526 hfi1_sc_wantpiobuf_intr(sc, 0);
1527full:
1528 write_sequnlock_irqrestore(&dev->iowait_lock, flags);
1529
1530 for (i = 0; i < n; i++)
1531 hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO);
1532}
1533
1534/* translate a send credit update to a bit code of reasons */
1535static inline int fill_code(u64 hw_free)
1536{
1537 int code = 0;
1538
1539 if (hw_free & CR_STATUS_SMASK)
1540 code |= PRC_STATUS_ERR;
1541 if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
1542 code |= PRC_PBC;
1543 if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
1544 code |= PRC_THRESHOLD;
1545 if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
1546 code |= PRC_FILL_ERR;
1547 if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
1548 code |= PRC_SC_DISABLE;
1549 return code;
1550}
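
/*
 * Illustrative example (exposition only): fill_code() reports a forced
 * return (CR_CREDIT_RETURN_DUE_TO_FORCE) as PRC_SC_DISABLE, so a credit
 * return caused by both the threshold and a forced return yields
 * PRC_THRESHOLD | PRC_SC_DISABLE == 0x04 | 0x20 == 0x24.
 */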
1551
1552/* use the jiffies compare to get the wrap right */
1553#define sent_before(a, b) time_before(a, b) /* a < b */
1554
1555/*
1556 * The send context buffer "releaser".
1557 */
1558void sc_release_update(struct send_context *sc)
1559{
1560 struct pio_buf *pbuf;
1561 u64 hw_free;
1562 u32 head, tail;
1563 unsigned long old_free;
1564 unsigned long extra;
1565 unsigned long flags;
1566 int code;
1567
1568 if (!sc)
1569 return;
1570
1571 spin_lock_irqsave(&sc->release_lock, flags);
1572 /* update free */
1573 hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */
1574 old_free = sc->free;
1575 extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
1576 - (old_free & CR_COUNTER_MASK))
1577 & CR_COUNTER_MASK;
1578 sc->free = old_free + extra;
1579 trace_hfi1_piofree(sc, extra);
1580
1581 /* call sent buffer callbacks */
1582 code = -1; /* code not yet set */
1583 head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */
1584 tail = sc->sr_tail;
1585 while (head != tail) {
1586 pbuf = &sc->sr[tail].pbuf;
1587
1588 if (sent_before(sc->free, pbuf->sent_at)) {
1589 /* not sent yet */
1590 break;
1591 }
1592 if (pbuf->cb) {
1593 if (code < 0) /* fill in code on first user */
1594 code = fill_code(hw_free);
1595 (*pbuf->cb)(pbuf->arg, code);
1596 }
1597
1598 tail++;
1599 if (tail >= sc->sr_size)
1600 tail = 0;
1601 }
1602 /* update tail, in case we moved it */
1603 sc->sr_tail = tail;
1604 spin_unlock_irqrestore(&sc->release_lock, flags);
1605 sc_piobufavail(sc);
1606}
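
/*
 * Worked example (exposition only, assuming an 11-bit CR_COUNTER field):
 * if the hardware counter wrapped from 2047 to 5 while sc->free was 2040,
 * then extra = (5 - (2040 & 0x7ff)) & 0x7ff = 13 and sc->free advances to
 * 2053, so the wrap is handled correctly.
 */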
1607
1608/*
1609 * Send context group releaser. Argument is the send context that caused
1610 * the interrupt. Called from the send context interrupt handler.
1611 *
1612 * Call release on all contexts in the group.
1613 *
1614 * This routine takes the sc_lock without an irqsave because it is only
1615 * called from an interrupt handler. Adjust if that changes.
1616 */
1617void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
1618{
1619 struct send_context *sc;
1620 u32 sw_index;
1621 u32 gc, gc_end;
1622
1623 spin_lock(&dd->sc_lock);
1624 sw_index = dd->hw_to_sw[hw_context];
1625 if (unlikely(sw_index >= dd->num_send_contexts)) {
1626 dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
1627 __func__, hw_context, sw_index);
1628 goto done;
1629 }
1630 sc = dd->send_contexts[sw_index].sc;
1631 if (unlikely(!sc))
1632 goto done;
1633
1634 gc = group_context(hw_context, sc->group);
1635 gc_end = gc + group_size(sc->group);
1636 for (; gc < gc_end; gc++) {
1637 sw_index = dd->hw_to_sw[gc];
1638 if (unlikely(sw_index >= dd->num_send_contexts)) {
1639 dd_dev_err(dd,
1640 "%s: invalid hw (%u) to sw (%u) mapping\n",
1641 __func__, hw_context, sw_index);
1642 continue;
1643 }
1644 sc_release_update(dd->send_contexts[sw_index].sc);
1645 }
1646done:
1647 spin_unlock(&dd->sc_lock);
1648}
1649
1650int init_pervl_scs(struct hfi1_devdata *dd)
1651{
1652 int i;
1653 u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */
1654 u32 ctxt;
1655
1656 dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
1657 dd->rcd[0]->rcvhdrqentsize, dd->node);
1658 if (!dd->vld[15].sc)
1659 goto nomem;
1660 hfi1_init_ctxt(dd->vld[15].sc);
1661 dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
1662 for (i = 0; i < num_vls; i++) {
1663 /*
1664 * Since this function does not deal with a specific
1665 * receive context but we need the RcvHdrQ entry size,
1666 * use the size from rcd[0]. It is guaranteed to be
1667 * valid at this point and will remain the same for all
1668 * receive contexts.
1669 */
1670 dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
1671 dd->rcd[0]->rcvhdrqentsize, dd->node);
1672 if (!dd->vld[i].sc)
1673 goto nomem;
1674
1675 hfi1_init_ctxt(dd->vld[i].sc);
1676
1677 /* non VL15 start with the max MTU */
1678 dd->vld[i].mtu = hfi1_max_mtu;
1679 }
1680 sc_enable(dd->vld[15].sc);
1681 ctxt = dd->vld[15].sc->hw_context;
1682 mask = all_vl_mask & ~(1LL << 15);
1683 write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
1684 dd_dev_info(dd,
1685 "Using send context %u(%u) for VL15\n",
1686 dd->vld[15].sc->sw_index, ctxt);
1687 for (i = 0; i < num_vls; i++) {
1688 sc_enable(dd->vld[i].sc);
1689 ctxt = dd->vld[i].sc->hw_context;
1690 mask = all_vl_mask & ~(1LL << i);
1691 write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
1692 }
1693 return 0;
1694nomem:
1695 sc_free(dd->vld[15].sc);
1696 for (i = 0; i < num_vls; i++)
1697 sc_free(dd->vld[i].sc);
1698 return -ENOMEM;
1699}
1700
1701int init_credit_return(struct hfi1_devdata *dd)
1702{
1703 int ret;
1704 int num_numa;
1705 int i;
1706
1707 num_numa = num_online_nodes();
1708 /* enforce the expectation that the NUMA nodes are compact */
1709 for (i = 0; i < num_numa; i++) {
1710 if (!node_online(i)) {
1711 dd_dev_err(dd, "NUMA nodes are not compact\n");
1712 ret = -EINVAL;
1713 goto done;
1714 }
1715 }
1716
1717 dd->cr_base = kcalloc(
1718 num_numa,
1719 sizeof(struct credit_return_base),
1720 GFP_KERNEL);
1721 if (!dd->cr_base) {
1722 dd_dev_err(dd, "Unable to allocate credit return base\n");
1723 ret = -ENOMEM;
1724 goto done;
1725 }
1726 for (i = 0; i < num_numa; i++) {
1727 int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
1728
1729 set_dev_node(&dd->pcidev->dev, i);
1730 dd->cr_base[i].va = dma_zalloc_coherent(
1731 &dd->pcidev->dev,
1732 bytes,
1733 &dd->cr_base[i].pa,
1734 GFP_KERNEL);
1735 if (dd->cr_base[i].va == NULL) {
1736 set_dev_node(&dd->pcidev->dev, dd->node);
1737 dd_dev_err(dd,
1738 "Unable to allocate credit return DMA range for NUMA %d\n",
1739 i);
1740 ret = -ENOMEM;
1741 goto done;
1742 }
1743 }
1744 set_dev_node(&dd->pcidev->dev, dd->node);
1745
1746 ret = 0;
1747done:
1748 return ret;
1749}
1750
1751void free_credit_return(struct hfi1_devdata *dd)
1752{
1753 int num_numa;
1754 int i;
1755
1756 if (!dd->cr_base)
1757 return;
1758
1759 num_numa = num_online_nodes();
1760 for (i = 0; i < num_numa; i++) {
1761 if (dd->cr_base[i].va) {
1762 dma_free_coherent(&dd->pcidev->dev,
1763 TXE_NUM_CONTEXTS
1764 * sizeof(struct credit_return),
1765 dd->cr_base[i].va,
1766 dd->cr_base[i].pa);
1767 }
1768 }
1769 kfree(dd->cr_base);
1770 dd->cr_base = NULL;
1771}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
new file mode 100644
index 000000000000..0bb885ca3cfb
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -0,0 +1,224 @@
1#ifndef _PIO_H
2#define _PIO_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53
54/* send context types */
55#define SC_KERNEL 0
56#define SC_ACK 1
57#define SC_USER 2
58#define SC_MAX 3
59
60/* invalid send context index */
61#define INVALID_SCI 0xff
62
63/* PIO buffer release callback function */
64typedef void (*pio_release_cb)(void *arg, int code);
65
66/* PIO release codes - in bits, as there could more than one that apply */
67#define PRC_OK 0 /* no known error */
68#define PRC_STATUS_ERR 0x01 /* credit return due to status error */
69#define PRC_PBC 0x02 /* credit return due to PBC */
70#define PRC_THRESHOLD 0x04 /* credit return due to threshold */
71#define PRC_FILL_ERR 0x08 /* credit return due to fill error */
72#define PRC_FORCE 0x10 /* credit return due to credit force */
73#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */
74
75/* byte helper */
76union mix {
77 u64 val64;
78 u32 val32[2];
79 u8 val8[8];
80};
81
82/* an allocated PIO buffer */
83struct pio_buf {
84 struct send_context *sc;/* back pointer to owning send context */
85 pio_release_cb cb; /* called when the buffer is released */
86 void *arg; /* argument for cb */
87 void __iomem *start; /* buffer start address */
88 void __iomem *end; /* context end address */
89 unsigned long size; /* context size, in bytes */
90 unsigned long sent_at; /* buffer is sent when <= free */
91 u32 block_count; /* size of buffer, in blocks */
92 u32 qw_written; /* QW written so far */
93 u32 carry_bytes; /* number of valid bytes in carry */
94 union mix carry; /* pending unwritten bytes */
95};
96
97/* cache line aligned pio buffer array */
98union pio_shadow_ring {
99 struct pio_buf pbuf;
100 u64 unused[16]; /* cache line spacer */
101} ____cacheline_aligned;
102
103/* per-NUMA send context */
104struct send_context {
105 /* read-only after init */
106 struct hfi1_devdata *dd; /* device */
107 void __iomem *base_addr; /* start of PIO memory */
108 union pio_shadow_ring *sr; /* shadow ring */
109 volatile __le64 *hw_free; /* HW free counter */
110 struct work_struct halt_work; /* halted context work queue entry */
111 unsigned long flags; /* flags */
112 int node; /* context home node */
113 int type; /* context type */
114 u32 sw_index; /* software index number */
115 u32 hw_context; /* hardware context number */
116 u32 credits; /* number of blocks in context */
117 u32 sr_size; /* size of the shadow ring */
118 u32 group; /* credit return group */
119 /* allocator fields */
120 spinlock_t alloc_lock ____cacheline_aligned_in_smp;
121 unsigned long fill; /* official alloc count */
122 unsigned long alloc_free; /* copy of free (less cache thrash) */
123 u32 sr_head; /* shadow ring head */
124 /* releaser fields */
125 spinlock_t release_lock ____cacheline_aligned_in_smp;
126 unsigned long free; /* official free count */
127 u32 sr_tail; /* shadow ring tail */
128 /* list for PIO waiters */
129 struct list_head piowait ____cacheline_aligned_in_smp;
130 spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
131 u64 credit_ctrl; /* cache for credit control */
132 u32 credit_intr_count; /* count of credit intr users */
133 atomic_t buffers_allocated; /* count of buffers allocated */
134 wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */
135};
136
137/* send context flags */
138#define SCF_ENABLED 0x01
139#define SCF_IN_FREE 0x02
140#define SCF_HALTED 0x04
141#define SCF_FROZEN 0x08
142
143struct send_context_info {
144 struct send_context *sc; /* allocated working context */
145 u16 allocated; /* has this been allocated? */
146 u16 type; /* context type */
147 u16 base; /* base in PIO array */
148 u16 credits; /* size in PIO array */
149};
150
151/* DMA credit return, index is always (context & 0x7) */
152struct credit_return {
153 volatile __le64 cr[8];
154};
155
156/* NUMA indexed credit return array */
157struct credit_return_base {
158 struct credit_return *va;
159 dma_addr_t pa;
160};
161
162/* send context configuration sizes (one per type) */
163struct sc_config_sizes {
164 short int size;
165 short int count;
166};
167
168/* send context functions */
169int init_credit_return(struct hfi1_devdata *dd);
170void free_credit_return(struct hfi1_devdata *dd);
171int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
172int init_send_contexts(struct hfi1_devdata *dd);
173int init_credit_return(struct hfi1_devdata *dd);
174int init_pervl_scs(struct hfi1_devdata *dd);
175struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
176 uint hdrqentsize, int numa);
177void sc_free(struct send_context *sc);
178int sc_enable(struct send_context *sc);
179void sc_disable(struct send_context *sc);
180int sc_restart(struct send_context *sc);
181void sc_return_credits(struct send_context *sc);
182void sc_flush(struct send_context *sc);
183void sc_drop(struct send_context *sc);
184void sc_stop(struct send_context *sc, int bit);
185struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
186 pio_release_cb cb, void *arg);
187void sc_release_update(struct send_context *sc);
188void sc_return_credits(struct send_context *sc);
189void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
190void sc_add_credit_return_intr(struct send_context *sc);
191void sc_del_credit_return_intr(struct send_context *sc);
192void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
193u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
194void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
195void sc_wait(struct hfi1_devdata *dd);
196void set_pio_integrity(struct send_context *sc);
197
198/* support functions */
199void pio_reset_all(struct hfi1_devdata *dd);
200void pio_freeze(struct hfi1_devdata *dd);
201void pio_kernel_unfreeze(struct hfi1_devdata *dd);
202
203/* global PIO send control operations */
204#define PSC_GLOBAL_ENABLE 0
205#define PSC_GLOBAL_DISABLE 1
206#define PSC_GLOBAL_VLARB_ENABLE 2
207#define PSC_GLOBAL_VLARB_DISABLE 3
208#define PSC_CM_RESET 4
209#define PSC_DATA_VL_ENABLE 5
210#define PSC_DATA_VL_DISABLE 6
211
212void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
213void pio_send_control(struct hfi1_devdata *dd, int op);
214
215
216/* PIO copy routines */
217void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
218 const void *from, size_t count);
219void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
220 const void *from, size_t nbytes);
221void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
222void seg_pio_copy_end(struct pio_buf *pbuf);
223
224#endif /* _PIO_H */
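
Taken together, the declarations above form the whole PIO buffer flow: allocate a buffer from a send context, stream data into it with the segmented copy routines, and get told through the pio_release_cb when the hardware credits it back. A minimal sketch of such a caller follows (illustration only, assuming hfi.h/pio.h are included; the names send_one()/my_done() and the exact unit accounting of dw_len are assumptions of this sketch, not part of the patch):

/* hypothetical release callback; PRC_* values are bit flags and may be OR'd */
static void my_done(void *arg, int code)
{
	if (code & (PRC_STATUS_ERR | PRC_FILL_ERR))
		pr_debug("pio buffer returned with error code 0x%x\n", code);
}

/* hypothetical sender: header must be quad-word aligned, data may be anything */
static int send_one(struct send_context *sc, u64 pbc, u32 dw_len,
		    const void *hdr, size_t hdr_bytes,
		    const void *data, size_t data_bytes)
{
	struct pio_buf *pbuf;

	pbuf = sc_buffer_alloc(sc, dw_len, my_done, NULL);	/* dw_len in 32-bit words */
	if (!pbuf)
		return -EBUSY;		/* no credits available right now */

	seg_pio_copy_start(pbuf, pbc, hdr, hdr_bytes);	/* writes the PBC and header */
	seg_pio_copy_mid(pbuf, data, data_bytes);	/* any alignment, any length */
	seg_pio_copy_end(pbuf);				/* flushes carry, pads the block */
	return 0;
}
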
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
new file mode 100644
index 000000000000..8972bbc02038
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio_copy.c
@@ -0,0 +1,858 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include "hfi.h"
52
53/* additive distance between non-SOP and SOP space */
54#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
55#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1)
56/* number of QUADWORDs in a block */
57#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64))
58
59/**
60 * pio_copy - copy data block to MMIO space
61 * @pbuf: a number of blocks allocated within a PIO send context
62 * @pbc: PBC to send
63 * @from: source, must be 8 byte aligned
64 * @count: number of DWORD (32-bit) quantities to copy from source
65 *
66 * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
67 * Must always write full BLOCK_SIZE bytes blocks. The first block must
68 * be written to the corresponding SOP=1 address.
69 *
70 * Known:
71 * o pbuf->start always starts on a block boundary
72 * o pbuf can wrap only at a block boundary
73 */
74void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
75 const void *from, size_t count)
76{
77 void __iomem *dest = pbuf->start + SOP_DISTANCE;
78 void __iomem *send = dest + PIO_BLOCK_SIZE;
79 void __iomem *dend; /* 8-byte data end */
80
81 /* write the PBC */
82 writeq(pbc, dest);
83 dest += sizeof(u64);
84
85 /* calculate where the QWORD data ends - in SOP=1 space */
86 dend = dest + ((count>>1) * sizeof(u64));
87
88 if (dend < send) {
89 /* all QWORD data is within the SOP block, does *not*
90 reach the end of the SOP block */
91
92 while (dest < dend) {
93 writeq(*(u64 *)from, dest);
94 from += sizeof(u64);
95 dest += sizeof(u64);
96 }
97 /*
98 * No boundary checks are needed here:
99 * 0. We're not on the SOP block boundary
100 * 1. The possible DWORD dangle will still be within
101 * the SOP block
102 * 2. We cannot wrap except on a block boundary.
103 */
104 } else {
105 /* QWORD data extends _to_ or beyond the SOP block */
106
107 /* write 8-byte SOP chunk data */
108 while (dest < send) {
109 writeq(*(u64 *)from, dest);
110 from += sizeof(u64);
111 dest += sizeof(u64);
112 }
113 /* drop out of the SOP range */
114 dest -= SOP_DISTANCE;
115 dend -= SOP_DISTANCE;
116
117 /*
118 * If the wrap comes before or matches the data end,
119	 * copy until the wrap, then wrap.
120 *
121 * If the data ends at the end of the SOP above and
122 * the buffer wraps, then pbuf->end == dend == dest
123 * and nothing will get written, but we will wrap in
124 * case there is a dangling DWORD.
125 */
126 if (pbuf->end <= dend) {
127 while (dest < pbuf->end) {
128 writeq(*(u64 *)from, dest);
129 from += sizeof(u64);
130 dest += sizeof(u64);
131 }
132
133 dest -= pbuf->size;
134 dend -= pbuf->size;
135 }
136
137 /* write 8-byte non-SOP, non-wrap chunk data */
138 while (dest < dend) {
139 writeq(*(u64 *)from, dest);
140 from += sizeof(u64);
141 dest += sizeof(u64);
142 }
143 }
144 /* at this point we have wrapped if we are going to wrap */
145
146 /* write dangling u32, if any */
147 if (count & 1) {
148 union mix val;
149
150 val.val64 = 0;
151 val.val32[0] = *(u32 *)from;
152 writeq(val.val64, dest);
153 dest += sizeof(u64);
154 }
155 /* fill in rest of block, no need to check pbuf->end
156 as we only wrap on a block boundary */
157 while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
158 writeq(0, dest);
159 dest += sizeof(u64);
160 }
161
162 /* finished with this buffer */
163 atomic_dec(&pbuf->sc->buffers_allocated);
164}
165
166/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
167#define USE_SHIFTS 1
168#ifdef USE_SHIFTS
169/*
170 * Handle carry bytes using shifts and masks.
171 *
172 * NOTE: the value of the unused portion of carry is expected to always be zero.
173 */
174
175/*
176 * "zero" shift - bit shift used to zero out upper bytes. Input is
177 * the count of LSB bytes to preserve.
178 */
179#define zshift(x) (8 * (8-(x)))
180
181/*
182 * "merge" shift - bit shift used to merge with carry bytes. Input is
183 * the LSB byte count to move beyond.
184 */
185#define mshift(x) (8 * (x))
186
187/*
188 * Read nbytes bytes from "from" and return them in the LSB bytes
189 * of pbuf->carry. Other bytes are zeroed. Any previous value
190 * pbuf->carry is lost.
191 *
192 * NOTES:
193 * o do not read from "from" if nbytes is zero
194 * o from may _not_ be u64 aligned
195 * o nbytes must not span a QW boundary
196 */
197static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
198 unsigned int nbytes)
199{
200 unsigned long off;
201
202 if (nbytes == 0) {
203 pbuf->carry.val64 = 0;
204 } else {
205 /* align our pointer */
206 off = (unsigned long)from & 0x7;
207 from = (void *)((unsigned long)from & ~0x7l);
208 pbuf->carry.val64 = ((*(u64 *)from)
209 << zshift(nbytes + off))/* zero upper bytes */
210 >> zshift(nbytes); /* place at bottom */
211 }
212 pbuf->carry_bytes = nbytes;
213}
214
215/*
216 * Read nbytes bytes from "from" and put them at the next significant bytes
217 * of pbuf->carry. Unused bytes are zeroed. It is expected that the extra
218 * read does not overfill carry.
219 *
220 * NOTES:
221 * o from may _not_ be u64 aligned
222 * o nbytes may span a QW boundary
223 */
224static inline void read_extra_bytes(struct pio_buf *pbuf,
225 const void *from, unsigned int nbytes)
226{
227 unsigned long off = (unsigned long)from & 0x7;
228 unsigned int room, xbytes;
229
230 /* align our pointer */
231 from = (void *)((unsigned long)from & ~0x7l);
232
233 /* check count first - don't read anything if count is zero */
234 while (nbytes) {
235 /* find the number of bytes in this u64 */
236 room = 8 - off; /* this u64 has room for this many bytes */
237 xbytes = nbytes > room ? room : nbytes;
238
239 /*
240 * shift down to zero lower bytes, shift up to zero upper
241 * bytes, shift back down to move into place
242 */
243 pbuf->carry.val64 |= (((*(u64 *)from)
244 >> mshift(off))
245 << zshift(xbytes))
246 >> zshift(xbytes+pbuf->carry_bytes);
247 off = 0;
248 pbuf->carry_bytes += xbytes;
249 nbytes -= xbytes;
250 from += sizeof(u64);
251 }
252}
253
254/*
255 * Zero extra bytes from the end of pbuf->carry.
256 *
257 * NOTES:
258 * o zbytes <= old_bytes
259 */
260static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
261{
262 unsigned int remaining;
263
264 if (zbytes == 0) /* nothing to do */
265 return;
266
267 remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
268
269 /* NOTE: zshift only guaranteed to work if remaining != 0 */
270 if (remaining)
271 pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
272 >> zshift(remaining);
273 else
274 pbuf->carry.val64 = 0;
275 pbuf->carry_bytes = remaining;
276}
277
278/*
279 * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
280 * Put the unused part of the next 8 bytes of src into the LSB bytes of
281 * pbuf->carry with the upper bytes zeroed.
282 *
283 * NOTES:
284 * o result must keep unused bytes zeroed
285 * o src must be u64 aligned
286 */
287static inline void merge_write8(
288 struct pio_buf *pbuf,
289 void __iomem *dest,
290 const void *src)
291{
292 u64 new, temp;
293
294 new = *(u64 *)src;
295 temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
296 writeq(temp, dest);
297 pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
298}
299
300/*
301 * Write a quad word using all bytes of carry.
302 */
303static inline void carry8_write8(union mix carry, void __iomem *dest)
304{
305 writeq(carry.val64, dest);
306}
307
308/*
309 * Write a quad word using all the valid bytes of carry. If carry
310 * has zero valid bytes, nothing is written.
311 * Returns 0 on nothing written, non-zero on quad word written.
312 */
313static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
314{
315 if (pbuf->carry_bytes) {
316 /* unused bytes are always kept zeroed, so just write */
317 writeq(pbuf->carry.val64, dest);
318 return 1;
319 }
320
321 return 0;
322}
323
324#else /* USE_SHIFTS */
325/*
326 * Handle carry bytes using byte copies.
327 *
328 * NOTE: the value of the unused portion of carry is left uninitialized.
329 */
330
331/*
332 * Jump copy - no-loop copy for < 8 bytes.
333 */
334static inline void jcopy(u8 *dest, const u8 *src, u32 n)
335{
336 switch (n) {
337 case 7:
338 *dest++ = *src++;
339 case 6:
340 *dest++ = *src++;
341 case 5:
342 *dest++ = *src++;
343 case 4:
344 *dest++ = *src++;
345 case 3:
346 *dest++ = *src++;
347 case 2:
348 *dest++ = *src++;
349 case 1:
350 *dest++ = *src++;
351 }
352}
353
354/*
355 * Read nbytes bytes from "from" and place them in the low bytes
356 * of pbuf->carry. Other bytes are left as-is. Any previous
357 * value in pbuf->carry is lost.
358 *
359 * NOTES:
360 * o do not read from "from" if nbytes is zero
361 * o from may _not_ be u64 aligned.
362 */
363static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
364 unsigned int nbytes)
365{
366 jcopy(&pbuf->carry.val8[0], from, nbytes);
367 pbuf->carry_bytes = nbytes;
368}
369
370/*
371 * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
372 * It is expected that the extra read does not overfill carry.
373 *
374 * NOTES:
375 * o from may _not_ be u64 aligned
376 * o nbytes may span a QW boundary
377 */
378static inline void read_extra_bytes(struct pio_buf *pbuf,
379 const void *from, unsigned int nbytes)
380{
381 jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
382 pbuf->carry_bytes += nbytes;
383}
384
385/*
386 * Zero extra bytes from the end of pbuf->carry.
387 *
388 * We do not care about the value of unused bytes in carry, so just
389 * reduce the byte count.
390 *
391 * NOTES:
392 * o zbytes <= old_bytes
393 */
394static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
395{
396 pbuf->carry_bytes -= zbytes;
397}
398
399/*
400 * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
401 * Put the unused part of the next 8 bytes of src into the low bytes of
402 * pbuf->carry.
403 */
404static inline void merge_write8(
405 struct pio_buf *pbuf,
406 void *dest,
407 const void *src)
408{
409 u32 remainder = 8 - pbuf->carry_bytes;
410
411 jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
412 writeq(pbuf->carry.val64, dest);
413 jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes);
414}
415
416/*
417 * Write a quad word using all bytes of carry.
418 */
419static inline void carry8_write8(union mix carry, void *dest)
420{
421 writeq(carry.val64, dest);
422}
423
424/*
425 * Write a quad word using all the valid bytes of carry. If carry
426 * has zero valid bytes, nothing is written.
427 * Returns 0 on nothing written, non-zero on quad word written.
428 */
429static inline int carry_write8(struct pio_buf *pbuf, void *dest)
430{
431 if (pbuf->carry_bytes) {
432 u64 zero = 0;
433
434 jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
435 8 - pbuf->carry_bytes);
436 writeq(pbuf->carry.val64, dest);
437 return 1;
438 }
439
440 return 0;
441}
442#endif /* USE_SHIFTS */
443
444/*
445 * Segmented PIO Copy - start
446 *
447 * Start a PIO copy.
448 *
449 * @pbuf: destination buffer
450 * @pbc: the PBC for the PIO buffer
451 * @from: data source, QWORD aligned
452 * @nbytes: bytes to copy
453 */
454void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
455 const void *from, size_t nbytes)
456{
457 void __iomem *dest = pbuf->start + SOP_DISTANCE;
458 void __iomem *send = dest + PIO_BLOCK_SIZE;
459 void __iomem *dend; /* 8-byte data end */
460
461 writeq(pbc, dest);
462 dest += sizeof(u64);
463
464 /* calculate where the QWORD data ends - in SOP=1 space */
465 dend = dest + ((nbytes>>3) * sizeof(u64));
466
467 if (dend < send) {
468 /* all QWORD data is within the SOP block, does *not*
469 reach the end of the SOP block */
470
471 while (dest < dend) {
472 writeq(*(u64 *)from, dest);
473 from += sizeof(u64);
474 dest += sizeof(u64);
475 }
476 /*
477 * No boundary checks are needed here:
478 * 0. We're not on the SOP block boundary
479 * 1. The possible DWORD dangle will still be within
480 * the SOP block
481 * 2. We cannot wrap except on a block boundary.
482 */
483 } else {
484 /* QWORD data extends _to_ or beyond the SOP block */
485
486 /* write 8-byte SOP chunk data */
487 while (dest < send) {
488 writeq(*(u64 *)from, dest);
489 from += sizeof(u64);
490 dest += sizeof(u64);
491 }
492 /* drop out of the SOP range */
493 dest -= SOP_DISTANCE;
494 dend -= SOP_DISTANCE;
495
496 /*
497 * If the wrap comes before or matches the data end,
498	 * copy until the wrap, then wrap.
499 *
500 * If the data ends at the end of the SOP above and
501 * the buffer wraps, then pbuf->end == dend == dest
502 * and nothing will get written, but we will wrap in
503 * case there is a dangling DWORD.
504 */
505 if (pbuf->end <= dend) {
506 while (dest < pbuf->end) {
507 writeq(*(u64 *)from, dest);
508 from += sizeof(u64);
509 dest += sizeof(u64);
510 }
511
512 dest -= pbuf->size;
513 dend -= pbuf->size;
514 }
515
516 /* write 8-byte non-SOP, non-wrap chunk data */
517 while (dest < dend) {
518 writeq(*(u64 *)from, dest);
519 from += sizeof(u64);
520 dest += sizeof(u64);
521 }
522 }
523 /* at this point we have wrapped if we are going to wrap */
524
525 /* ...but it doesn't matter as we're done writing */
526
527 /* save dangling bytes, if any */
528 read_low_bytes(pbuf, from, nbytes & 0x7);
529
530 pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
531}
532
533/*
534 * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
535 * bytes are non-zero.
536 *
537 * Whole u64s must be written to the chip, so bytes must be manually merged.
538 *
539 * @pbuf: destination buffer
540 * @from: data source, is QWORD aligned.
541 * @nbytes: bytes to copy
542 *
543 * Must handle nbytes < 8.
544 */
545static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
546{
547 void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
548 void __iomem *dend; /* 8-byte data end */
549 unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
550 unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
551
552 /* calculate 8-byte data end */
553 dend = dest + (qw_to_write * sizeof(u64));
554
555 if (pbuf->qw_written < PIO_BLOCK_QWS) {
556 /*
557 * Still within SOP block. We don't need to check for
558 * wrap because we are still in the first block and
559 * can only wrap on block boundaries.
560 */
561 void __iomem *send; /* SOP end */
562 void __iomem *xend;
563
564 /* calculate the end of data or end of block, whichever
565 comes first */
566 send = pbuf->start + PIO_BLOCK_SIZE;
567 xend = send < dend ? send : dend;
568
569 /* shift up to SOP=1 space */
570 dest += SOP_DISTANCE;
571 xend += SOP_DISTANCE;
572
573 /* write 8-byte chunk data */
574 while (dest < xend) {
575 merge_write8(pbuf, dest, from);
576 from += sizeof(u64);
577 dest += sizeof(u64);
578 }
579
580 /* shift down to SOP=0 space */
581 dest -= SOP_DISTANCE;
582 }
583 /*
584 * At this point dest could be (either, both, or neither):
585 * - at dend
586 * - at the wrap
587 */
588
589 /*
590 * If the wrap comes before or matches the data end,
591	 * copy until the wrap, then wrap.
592 *
593 * If dest is at the wrap, we will fall into the if,
594	 * not do the loop, and wrap.
595 *
596 * If the data ends at the end of the SOP above and
597 * the buffer wraps, then pbuf->end == dend == dest
598 * and nothing will get written.
599 */
600 if (pbuf->end <= dend) {
601 while (dest < pbuf->end) {
602 merge_write8(pbuf, dest, from);
603 from += sizeof(u64);
604 dest += sizeof(u64);
605 }
606
607 dest -= pbuf->size;
608 dend -= pbuf->size;
609 }
610
611 /* write 8-byte non-SOP, non-wrap chunk data */
612 while (dest < dend) {
613 merge_write8(pbuf, dest, from);
614 from += sizeof(u64);
615 dest += sizeof(u64);
616 }
617
618 /* adjust carry */
619 if (pbuf->carry_bytes < bytes_left) {
620 /* need to read more */
621 read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
622 } else {
623 /* remove invalid bytes */
624 zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
625 }
626
627 pbuf->qw_written += qw_to_write;
628}
629
630/*
631 * Mid copy helper, "straight case" - source pointer is 64-bit aligned
632 * with no carry bytes.
633 *
634 * @pbuf: destination buffer
635 * @from: data source, is QWORD aligned
636 * @nbytes: bytes to copy
637 *
638 * Must handle nbytes < 8.
639 */
640static void mid_copy_straight(struct pio_buf *pbuf,
641 const void *from, size_t nbytes)
642{
643 void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
644 void __iomem *dend; /* 8-byte data end */
645
646 /* calculate 8-byte data end */
647 dend = dest + ((nbytes>>3) * sizeof(u64));
648
649 if (pbuf->qw_written < PIO_BLOCK_QWS) {
650 /*
651 * Still within SOP block. We don't need to check for
652 * wrap because we are still in the first block and
653 * can only wrap on block boundaries.
654 */
655 void __iomem *send; /* SOP end */
656 void __iomem *xend;
657
658 /* calculate the end of data or end of block, whichever
659 comes first */
660 send = pbuf->start + PIO_BLOCK_SIZE;
661 xend = send < dend ? send : dend;
662
663 /* shift up to SOP=1 space */
664 dest += SOP_DISTANCE;
665 xend += SOP_DISTANCE;
666
667 /* write 8-byte chunk data */
668 while (dest < xend) {
669 writeq(*(u64 *)from, dest);
670 from += sizeof(u64);
671 dest += sizeof(u64);
672 }
673
674 /* shift down to SOP=0 space */
675 dest -= SOP_DISTANCE;
676 }
677 /*
678 * At this point dest could be (either, both, or neither):
679 * - at dend
680 * - at the wrap
681 */
682
683 /*
684 * If the wrap comes before or matches the data end,
685	 * copy until the wrap, then wrap.
686 *
687 * If dest is at the wrap, we will fall into the if,
688	 * not do the loop, and wrap.
689 *
690 * If the data ends at the end of the SOP above and
691 * the buffer wraps, then pbuf->end == dend == dest
692 * and nothing will get written.
693 */
694 if (pbuf->end <= dend) {
695 while (dest < pbuf->end) {
696 writeq(*(u64 *)from, dest);
697 from += sizeof(u64);
698 dest += sizeof(u64);
699 }
700
701 dest -= pbuf->size;
702 dend -= pbuf->size;
703 }
704
705 /* write 8-byte non-SOP, non-wrap chunk data */
706 while (dest < dend) {
707 writeq(*(u64 *)from, dest);
708 from += sizeof(u64);
709 dest += sizeof(u64);
710 }
711
712 /* we know carry_bytes was zero on entry to this routine */
713 read_low_bytes(pbuf, from, nbytes & 0x7);
714
715 pbuf->qw_written += nbytes>>3;
716}
717
718/*
719 * Segmented PIO Copy - middle
720 *
721 * Must handle a tail and a source of any alignment, with any byte count.
722 *
723 * @pbuf: a number of blocks allocated within a PIO send context
724 * @from: data source
725 * @nbytes: number of bytes to copy
726 */
727void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
728{
729 unsigned long from_align = (unsigned long)from & 0x7;
730
731 if (pbuf->carry_bytes + nbytes < 8) {
732 /* not enough bytes to fill a QW */
733 read_extra_bytes(pbuf, from, nbytes);
734 return;
735 }
736
737 if (from_align) {
738 /* misaligned source pointer - align it */
739 unsigned long to_align;
740
741 /* bytes to read to align "from" */
742 to_align = 8 - from_align;
743
744 /*
745 * In the advance-to-alignment logic below, we do not need
746 * to check if we are using more than nbytes. This is because
747 * if we are here, we already know that carry+nbytes will
748 * fill at least one QW.
749 */
750 if (pbuf->carry_bytes + to_align < 8) {
751 /* not enough align bytes to fill a QW */
752 read_extra_bytes(pbuf, from, to_align);
753 from += to_align;
754 nbytes -= to_align;
755 } else {
756 /* bytes to fill carry */
757 unsigned long to_fill = 8 - pbuf->carry_bytes;
758 /* bytes left over to be read */
759 unsigned long extra = to_align - to_fill;
760 void __iomem *dest;
761
762 /* fill carry... */
763 read_extra_bytes(pbuf, from, to_fill);
764 from += to_fill;
765 nbytes -= to_fill;
766
767 /* ...now write carry */
768 dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
769
770 /*
771 * The two checks immediately below cannot both be
772 * true, hence the else. If we have wrapped, we
773 * cannot still be within the first block.
774 * Conversely, if we are still in the first block, we
775 * cannot have wrapped. We do the wrap check first
776 * as that is more likely.
777 */
778 /* adjust if we've wrapped */
779 if (dest >= pbuf->end)
780 dest -= pbuf->size;
781 /* jump to SOP range if within the first block */
782 else if (pbuf->qw_written < PIO_BLOCK_QWS)
783 dest += SOP_DISTANCE;
784
785 carry8_write8(pbuf->carry, dest);
786 pbuf->qw_written++;
787
788 /* read any extra bytes to do final alignment */
789 /* this will overwrite anything in pbuf->carry */
790 read_low_bytes(pbuf, from, extra);
791 from += extra;
792 nbytes -= extra;
793 }
794
795 /* at this point, from is QW aligned */
796 }
797
798 if (pbuf->carry_bytes)
799 mid_copy_mix(pbuf, from, nbytes);
800 else
801 mid_copy_straight(pbuf, from, nbytes);
802}
803
804/*
805 * Segmented PIO Copy - end
806 *
807 * Write any remainder (in pbuf->carry) and finish writing the whole block.
808 *
809 * @pbuf: a number of blocks allocated within a PIO send context
810 */
811void seg_pio_copy_end(struct pio_buf *pbuf)
812{
813 void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
814
815 /*
816 * The two checks immediately below cannot both be true, hence the
817 * else. If we have wrapped, we cannot still be within the first
818 * block. Conversely, if we are still in the first block, we
819 * cannot have wrapped. We do the wrap check first as that is
820 * more likely.
821 */
822 /* adjust if we have wrapped */
823 if (dest >= pbuf->end)
824 dest -= pbuf->size;
825 /* jump to the SOP range if within the first block */
826 else if (pbuf->qw_written < PIO_BLOCK_QWS)
827 dest += SOP_DISTANCE;
828
829 /* write final bytes, if any */
830 if (carry_write8(pbuf, dest)) {
831 dest += sizeof(u64);
832 /*
833 * NOTE: We do not need to recalculate whether dest needs
834 * SOP_DISTANCE or not.
835 *
836 * If we are in the first block and the dangle write
837 * keeps us in the same block, dest will need
838 * to retain SOP_DISTANCE in the loop below.
839 *
840 * If we are in the first block and the dangle write pushes
841 * us to the next block, then loop below will not run
842 * and dest is not used. Hence we do not need to update
843 * it.
844 *
845 * If we are past the first block, then SOP_DISTANCE
846 * was never added, so there is nothing to do.
847 */
848 }
849
850 /* fill in rest of block */
851 while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
852 writeq(0, dest);
853 dest += sizeof(u64);
854 }
855
856 /* finished with this buffer */
857 atomic_dec(&pbuf->sc->buffers_allocated);
858}
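
The trickiest part of the file above is the shift-based carry handling selected by USE_SHIFTS, which the comment notes was tuned in user-space tests. Here is a small stand-alone user-space sketch (not part of the patch, little-endian host assumed) that reproduces what merge_write8() computes for a 3-byte carry followed by 8 aligned source bytes:

#include <stdint.h>
#include <stdio.h>

#define zshift(x) (8 * (8 - (x)))	/* input: count of LSB bytes to preserve */
#define mshift(x) (8 * (x))		/* input: LSB byte count to move beyond */

int main(void)
{
	unsigned int carry_bytes = 3;		/* valid bytes held in carry */
	uint64_t carry = 0xcccbcaULL;		/* bytes CA CB CC, upper bytes zero */
	uint64_t src = 0x0807060504030201ULL;	/* next 8 aligned source bytes */

	/* the quad word merge_write8() would write to the chip... */
	uint64_t out = carry | (src << mshift(carry_bytes));
	/* ...and the bytes it keeps as the new carry */
	uint64_t new_carry = src >> zshift(carry_bytes);

	/* prints 0504030201cccbca and 0000000000080706 */
	printf("write %016llx, new carry %016llx\n",
	       (unsigned long long)out, (unsigned long long)new_carry);
	return 0;
}

Compiled and run, it shows the same split that the loops in mid_copy_mix() perform one quad word at a time: three carry bytes plus five source bytes go out, and the last three source bytes become the new carry.
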
diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform_config.h
new file mode 100644
index 000000000000..8a94a8342052
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/platform_config.h
@@ -0,0 +1,286 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#ifndef __PLATFORM_CONFIG_H
51#define __PLATFORM_CONFIG_H
52
53#define METADATA_TABLE_FIELD_START_SHIFT 0
54#define METADATA_TABLE_FIELD_START_LEN_BITS 15
55#define METADATA_TABLE_FIELD_LEN_SHIFT 16
56#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16
57
58/* Header structure */
59#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0
60#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6
61#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16
62#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12
63#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28
64#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4
65
66enum platform_config_table_type_encoding {
67 PLATFORM_CONFIG_TABLE_RESERVED,
68 PLATFORM_CONFIG_SYSTEM_TABLE,
69 PLATFORM_CONFIG_PORT_TABLE,
70 PLATFORM_CONFIG_RX_PRESET_TABLE,
71 PLATFORM_CONFIG_TX_PRESET_TABLE,
72 PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
73 PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
74 PLATFORM_CONFIG_TABLE_MAX
75};
76
77enum platform_config_system_table_fields {
78 SYSTEM_TABLE_RESERVED,
79 SYSTEM_TABLE_NODE_STRING,
80 SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
81 SYSTEM_TABLE_NODE_GUID,
82 SYSTEM_TABLE_REVISION,
83 SYSTEM_TABLE_VENDOR_OUI,
84 SYSTEM_TABLE_META_VERSION,
85 SYSTEM_TABLE_DEVICE_ID,
86 SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
87 SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
88 SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
89 SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
90 SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
91 SYSTEM_TABLE_MAX
92};
93
94enum platform_config_port_table_fields {
95 PORT_TABLE_RESERVED,
96 PORT_TABLE_PORT_TYPE,
97 PORT_TABLE_ATTENUATION_12G,
98 PORT_TABLE_ATTENUATION_25G,
99 PORT_TABLE_LINK_SPEED_SUPPORTED,
100 PORT_TABLE_LINK_WIDTH_SUPPORTED,
101 PORT_TABLE_VL_CAP,
102 PORT_TABLE_MTU_CAP,
103 PORT_TABLE_TX_LANE_ENABLE_MASK,
104 PORT_TABLE_LOCAL_MAX_TIMEOUT,
105 PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
106 PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
107 PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU,
108 PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
109 PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
110 PORT_TABLE_RX_PRESET_IDX,
111 PORT_TABLE_CABLE_REACH_CLASS,
112 PORT_TABLE_MAX
113};
114
115enum platform_config_rx_preset_table_fields {
116 RX_PRESET_TABLE_RESERVED,
117 RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
118 RX_PRESET_TABLE_QSFP_RX_EQ_APPLY,
119 RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
120 RX_PRESET_TABLE_QSFP_RX_CDR,
121 RX_PRESET_TABLE_QSFP_RX_EQ,
122 RX_PRESET_TABLE_QSFP_RX_AMP,
123 RX_PRESET_TABLE_MAX
124};
125
126enum platform_config_tx_preset_table_fields {
127 TX_PRESET_TABLE_RESERVED,
128 TX_PRESET_TABLE_PRECUR,
129 TX_PRESET_TABLE_ATTN,
130 TX_PRESET_TABLE_POSTCUR,
131 TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
132 TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
133 TX_PRESET_TABLE_QSFP_TX_CDR,
134 TX_PRESET_TABLE_QSFP_TX_EQ,
135 TX_PRESET_TABLE_MAX
136};
137
138enum platform_config_qsfp_attn_table_fields {
139 QSFP_ATTEN_TABLE_RESERVED,
140 QSFP_ATTEN_TABLE_TX_PRESET_IDX,
141 QSFP_ATTEN_TABLE_RX_PRESET_IDX,
142 QSFP_ATTEN_TABLE_MAX
143};
144
145enum platform_config_variable_settings_table_fields {
146 VARIABLE_SETTINGS_TABLE_RESERVED,
147 VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
148 VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
149 VARIABLE_SETTINGS_TABLE_MAX
150};
151
152struct platform_config_data {
153 u32 *table;
154 u32 *table_metadata;
155 u32 num_table;
156};
157
158/*
159 * This struct acts as a quick reference into the platform_data binary image
160 * and is populated by parse_platform_config(...) depending on the specific
161 * META_VERSION
162 */
163struct platform_config_cache {
164 u8 cache_valid;
165 struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
166};
167
168static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
169 0,
170 SYSTEM_TABLE_MAX,
171 PORT_TABLE_MAX,
172 RX_PRESET_TABLE_MAX,
173 TX_PRESET_TABLE_MAX,
174 QSFP_ATTEN_TABLE_MAX,
175 VARIABLE_SETTINGS_TABLE_MAX
176};
177
178/* This section defines default values and encodings for the
179 * fields defined for each table above
180 */
181
182/*=====================================================
183 * System table encodings
184 *====================================================*/
185#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041
186#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4
187
188/*
189 * These power classes are the same as defined in SFF 8636 spec rev 2.4
190 * describing byte 129 in table 6-16, except enumerated in a different order
191 */
192enum platform_config_qsfp_power_class_encoding {
193 QSFP_POWER_CLASS_1 = 1,
194 QSFP_POWER_CLASS_2,
195 QSFP_POWER_CLASS_3,
196 QSFP_POWER_CLASS_4,
197 QSFP_POWER_CLASS_5,
198 QSFP_POWER_CLASS_6,
199 QSFP_POWER_CLASS_7
200};
201
202
203/*=====================================================
204 * Port table encodings
205 *==================================================== */
206enum platform_config_port_type_encoding {
207 PORT_TYPE_RESERVED,
208 PORT_TYPE_DISCONNECTED,
209 PORT_TYPE_FIXED,
210 PORT_TYPE_VARIABLE,
211 PORT_TYPE_QSFP,
212 PORT_TYPE_MAX
213};
214
215enum platform_config_link_speed_supported_encoding {
216 LINK_SPEED_SUPP_12G = 1,
217 LINK_SPEED_SUPP_25G,
218 LINK_SPEED_SUPP_12G_25G,
219 LINK_SPEED_SUPP_MAX
220};
221
222/*
223 * This is a subset (not strict) of the link downgrades
224 * supported. The link downgrades supported are expected
225 * to be supplied to the driver by another entity such as
226 * the fabric manager
227 */
228enum platform_config_link_width_supported_encoding {
229 LINK_WIDTH_SUPP_1X = 1,
230 LINK_WIDTH_SUPP_2X,
231 LINK_WIDTH_SUPP_2X_1X,
232 LINK_WIDTH_SUPP_3X,
233 LINK_WIDTH_SUPP_3X_1X,
234 LINK_WIDTH_SUPP_3X_2X,
235 LINK_WIDTH_SUPP_3X_2X_1X,
236 LINK_WIDTH_SUPP_4X,
237 LINK_WIDTH_SUPP_4X_1X,
238 LINK_WIDTH_SUPP_4X_2X,
239 LINK_WIDTH_SUPP_4X_2X_1X,
240 LINK_WIDTH_SUPP_4X_3X,
241 LINK_WIDTH_SUPP_4X_3X_1X,
242 LINK_WIDTH_SUPP_4X_3X_2X,
243 LINK_WIDTH_SUPP_4X_3X_2X_1X,
244 LINK_WIDTH_SUPP_MAX
245};
246
247enum platform_config_virtual_lane_capability_encoding {
248 VL_CAP_VL0 = 1,
249 VL_CAP_VL0_1,
250 VL_CAP_VL0_2,
251 VL_CAP_VL0_3,
252 VL_CAP_VL0_4,
253 VL_CAP_VL0_5,
254 VL_CAP_VL0_6,
255 VL_CAP_VL0_7,
256 VL_CAP_VL0_8,
257 VL_CAP_VL0_9,
258 VL_CAP_VL0_10,
259 VL_CAP_VL0_11,
260 VL_CAP_VL0_12,
261 VL_CAP_VL0_13,
262 VL_CAP_VL0_14,
263 VL_CAP_MAX
264};
265
266/* Max MTU */
267enum platform_config_mtu_capability_encoding {
268 MTU_CAP_256 = 1,
269 MTU_CAP_512 = 2,
270 MTU_CAP_1024 = 3,
271 MTU_CAP_2048 = 4,
272 MTU_CAP_4096 = 5,
273 MTU_CAP_8192 = 6,
274 MTU_CAP_10240 = 7
275};
276
277enum platform_config_local_max_timeout_encoding {
278 LOCAL_MAX_TIMEOUT_10_MS = 1,
279 LOCAL_MAX_TIMEOUT_100_MS,
280 LOCAL_MAX_TIMEOUT_1_S,
281 LOCAL_MAX_TIMEOUT_10_S,
282 LOCAL_MAX_TIMEOUT_100_S,
283 LOCAL_MAX_TIMEOUT_1000_S
284};
285
286#endif /*__PLATFORM_CONFIG_H*/
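
For reference, the METADATA_TABLE_FIELD_* shift/width defines at the top of this header suggest that each metadata word packs a field's starting bit offset in its low 15 bits and the field length in bits 16-31. A helper built on that reading might look like the sketch below; the packing itself is an inference from the defines, and parse_platform_config() in the driver is the authority:

/* Illustrative only: decode one metadata word under the assumed packing. */
static inline void get_field_meta(u32 meta, u32 *start_bit, u32 *len_bits)
{
	*start_bit = (meta >> METADATA_TABLE_FIELD_START_SHIFT) &
		     ((1u << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
	*len_bits  = (meta >> METADATA_TABLE_FIELD_LEN_SHIFT) &
		     ((1u << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
}
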
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
new file mode 100644
index 000000000000..df1fa56eaf85
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -0,0 +1,1687 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/err.h>
52#include <linux/vmalloc.h>
53#include <linux/hash.h>
54#include <linux/module.h>
55#include <linux/random.h>
56#include <linux/seq_file.h>
57
58#include "hfi.h"
59#include "qp.h"
60#include "trace.h"
61#include "sdma.h"
62
63#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
64#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
65
66static unsigned int hfi1_qp_table_size = 256;
67module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
68MODULE_PARM_DESC(qp_table_size, "QP table size");
69
70static void flush_tx_list(struct hfi1_qp *qp);
71static int iowait_sleep(
72 struct sdma_engine *sde,
73 struct iowait *wait,
74 struct sdma_txreq *stx,
75 unsigned seq);
76static void iowait_wakeup(struct iowait *wait, int reason);
77
78static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt,
79 struct qpn_map *map, unsigned off)
80{
81 return (map - qpt->map) * BITS_PER_PAGE + off;
82}
83
84/*
85 * Convert the AETH credit code into the number of credits.
86 */
87static const u16 credit_table[31] = {
88 0, /* 0 */
89 1, /* 1 */
90 2, /* 2 */
91 3, /* 3 */
92 4, /* 4 */
93 6, /* 5 */
94 8, /* 6 */
95 12, /* 7 */
96 16, /* 8 */
97 24, /* 9 */
98 32, /* A */
99 48, /* B */
100 64, /* C */
101 96, /* D */
102 128, /* E */
103 192, /* F */
104 256, /* 10 */
105 384, /* 11 */
106 512, /* 12 */
107 768, /* 13 */
108 1024, /* 14 */
109 1536, /* 15 */
110 2048, /* 16 */
111 3072, /* 17 */
112 4096, /* 18 */
113 6144, /* 19 */
114 8192, /* 1A */
115 12288, /* 1B */
116 16384, /* 1C */
117 24576, /* 1D */
118 32768 /* 1E */
119};
120
121static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map)
122{
123 unsigned long page = get_zeroed_page(GFP_KERNEL);
124
125 /*
126 * Free the page if someone raced with us installing it.
127 */
128
129 spin_lock(&qpt->lock);
130 if (map->page)
131 free_page(page);
132 else
133 map->page = (void *)page;
134 spin_unlock(&qpt->lock);
135}
136
137/*
138 * Allocate the next available QPN or
139 * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
140 */
141static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt,
142 enum ib_qp_type type, u8 port)
143{
144 u32 i, offset, max_scan, qpn;
145 struct qpn_map *map;
146 u32 ret;
147
148 if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
149 unsigned n;
150
151 ret = type == IB_QPT_GSI;
152 n = 1 << (ret + 2 * (port - 1));
153 spin_lock(&qpt->lock);
154 if (qpt->flags & n)
155 ret = -EINVAL;
156 else
157 qpt->flags |= n;
158 spin_unlock(&qpt->lock);
159 goto bail;
160 }
161
162 qpn = qpt->last + qpt->incr;
163 if (qpn >= QPN_MAX)
164 qpn = qpt->incr | ((qpt->last & 1) ^ 1);
165 /* offset carries bit 0 */
166 offset = qpn & BITS_PER_PAGE_MASK;
167 map = &qpt->map[qpn / BITS_PER_PAGE];
168 max_scan = qpt->nmaps - !offset;
169 for (i = 0;;) {
170 if (unlikely(!map->page)) {
171 get_map_page(qpt, map);
172 if (unlikely(!map->page))
173 break;
174 }
175 do {
176 if (!test_and_set_bit(offset, map->page)) {
177 qpt->last = qpn;
178 ret = qpn;
179 goto bail;
180 }
181 offset += qpt->incr;
182 /*
183 * This qpn might be bogus if offset >= BITS_PER_PAGE.
184 * That is OK. It gets re-assigned below
185 */
186 qpn = mk_qpn(qpt, map, offset);
187 } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
188 /*
189 * In order to keep the number of pages allocated to a
190	 * minimum, we scan all the existing pages before increasing
191 * the size of the bitmap table.
192 */
193 if (++i > max_scan) {
194 if (qpt->nmaps == QPNMAP_ENTRIES)
195 break;
196 map = &qpt->map[qpt->nmaps++];
197 /* start at incr with current bit 0 */
198 offset = qpt->incr | (offset & 1);
199 } else if (map < &qpt->map[qpt->nmaps]) {
200 ++map;
201 /* start at incr with current bit 0 */
202 offset = qpt->incr | (offset & 1);
203 } else {
204 map = &qpt->map[0];
205 /* wrap to first map page, invert bit 0 */
206 offset = qpt->incr | ((offset & 1) ^ 1);
207 }
208 /* there can be no bits at shift and below */
209 WARN_ON(offset & (dd->qos_shift - 1));
210 qpn = mk_qpn(qpt, map, offset);
211 }
212
213 ret = -ENOMEM;
214
215bail:
216 return ret;
217}
218
219static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn)
220{
221 struct qpn_map *map;
222
223 map = qpt->map + qpn / BITS_PER_PAGE;
224 if (map->page)
225 clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
226}
227
228/*
229 * Put the QP into the hash table.
230 * The hash table holds a reference to the QP.
231 */
232static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
233{
234 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
235 unsigned long flags;
236
237 atomic_inc(&qp->refcount);
238 spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
239
240 if (qp->ibqp.qp_num <= 1) {
241 rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp);
242 } else {
243 u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
244
245 qp->next = dev->qp_dev->qp_table[n];
246 rcu_assign_pointer(dev->qp_dev->qp_table[n], qp);
247 trace_hfi1_qpinsert(qp, n);
248 }
249
250 spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
251}
252
253/*
254 * Remove the QP from the table so it can't be found asynchronously by
255 * the receive interrupt routine.
256 */
257static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
258{
259 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
260 u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
261 unsigned long flags;
262 int removed = 1;
263
264 spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
265
266 if (rcu_dereference_protected(ibp->qp[0],
267 lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
268 RCU_INIT_POINTER(ibp->qp[0], NULL);
269 } else if (rcu_dereference_protected(ibp->qp[1],
270 lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
271 RCU_INIT_POINTER(ibp->qp[1], NULL);
272 } else {
273 struct hfi1_qp *q;
274 struct hfi1_qp __rcu **qpp;
275
276 removed = 0;
277 qpp = &dev->qp_dev->qp_table[n];
278 for (; (q = rcu_dereference_protected(*qpp,
279 lockdep_is_held(&dev->qp_dev->qpt_lock)))
280 != NULL;
281 qpp = &q->next)
282 if (q == qp) {
283 RCU_INIT_POINTER(*qpp,
284 rcu_dereference_protected(qp->next,
285 lockdep_is_held(&dev->qp_dev->qpt_lock)));
286 removed = 1;
287 trace_hfi1_qpremove(qp, n);
288 break;
289 }
290 }
291
292 spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
293 if (removed) {
294 synchronize_rcu();
295 if (atomic_dec_and_test(&qp->refcount))
296 wake_up(&qp->wait);
297 }
298}
299
300/**
301 * free_all_qps - check for QPs still in use
302 * @qpt: the QP table to empty
303 *
304 * There should not be any QPs still in use.
305 * Free memory for table.
306 */
307static unsigned free_all_qps(struct hfi1_devdata *dd)
308{
309 struct hfi1_ibdev *dev = &dd->verbs_dev;
310 unsigned long flags;
311 struct hfi1_qp *qp;
312 unsigned n, qp_inuse = 0;
313
314 for (n = 0; n < dd->num_pports; n++) {
315 struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
316
317 if (!hfi1_mcast_tree_empty(ibp))
318 qp_inuse++;
319 rcu_read_lock();
320 if (rcu_dereference(ibp->qp[0]))
321 qp_inuse++;
322 if (rcu_dereference(ibp->qp[1]))
323 qp_inuse++;
324 rcu_read_unlock();
325 }
326
327 if (!dev->qp_dev)
328 goto bail;
329 spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
330 for (n = 0; n < dev->qp_dev->qp_table_size; n++) {
331 qp = rcu_dereference_protected(dev->qp_dev->qp_table[n],
332 lockdep_is_held(&dev->qp_dev->qpt_lock));
333 RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL);
334
335 for (; qp; qp = rcu_dereference_protected(qp->next,
336 lockdep_is_held(&dev->qp_dev->qpt_lock)))
337 qp_inuse++;
338 }
339 spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
340 synchronize_rcu();
341bail:
342 return qp_inuse;
343}
344
345/**
346 * reset_qp - initialize the QP state to the reset state
347 * @qp: the QP to reset
348 * @type: the QP type
349 */
350static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type)
351{
352 qp->remote_qpn = 0;
353 qp->qkey = 0;
354 qp->qp_access_flags = 0;
355 iowait_init(
356 &qp->s_iowait,
357 1,
358 hfi1_do_send,
359 iowait_sleep,
360 iowait_wakeup);
361 qp->s_flags &= HFI1_S_SIGNAL_REQ_WR;
362 qp->s_hdrwords = 0;
363 qp->s_wqe = NULL;
364 qp->s_draining = 0;
365 qp->s_next_psn = 0;
366 qp->s_last_psn = 0;
367 qp->s_sending_psn = 0;
368 qp->s_sending_hpsn = 0;
369 qp->s_psn = 0;
370 qp->r_psn = 0;
371 qp->r_msn = 0;
372 if (type == IB_QPT_RC) {
373 qp->s_state = IB_OPCODE_RC_SEND_LAST;
374 qp->r_state = IB_OPCODE_RC_SEND_LAST;
375 } else {
376 qp->s_state = IB_OPCODE_UC_SEND_LAST;
377 qp->r_state = IB_OPCODE_UC_SEND_LAST;
378 }
379 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
380 qp->r_nak_state = 0;
381 qp->r_aflags = 0;
382 qp->r_flags = 0;
383 qp->s_head = 0;
384 qp->s_tail = 0;
385 qp->s_cur = 0;
386 qp->s_acked = 0;
387 qp->s_last = 0;
388 qp->s_ssn = 1;
389 qp->s_lsn = 0;
390 clear_ahg(qp);
391 qp->s_mig_state = IB_MIG_MIGRATED;
392 memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
393 qp->r_head_ack_queue = 0;
394 qp->s_tail_ack_queue = 0;
395 qp->s_num_rd_atomic = 0;
396 if (qp->r_rq.wq) {
397 qp->r_rq.wq->head = 0;
398 qp->r_rq.wq->tail = 0;
399 }
400 qp->r_sge.num_sge = 0;
401}
402
403static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends)
404{
405 unsigned n;
406
407 if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
408 hfi1_put_ss(&qp->s_rdma_read_sge);
409
410 hfi1_put_ss(&qp->r_sge);
411
412 if (clr_sends) {
413 while (qp->s_last != qp->s_head) {
414 struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
415 unsigned i;
416
417 for (i = 0; i < wqe->wr.num_sge; i++) {
418 struct hfi1_sge *sge = &wqe->sg_list[i];
419
420 hfi1_put_mr(sge->mr);
421 }
422 if (qp->ibqp.qp_type == IB_QPT_UD ||
423 qp->ibqp.qp_type == IB_QPT_SMI ||
424 qp->ibqp.qp_type == IB_QPT_GSI)
425 atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
426 if (++qp->s_last >= qp->s_size)
427 qp->s_last = 0;
428 }
429 if (qp->s_rdma_mr) {
430 hfi1_put_mr(qp->s_rdma_mr);
431 qp->s_rdma_mr = NULL;
432 }
433 }
434
435 if (qp->ibqp.qp_type != IB_QPT_RC)
436 return;
437
438 for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
439 struct hfi1_ack_entry *e = &qp->s_ack_queue[n];
440
441 if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
442 e->rdma_sge.mr) {
443 hfi1_put_mr(e->rdma_sge.mr);
444 e->rdma_sge.mr = NULL;
445 }
446 }
447}
448
449/**
450 * hfi1_error_qp - put a QP into the error state
451 * @qp: the QP to put into the error state
452 * @err: the receive completion error to signal if a RWQE is active
453 *
454 * Flushes both send and receive work queues.
455 * Returns true if last WQE event should be generated.
456 * The QP r_lock and s_lock should be held and interrupts disabled.
457 * If we are already in error state, just return.
458 */
459int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err)
460{
461 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
462 struct ib_wc wc;
463 int ret = 0;
464
465 if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
466 goto bail;
467
468 qp->state = IB_QPS_ERR;
469
470 if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
471 qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
472 del_timer(&qp->s_timer);
473 }
474
475 if (qp->s_flags & HFI1_S_ANY_WAIT_SEND)
476 qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND;
477
478 write_seqlock(&dev->iowait_lock);
479 if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) {
480 qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
481 list_del_init(&qp->s_iowait.list);
482 if (atomic_dec_and_test(&qp->refcount))
483 wake_up(&qp->wait);
484 }
485 write_sequnlock(&dev->iowait_lock);
486
487 if (!(qp->s_flags & HFI1_S_BUSY)) {
488 qp->s_hdrwords = 0;
489 if (qp->s_rdma_mr) {
490 hfi1_put_mr(qp->s_rdma_mr);
491 qp->s_rdma_mr = NULL;
492 }
493 flush_tx_list(qp);
494 }
495
496 /* Schedule the sending tasklet to drain the send work queue. */
497 if (qp->s_last != qp->s_head)
498 hfi1_schedule_send(qp);
499
500 clear_mr_refs(qp, 0);
501
502 memset(&wc, 0, sizeof(wc));
503 wc.qp = &qp->ibqp;
504 wc.opcode = IB_WC_RECV;
505
506 if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) {
507 wc.wr_id = qp->r_wr_id;
508 wc.status = err;
509 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
510 }
511 wc.status = IB_WC_WR_FLUSH_ERR;
512
513 if (qp->r_rq.wq) {
514 struct hfi1_rwq *wq;
515 u32 head;
516 u32 tail;
517
518 spin_lock(&qp->r_rq.lock);
519
520 /* sanity check pointers before trusting them */
521 wq = qp->r_rq.wq;
522 head = wq->head;
523 if (head >= qp->r_rq.size)
524 head = 0;
525 tail = wq->tail;
526 if (tail >= qp->r_rq.size)
527 tail = 0;
528 while (tail != head) {
529 wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
530 if (++tail >= qp->r_rq.size)
531 tail = 0;
532 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
533 }
534 wq->tail = tail;
535
536 spin_unlock(&qp->r_rq.lock);
537 } else if (qp->ibqp.event_handler)
538 ret = 1;
539
540bail:
541 return ret;
542}
543
544static void flush_tx_list(struct hfi1_qp *qp)
545{
546 while (!list_empty(&qp->s_iowait.tx_head)) {
547 struct sdma_txreq *tx;
548
549 tx = list_first_entry(
550 &qp->s_iowait.tx_head,
551 struct sdma_txreq,
552 list);
553 list_del_init(&tx->list);
554 hfi1_put_txreq(
555 container_of(tx, struct verbs_txreq, txreq));
556 }
557}
558
559static void flush_iowait(struct hfi1_qp *qp)
560{
561 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
562 unsigned long flags;
563
564 write_seqlock_irqsave(&dev->iowait_lock, flags);
565 if (!list_empty(&qp->s_iowait.list)) {
566 list_del_init(&qp->s_iowait.list);
567 if (atomic_dec_and_test(&qp->refcount))
568 wake_up(&qp->wait);
569 }
570 write_sequnlock_irqrestore(&dev->iowait_lock, flags);
571}
572
573static inline int opa_mtu_enum_to_int(int mtu)
574{
575 switch (mtu) {
576 case OPA_MTU_8192: return 8192;
577 case OPA_MTU_10240: return 10240;
578 default: return -1;
579 }
580}
581
582/**
583 * This function is what we would push to the core layer if we wanted to be a
584 * "first class citizen". Instead we hide this here and rely on Verbs ULPs
585 * to blindly pass the MTU enum value from the PathRecord to us.
586 *
587 * The actual flag used to determine "8k MTU" will change and is currently
588 * unknown.
589 */
590static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
591{
592 int val = opa_mtu_enum_to_int((int)mtu);
593
594 if (val > 0)
595 return val;
596 return ib_mtu_enum_to_int(mtu);
597}
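
A minimal caller-side sketch of the helper above (ibdev and path_mtu_enum are
hypothetical placeholders): the OPA 8K/10K encodings map directly to a byte
count, and anything else falls through to ib_mtu_enum_to_int(), which returns
a negative value for encodings it does not recognize.

	int bytes = verbs_mtu_enum_to_int(ibdev, path_mtu_enum);

	if (bytes < 0)
		return -EINVAL;	/* neither an OPA nor a standard IB MTU code */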
598
599
600/**
601 * hfi1_modify_qp - modify the attributes of a queue pair
602 * @ibqp: the queue pair whose attributes we're modifying
603 * @attr: the new attributes
604 * @attr_mask: the mask of attributes to modify
605 * @udata: user data for libibverbs.so
606 *
607 * Returns 0 on success, otherwise returns an errno.
608 */
609int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
610 int attr_mask, struct ib_udata *udata)
611{
612 struct hfi1_ibdev *dev = to_idev(ibqp->device);
613 struct hfi1_qp *qp = to_iqp(ibqp);
614 enum ib_qp_state cur_state, new_state;
615 struct ib_event ev;
616 int lastwqe = 0;
617 int mig = 0;
618 int ret;
619 u32 pmtu = 0; /* for gcc warning only */
620 struct hfi1_devdata *dd;
621
622 spin_lock_irq(&qp->r_lock);
623 spin_lock(&qp->s_lock);
624
625 cur_state = attr_mask & IB_QP_CUR_STATE ?
626 attr->cur_qp_state : qp->state;
627 new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
628
629 if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
630 attr_mask, IB_LINK_LAYER_UNSPECIFIED))
631 goto inval;
632
633 if (attr_mask & IB_QP_AV) {
634 if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
635 goto inval;
636 if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr))
637 goto inval;
638 }
639
640 if (attr_mask & IB_QP_ALT_PATH) {
641 if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
642 goto inval;
643 if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
644 goto inval;
645 if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
646 goto inval;
647 }
648
649 if (attr_mask & IB_QP_PKEY_INDEX)
650 if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
651 goto inval;
652
653 if (attr_mask & IB_QP_MIN_RNR_TIMER)
654 if (attr->min_rnr_timer > 31)
655 goto inval;
656
657 if (attr_mask & IB_QP_PORT)
658 if (qp->ibqp.qp_type == IB_QPT_SMI ||
659 qp->ibqp.qp_type == IB_QPT_GSI ||
660 attr->port_num == 0 ||
661 attr->port_num > ibqp->device->phys_port_cnt)
662 goto inval;
663
664 if (attr_mask & IB_QP_DEST_QPN)
665 if (attr->dest_qp_num > HFI1_QPN_MASK)
666 goto inval;
667
668 if (attr_mask & IB_QP_RETRY_CNT)
669 if (attr->retry_cnt > 7)
670 goto inval;
671
672 if (attr_mask & IB_QP_RNR_RETRY)
673 if (attr->rnr_retry > 7)
674 goto inval;
675
676 /*
677	 * Don't allow invalid path_mtu values. It is OK to set it greater
678	 * than the active mtu (or even the max_cap, if we have tuned that
679	 * to a small mtu). We'll set qp->path_mtu to the lesser of the
680	 * requested attribute mtu and the active mtu, for packetizing
681	 * messages.
682 * Note that the QP port has to be set in INIT and MTU in RTR.
683 */
684 if (attr_mask & IB_QP_PATH_MTU) {
685 int mtu, pidx = qp->port_num - 1;
686
687 dd = dd_from_dev(dev);
688 mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu);
689 if (mtu == -1)
690 goto inval;
691
692 if (mtu > dd->pport[pidx].ibmtu)
693 pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
694 else
695 pmtu = attr->path_mtu;
696 }
697
698 if (attr_mask & IB_QP_PATH_MIG_STATE) {
699 if (attr->path_mig_state == IB_MIG_REARM) {
700 if (qp->s_mig_state == IB_MIG_ARMED)
701 goto inval;
702 if (new_state != IB_QPS_RTS)
703 goto inval;
704 } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
705 if (qp->s_mig_state == IB_MIG_REARM)
706 goto inval;
707 if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
708 goto inval;
709 if (qp->s_mig_state == IB_MIG_ARMED)
710 mig = 1;
711 } else
712 goto inval;
713 }
714
715 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
716 if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC)
717 goto inval;
718
719 switch (new_state) {
720 case IB_QPS_RESET:
721 if (qp->state != IB_QPS_RESET) {
722 qp->state = IB_QPS_RESET;
723 flush_iowait(qp);
724 qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
725 spin_unlock(&qp->s_lock);
726 spin_unlock_irq(&qp->r_lock);
727 /* Stop the sending work queue and retry timer */
728 cancel_work_sync(&qp->s_iowait.iowork);
729 del_timer_sync(&qp->s_timer);
730 iowait_sdma_drain(&qp->s_iowait);
731 flush_tx_list(qp);
732 remove_qp(dev, qp);
733 wait_event(qp->wait, !atomic_read(&qp->refcount));
734 spin_lock_irq(&qp->r_lock);
735 spin_lock(&qp->s_lock);
736 clear_mr_refs(qp, 1);
737 clear_ahg(qp);
738 reset_qp(qp, ibqp->qp_type);
739 }
740 break;
741
742 case IB_QPS_RTR:
743 /* Allow event to re-trigger if QP set to RTR more than once */
744 qp->r_flags &= ~HFI1_R_COMM_EST;
745 qp->state = new_state;
746 break;
747
748 case IB_QPS_SQD:
749 qp->s_draining = qp->s_last != qp->s_cur;
750 qp->state = new_state;
751 break;
752
753 case IB_QPS_SQE:
754 if (qp->ibqp.qp_type == IB_QPT_RC)
755 goto inval;
756 qp->state = new_state;
757 break;
758
759 case IB_QPS_ERR:
760 lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
761 break;
762
763 default:
764 qp->state = new_state;
765 break;
766 }
767
768 if (attr_mask & IB_QP_PKEY_INDEX)
769 qp->s_pkey_index = attr->pkey_index;
770
771 if (attr_mask & IB_QP_PORT)
772 qp->port_num = attr->port_num;
773
774 if (attr_mask & IB_QP_DEST_QPN)
775 qp->remote_qpn = attr->dest_qp_num;
776
777 if (attr_mask & IB_QP_SQ_PSN) {
778 qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK;
779 qp->s_psn = qp->s_next_psn;
780 qp->s_sending_psn = qp->s_next_psn;
781 qp->s_last_psn = qp->s_next_psn - 1;
782 qp->s_sending_hpsn = qp->s_last_psn;
783 }
784
785 if (attr_mask & IB_QP_RQ_PSN)
786 qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK;
787
788 if (attr_mask & IB_QP_ACCESS_FLAGS)
789 qp->qp_access_flags = attr->qp_access_flags;
790
791 if (attr_mask & IB_QP_AV) {
792 qp->remote_ah_attr = attr->ah_attr;
793 qp->s_srate = attr->ah_attr.static_rate;
794 qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
795 }
796
797 if (attr_mask & IB_QP_ALT_PATH) {
798 qp->alt_ah_attr = attr->alt_ah_attr;
799 qp->s_alt_pkey_index = attr->alt_pkey_index;
800 }
801
802 if (attr_mask & IB_QP_PATH_MIG_STATE) {
803 qp->s_mig_state = attr->path_mig_state;
804 if (mig) {
805 qp->remote_ah_attr = qp->alt_ah_attr;
806 qp->port_num = qp->alt_ah_attr.port_num;
807 qp->s_pkey_index = qp->s_alt_pkey_index;
808 qp->s_flags |= HFI1_S_AHG_CLEAR;
809 }
810 }
811
812 if (attr_mask & IB_QP_PATH_MTU) {
813 struct hfi1_ibport *ibp;
814 u8 sc, vl;
815 u32 mtu;
816
817 dd = dd_from_dev(dev);
818 ibp = &dd->pport[qp->port_num - 1].ibport_data;
819
820 sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
821 vl = sc_to_vlt(dd, sc);
822
823 mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu);
824 if (vl < PER_VL_SEND_CONTEXTS)
825 mtu = min_t(u32, mtu, dd->vld[vl].mtu);
826 pmtu = mtu_to_enum(mtu, OPA_MTU_8192);
827
828 qp->path_mtu = pmtu;
829 qp->pmtu = mtu;
830 }
831
832 if (attr_mask & IB_QP_RETRY_CNT) {
833 qp->s_retry_cnt = attr->retry_cnt;
834 qp->s_retry = attr->retry_cnt;
835 }
836
837 if (attr_mask & IB_QP_RNR_RETRY) {
838 qp->s_rnr_retry_cnt = attr->rnr_retry;
839 qp->s_rnr_retry = attr->rnr_retry;
840 }
841
842 if (attr_mask & IB_QP_MIN_RNR_TIMER)
843 qp->r_min_rnr_timer = attr->min_rnr_timer;
844
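	/*
	 * For reference: attr->timeout is the standard IB local ACK timeout
	 * exponent, i.e. 4.096 usec * 2^timeout.  The expression below
	 * computes that value in microseconds (4096 ns * 2^timeout / 1000)
	 * before usecs_to_jiffies() rounds it to jiffies; e.g. timeout = 14
	 * gives roughly 67 msec.
	 */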
845 if (attr_mask & IB_QP_TIMEOUT) {
846 qp->timeout = attr->timeout;
847 qp->timeout_jiffies =
848 usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
849 1000UL);
850 }
851
852 if (attr_mask & IB_QP_QKEY)
853 qp->qkey = attr->qkey;
854
855 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
856 qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
857
858 if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
859 qp->s_max_rd_atomic = attr->max_rd_atomic;
860
861 spin_unlock(&qp->s_lock);
862 spin_unlock_irq(&qp->r_lock);
863
864 if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
865 insert_qp(dev, qp);
866
867 if (lastwqe) {
868 ev.device = qp->ibqp.device;
869 ev.element.qp = &qp->ibqp;
870 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
871 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
872 }
873 if (mig) {
874 ev.device = qp->ibqp.device;
875 ev.element.qp = &qp->ibqp;
876 ev.event = IB_EVENT_PATH_MIG;
877 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
878 }
879 ret = 0;
880 goto bail;
881
882inval:
883 spin_unlock(&qp->s_lock);
884 spin_unlock_irq(&qp->r_lock);
885 ret = -EINVAL;
886
887bail:
888 return ret;
889}
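
For context, this entry point is reached through the core ib_modify_qp() verb.
Below is a minimal consumer-side sketch of an RC INIT->RTR transition;
remote_ah_attr, remote_qpn and start_psn are hypothetical placeholders, and
the attribute mask mirrors the checks enforced above.

	struct ib_qp_attr attr = {
		.qp_state		= IB_QPS_RTR,
		.ah_attr		= remote_ah_attr,
		.path_mtu		= IB_MTU_4096,
		.dest_qp_num		= remote_qpn,
		.rq_psn			= start_psn,
		.max_dest_rd_atomic	= 1,
		.min_rnr_timer		= 12,
	};
	int ret = ib_modify_qp(qp, &attr,
			       IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
			       IB_QP_DEST_QPN | IB_QP_RQ_PSN |
			       IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);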
890
891int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
892 int attr_mask, struct ib_qp_init_attr *init_attr)
893{
894 struct hfi1_qp *qp = to_iqp(ibqp);
895
896 attr->qp_state = qp->state;
897 attr->cur_qp_state = attr->qp_state;
898 attr->path_mtu = qp->path_mtu;
899 attr->path_mig_state = qp->s_mig_state;
900 attr->qkey = qp->qkey;
901 attr->rq_psn = mask_psn(qp->r_psn);
902 attr->sq_psn = mask_psn(qp->s_next_psn);
903 attr->dest_qp_num = qp->remote_qpn;
904 attr->qp_access_flags = qp->qp_access_flags;
905 attr->cap.max_send_wr = qp->s_size - 1;
906 attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
907 attr->cap.max_send_sge = qp->s_max_sge;
908 attr->cap.max_recv_sge = qp->r_rq.max_sge;
909 attr->cap.max_inline_data = 0;
910 attr->ah_attr = qp->remote_ah_attr;
911 attr->alt_ah_attr = qp->alt_ah_attr;
912 attr->pkey_index = qp->s_pkey_index;
913 attr->alt_pkey_index = qp->s_alt_pkey_index;
914 attr->en_sqd_async_notify = 0;
915 attr->sq_draining = qp->s_draining;
916 attr->max_rd_atomic = qp->s_max_rd_atomic;
917 attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
918 attr->min_rnr_timer = qp->r_min_rnr_timer;
919 attr->port_num = qp->port_num;
920 attr->timeout = qp->timeout;
921 attr->retry_cnt = qp->s_retry_cnt;
922 attr->rnr_retry = qp->s_rnr_retry_cnt;
923 attr->alt_port_num = qp->alt_ah_attr.port_num;
924 attr->alt_timeout = qp->alt_timeout;
925
926 init_attr->event_handler = qp->ibqp.event_handler;
927 init_attr->qp_context = qp->ibqp.qp_context;
928 init_attr->send_cq = qp->ibqp.send_cq;
929 init_attr->recv_cq = qp->ibqp.recv_cq;
930 init_attr->srq = qp->ibqp.srq;
931 init_attr->cap = attr->cap;
932 if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR)
933 init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
934 else
935 init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
936 init_attr->qp_type = qp->ibqp.qp_type;
937 init_attr->port_num = qp->port_num;
938 return 0;
939}
940
941/**
942 * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
943 * @qp: the queue pair to compute the AETH for
944 *
945 * Returns the AETH.
946 */
947__be32 hfi1_compute_aeth(struct hfi1_qp *qp)
948{
949 u32 aeth = qp->r_msn & HFI1_MSN_MASK;
950
951 if (qp->ibqp.srq) {
952 /*
953 * Shared receive queues don't generate credits.
954 * Set the credit field to the invalid value.
955 */
956 aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
957 } else {
958 u32 min, max, x;
959 u32 credits;
960 struct hfi1_rwq *wq = qp->r_rq.wq;
961 u32 head;
962 u32 tail;
963
964 /* sanity check pointers before trusting them */
965 head = wq->head;
966 if (head >= qp->r_rq.size)
967 head = 0;
968 tail = wq->tail;
969 if (tail >= qp->r_rq.size)
970 tail = 0;
971 /*
972 * Compute the number of credits available (RWQEs).
973 * There is a small chance that the pair of reads are
974 * not atomic, which is OK, since the fuzziness is
975 * resolved as further ACKs go out.
976 */
977 credits = head - tail;
978 if ((int)credits < 0)
979 credits += qp->r_rq.size;
980 /*
981 * Binary search the credit table to find the code to
982 * use.
983 */
984 min = 0;
985 max = 31;
986 for (;;) {
987 x = (min + max) / 2;
988 if (credit_table[x] == credits)
989 break;
990 if (credit_table[x] > credits)
991 max = x;
992 else if (min == x)
993 break;
994 else
995 min = x;
996 }
997 aeth |= x << HFI1_AETH_CREDIT_SHIFT;
998 }
999 return cpu_to_be32(aeth);
1000}
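
/*
 * A note on the binary search above (a sketch of the intent, assuming
 * credit_table[] is monotonically increasing): the loop exits with an
 * index x such that credit_table[x] <= the number of free RWQEs (an
 * exact match when one exists), so the 5-bit credit code placed in the
 * AETH never advertises more receive entries than are actually posted.
 */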
1001
1002/**
1003 * hfi1_create_qp - create a queue pair for a device
1004 * @ibpd: the protection domain whose device we create the queue pair for
1005 * @init_attr: the attributes of the queue pair
1006 * @udata: user data for libibverbs.so
1007 *
1008 * Returns the queue pair on success, otherwise returns an errno.
1009 *
1010 * Called by the ib_create_qp() core verbs function.
1011 */
1012struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
1013 struct ib_qp_init_attr *init_attr,
1014 struct ib_udata *udata)
1015{
1016 struct hfi1_qp *qp;
1017 int err;
1018 struct hfi1_swqe *swq = NULL;
1019 struct hfi1_ibdev *dev;
1020 struct hfi1_devdata *dd;
1021 size_t sz;
1022 size_t sg_list_sz;
1023 struct ib_qp *ret;
1024
1025 if (init_attr->cap.max_send_sge > hfi1_max_sges ||
1026 init_attr->cap.max_send_wr > hfi1_max_qp_wrs ||
1027 init_attr->create_flags) {
1028 ret = ERR_PTR(-EINVAL);
1029 goto bail;
1030 }
1031
1032 /* Check receive queue parameters if no SRQ is specified. */
1033 if (!init_attr->srq) {
1034 if (init_attr->cap.max_recv_sge > hfi1_max_sges ||
1035 init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) {
1036 ret = ERR_PTR(-EINVAL);
1037 goto bail;
1038 }
1039 if (init_attr->cap.max_send_sge +
1040 init_attr->cap.max_send_wr +
1041 init_attr->cap.max_recv_sge +
1042 init_attr->cap.max_recv_wr == 0) {
1043 ret = ERR_PTR(-EINVAL);
1044 goto bail;
1045 }
1046 }
1047
1048 switch (init_attr->qp_type) {
1049 case IB_QPT_SMI:
1050 case IB_QPT_GSI:
1051 if (init_attr->port_num == 0 ||
1052 init_attr->port_num > ibpd->device->phys_port_cnt) {
1053 ret = ERR_PTR(-EINVAL);
1054 goto bail;
1055 }
1056 case IB_QPT_UC:
1057 case IB_QPT_RC:
1058 case IB_QPT_UD:
1059 sz = sizeof(struct hfi1_sge) *
1060 init_attr->cap.max_send_sge +
1061 sizeof(struct hfi1_swqe);
1062 swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
1063 if (swq == NULL) {
1064 ret = ERR_PTR(-ENOMEM);
1065 goto bail;
1066 }
1067 sz = sizeof(*qp);
1068 sg_list_sz = 0;
1069 if (init_attr->srq) {
1070 struct hfi1_srq *srq = to_isrq(init_attr->srq);
1071
1072 if (srq->rq.max_sge > 1)
1073 sg_list_sz = sizeof(*qp->r_sg_list) *
1074 (srq->rq.max_sge - 1);
1075 } else if (init_attr->cap.max_recv_sge > 1)
1076 sg_list_sz = sizeof(*qp->r_sg_list) *
1077 (init_attr->cap.max_recv_sge - 1);
1078 qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
1079 if (!qp) {
1080 ret = ERR_PTR(-ENOMEM);
1081 goto bail_swq;
1082 }
1083 RCU_INIT_POINTER(qp->next, NULL);
1084 qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
1085 if (!qp->s_hdr) {
1086 ret = ERR_PTR(-ENOMEM);
1087 goto bail_qp;
1088 }
1089 qp->timeout_jiffies =
1090 usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
1091 1000UL);
1092 if (init_attr->srq)
1093 sz = 0;
1094 else {
1095 qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
1096 qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
1097 sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
1098 sizeof(struct hfi1_rwqe);
1099 qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) +
1100 qp->r_rq.size * sz);
1101 if (!qp->r_rq.wq) {
1102 ret = ERR_PTR(-ENOMEM);
1103 goto bail_qp;
1104 }
1105 }
1106
1107 /*
1108 * ib_create_qp() will initialize qp->ibqp
1109 * except for qp->ibqp.qp_num.
1110 */
1111 spin_lock_init(&qp->r_lock);
1112 spin_lock_init(&qp->s_lock);
1113 spin_lock_init(&qp->r_rq.lock);
1114 atomic_set(&qp->refcount, 0);
1115 init_waitqueue_head(&qp->wait);
1116 init_timer(&qp->s_timer);
1117 qp->s_timer.data = (unsigned long)qp;
1118 INIT_LIST_HEAD(&qp->rspwait);
1119 qp->state = IB_QPS_RESET;
1120 qp->s_wq = swq;
1121 qp->s_size = init_attr->cap.max_send_wr + 1;
1122 qp->s_max_sge = init_attr->cap.max_send_sge;
1123 if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
1124 qp->s_flags = HFI1_S_SIGNAL_REQ_WR;
1125 dev = to_idev(ibpd->device);
1126 dd = dd_from_dev(dev);
1127 err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type,
1128 init_attr->port_num);
1129 if (err < 0) {
1130 ret = ERR_PTR(err);
1131 vfree(qp->r_rq.wq);
1132 goto bail_qp;
1133 }
1134 qp->ibqp.qp_num = err;
1135 qp->port_num = init_attr->port_num;
1136 reset_qp(qp, init_attr->qp_type);
1137
1138 break;
1139
1140 default:
1141 /* Don't support raw QPs */
1142 ret = ERR_PTR(-ENOSYS);
1143 goto bail;
1144 }
1145
1146 init_attr->cap.max_inline_data = 0;
1147
1148 /*
1149 * Return the address of the RWQ as the offset to mmap.
1150 * See hfi1_mmap() for details.
1151 */
1152 if (udata && udata->outlen >= sizeof(__u64)) {
1153 if (!qp->r_rq.wq) {
1154 __u64 offset = 0;
1155
1156 err = ib_copy_to_udata(udata, &offset,
1157 sizeof(offset));
1158 if (err) {
1159 ret = ERR_PTR(err);
1160 goto bail_ip;
1161 }
1162 } else {
1163 u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz;
1164
1165 qp->ip = hfi1_create_mmap_info(dev, s,
1166 ibpd->uobject->context,
1167 qp->r_rq.wq);
1168 if (!qp->ip) {
1169 ret = ERR_PTR(-ENOMEM);
1170 goto bail_ip;
1171 }
1172
1173 err = ib_copy_to_udata(udata, &(qp->ip->offset),
1174 sizeof(qp->ip->offset));
1175 if (err) {
1176 ret = ERR_PTR(err);
1177 goto bail_ip;
1178 }
1179 }
1180 }
1181
1182 spin_lock(&dev->n_qps_lock);
1183 if (dev->n_qps_allocated == hfi1_max_qps) {
1184 spin_unlock(&dev->n_qps_lock);
1185 ret = ERR_PTR(-ENOMEM);
1186 goto bail_ip;
1187 }
1188
1189 dev->n_qps_allocated++;
1190 spin_unlock(&dev->n_qps_lock);
1191
1192 if (qp->ip) {
1193 spin_lock_irq(&dev->pending_lock);
1194 list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
1195 spin_unlock_irq(&dev->pending_lock);
1196 }
1197
1198 ret = &qp->ibqp;
1199
1200 /*
1201	 * We have our QP and it's good; now keep track of what types of opcodes
1202	 * can be processed on this QP. We do this by recording what the
1203	 * 3 high order bits of the opcode are.
1204 */
1205 switch (init_attr->qp_type) {
1206 case IB_QPT_SMI:
1207 case IB_QPT_GSI:
1208 case IB_QPT_UD:
1209 qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK;
1210 break;
1211 case IB_QPT_RC:
1212 qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK;
1213 break;
1214 case IB_QPT_UC:
1215 qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK;
1216 break;
1217 default:
1218 ret = ERR_PTR(-EINVAL);
1219 goto bail_ip;
1220 }
1221
1222 goto bail;
1223
1224bail_ip:
1225 if (qp->ip)
1226 kref_put(&qp->ip->ref, hfi1_release_mmap_info);
1227 else
1228 vfree(qp->r_rq.wq);
1229 free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
1230bail_qp:
1231 kfree(qp->s_hdr);
1232 kfree(qp);
1233bail_swq:
1234 vfree(swq);
1235bail:
1236 return ret;
1237}
1238
1239/**
1240 * hfi1_destroy_qp - destroy a queue pair
1241 * @ibqp: the queue pair to destroy
1242 *
1243 * Returns 0 on success.
1244 *
1245 * Note that this can be called while the QP is actively sending or
1246 * receiving!
1247 */
1248int hfi1_destroy_qp(struct ib_qp *ibqp)
1249{
1250 struct hfi1_qp *qp = to_iqp(ibqp);
1251 struct hfi1_ibdev *dev = to_idev(ibqp->device);
1252
1253 /* Make sure HW and driver activity is stopped. */
1254 spin_lock_irq(&qp->r_lock);
1255 spin_lock(&qp->s_lock);
1256 if (qp->state != IB_QPS_RESET) {
1257 qp->state = IB_QPS_RESET;
1258 flush_iowait(qp);
1259 qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
1260 spin_unlock(&qp->s_lock);
1261 spin_unlock_irq(&qp->r_lock);
1262 cancel_work_sync(&qp->s_iowait.iowork);
1263 del_timer_sync(&qp->s_timer);
1264 iowait_sdma_drain(&qp->s_iowait);
1265 flush_tx_list(qp);
1266 remove_qp(dev, qp);
1267 wait_event(qp->wait, !atomic_read(&qp->refcount));
1268 spin_lock_irq(&qp->r_lock);
1269 spin_lock(&qp->s_lock);
1270 clear_mr_refs(qp, 1);
1271 clear_ahg(qp);
1272 }
1273 spin_unlock(&qp->s_lock);
1274 spin_unlock_irq(&qp->r_lock);
1275
1276	/* all users cleaned up, mark the QPN available */
1277 free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
1278 spin_lock(&dev->n_qps_lock);
1279 dev->n_qps_allocated--;
1280 spin_unlock(&dev->n_qps_lock);
1281
1282 if (qp->ip)
1283 kref_put(&qp->ip->ref, hfi1_release_mmap_info);
1284 else
1285 vfree(qp->r_rq.wq);
1286 vfree(qp->s_wq);
1287 kfree(qp->s_hdr);
1288 kfree(qp);
1289 return 0;
1290}
1291
1292/**
1293 * init_qpn_table - initialize the QP number table for a device
1294 * @qpt: the QPN table
1295 */
1296static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt)
1297{
1298 u32 offset, qpn, i;
1299 struct qpn_map *map;
1300 int ret = 0;
1301
1302 spin_lock_init(&qpt->lock);
1303
1304 qpt->last = 0;
1305 qpt->incr = 1 << dd->qos_shift;
1306
1307	/* ensure we don't assign QPs from the KDETH 64K window */
1308 qpn = kdeth_qp << 16;
1309 qpt->nmaps = qpn / BITS_PER_PAGE;
1310 /* This should always be zero */
1311 offset = qpn & BITS_PER_PAGE_MASK;
1312 map = &qpt->map[qpt->nmaps];
1313 dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n",
1314 qpn, qpn + 65535);
1315 for (i = 0; i < 65536; i++) {
1316 if (!map->page) {
1317 get_map_page(qpt, map);
1318 if (!map->page) {
1319 ret = -ENOMEM;
1320 break;
1321 }
1322 }
1323 set_bit(offset, map->page);
1324 offset++;
1325 if (offset == BITS_PER_PAGE) {
1326 /* next page */
1327 qpt->nmaps++;
1328 map++;
1329 offset = 0;
1330 }
1331 }
1332 return ret;
1333}
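
/*
 * Sketch of the reservation above (assuming a 4 KiB PAGE_SIZE, so that
 * BITS_PER_PAGE is 32768): the KDETH window starts at kdeth_qp << 16, a
 * multiple of 65536, so "offset" really is zero and the 65536 set_bit()
 * calls span exactly two map pages.
 */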
1334
1335/**
1336 * free_qpn_table - free the QP number table for a device
1337 * @qpt: the QPN table
1338 */
1339static void free_qpn_table(struct hfi1_qpn_table *qpt)
1340{
1341 int i;
1342
1343 for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
1344 free_page((unsigned long) qpt->map[i].page);
1345}
1346
1347/**
1348 * hfi1_get_credit - process the credit field of an incoming AETH
1349 * @qp: the qp the AETH applies to
1350 * @aeth: the Acknowledge Extended Transport Header
1351 *
1352 * The QP s_lock should be held.
1353 */
1354void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth)
1355{
1356 u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
1357
1358 /*
1359 * If the credit is invalid, we can send
1360 * as many packets as we like. Otherwise, we have to
1361 * honor the credit field.
1362 */
1363 if (credit == HFI1_AETH_CREDIT_INVAL) {
1364 if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
1365 qp->s_flags |= HFI1_S_UNLIMITED_CREDIT;
1366 if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
1367 qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
1368 hfi1_schedule_send(qp);
1369 }
1370 }
1371 } else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
1372 /* Compute new LSN (i.e., MSN + credit) */
1373 credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
1374 if (cmp_msn(credit, qp->s_lsn) > 0) {
1375 qp->s_lsn = credit;
1376 if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
1377 qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
1378 hfi1_schedule_send(qp);
1379 }
1380 }
1381 }
1382}
1383
1384void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag)
1385{
1386 unsigned long flags;
1387
1388 spin_lock_irqsave(&qp->s_lock, flags);
1389 if (qp->s_flags & flag) {
1390 qp->s_flags &= ~flag;
1391 trace_hfi1_qpwakeup(qp, flag);
1392 hfi1_schedule_send(qp);
1393 }
1394 spin_unlock_irqrestore(&qp->s_lock, flags);
1395 /* Notify hfi1_destroy_qp() if it is waiting. */
1396 if (atomic_dec_and_test(&qp->refcount))
1397 wake_up(&qp->wait);
1398}
1399
1400static int iowait_sleep(
1401 struct sdma_engine *sde,
1402 struct iowait *wait,
1403 struct sdma_txreq *stx,
1404 unsigned seq)
1405{
1406 struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
1407 struct hfi1_qp *qp;
1408 unsigned long flags;
1409 int ret = 0;
1410 struct hfi1_ibdev *dev;
1411
1412 qp = tx->qp;
1413
1414 spin_lock_irqsave(&qp->s_lock, flags);
1415 if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
1416
1417 /*
1418 * If we couldn't queue the DMA request, save the info
1419 * and try again later rather than destroying the
1420 * buffer and undoing the side effects of the copy.
1421 */
1422 /* Make a common routine? */
1423 dev = &sde->dd->verbs_dev;
1424 list_add_tail(&stx->list, &wait->tx_head);
1425 write_seqlock(&dev->iowait_lock);
1426 if (sdma_progress(sde, seq, stx))
1427 goto eagain;
1428 if (list_empty(&qp->s_iowait.list)) {
1429 struct hfi1_ibport *ibp =
1430 to_iport(qp->ibqp.device, qp->port_num);
1431
1432 ibp->n_dmawait++;
1433 qp->s_flags |= HFI1_S_WAIT_DMA_DESC;
1434 list_add_tail(&qp->s_iowait.list, &sde->dmawait);
1435 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC);
1436 atomic_inc(&qp->refcount);
1437 }
1438 write_sequnlock(&dev->iowait_lock);
1439 qp->s_flags &= ~HFI1_S_BUSY;
1440 spin_unlock_irqrestore(&qp->s_lock, flags);
1441 ret = -EBUSY;
1442 } else {
1443 spin_unlock_irqrestore(&qp->s_lock, flags);
1444 hfi1_put_txreq(tx);
1445 }
1446 return ret;
1447eagain:
1448 write_sequnlock(&dev->iowait_lock);
1449 spin_unlock_irqrestore(&qp->s_lock, flags);
1450 list_del_init(&stx->list);
1451 return -EAGAIN;
1452}
1453
1454static void iowait_wakeup(struct iowait *wait, int reason)
1455{
1456 struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
1457
1458 WARN_ON(reason != SDMA_AVAIL_REASON);
1459 hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC);
1460}
1461
1462int hfi1_qp_init(struct hfi1_ibdev *dev)
1463{
1464 struct hfi1_devdata *dd = dd_from_dev(dev);
1465 int i;
1466 int ret = -ENOMEM;
1467
1468 /* allocate parent object */
1469 dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL);
1470 if (!dev->qp_dev)
1471 goto nomem;
1472 /* allocate hash table */
1473 dev->qp_dev->qp_table_size = hfi1_qp_table_size;
1474 dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size);
1475 dev->qp_dev->qp_table =
1476 kmalloc(dev->qp_dev->qp_table_size *
1477 sizeof(*dev->qp_dev->qp_table),
1478 GFP_KERNEL);
1479 if (!dev->qp_dev->qp_table)
1480 goto nomem;
1481 for (i = 0; i < dev->qp_dev->qp_table_size; i++)
1482 RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL);
1483 spin_lock_init(&dev->qp_dev->qpt_lock);
1484 /* initialize qpn map */
1485 ret = init_qpn_table(dd, &dev->qp_dev->qpn_table);
1486 if (ret)
1487 goto nomem;
1488 return ret;
1489nomem:
1490 if (dev->qp_dev) {
1491 kfree(dev->qp_dev->qp_table);
1492 free_qpn_table(&dev->qp_dev->qpn_table);
1493 kfree(dev->qp_dev);
1494 }
1495 return ret;
1496}
1497
1498void hfi1_qp_exit(struct hfi1_ibdev *dev)
1499{
1500 struct hfi1_devdata *dd = dd_from_dev(dev);
1501 u32 qps_inuse;
1502
1503 qps_inuse = free_all_qps(dd);
1504 if (qps_inuse)
1505 dd_dev_err(dd, "QP memory leak! %u still in use\n",
1506 qps_inuse);
1507 if (dev->qp_dev) {
1508 kfree(dev->qp_dev->qp_table);
1509 free_qpn_table(&dev->qp_dev->qpn_table);
1510 kfree(dev->qp_dev);
1511 }
1512}
1513
1514/**
1515 * qp_to_sdma_engine - map a qp to a send engine
1516 *
1517 * @qp: the QP
1518 * @sc5: the 5 bit sc
1519 *
1520 * Return:
1521 * A send engine for the qp or NULL for SMI type qp.
1522 */
1523struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5)
1524{
1525 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1526 struct sdma_engine *sde;
1527
1528 if (!(dd->flags & HFI1_HAS_SEND_DMA))
1529 return NULL;
1530 switch (qp->ibqp.qp_type) {
1531 case IB_QPT_UC:
1532 case IB_QPT_RC:
1533 break;
1534 case IB_QPT_SMI:
1535 return NULL;
1536 default:
1537 break;
1538 }
1539 sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
1540 return sde;
1541}
1542
1543struct qp_iter {
1544 struct hfi1_ibdev *dev;
1545 struct hfi1_qp *qp;
1546 int specials;
1547 int n;
1548};
1549
1550struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
1551{
1552 struct qp_iter *iter;
1553
1554 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1555 if (!iter)
1556 return NULL;
1557
1558 iter->dev = dev;
1559 iter->specials = dev->ibdev.phys_port_cnt * 2;
1560 if (qp_iter_next(iter)) {
1561 kfree(iter);
1562 return NULL;
1563 }
1564
1565 return iter;
1566}
1567
1568int qp_iter_next(struct qp_iter *iter)
1569{
1570 struct hfi1_ibdev *dev = iter->dev;
1571 int n = iter->n;
1572 int ret = 1;
1573 struct hfi1_qp *pqp = iter->qp;
1574 struct hfi1_qp *qp;
1575
1576 /*
1577	 * The approach is to consider the special qps
1578	 * as additional table entries before the
1579 * real hash table. Since the qp code sets
1580 * the qp->next hash link to NULL, this works just fine.
1581 *
1582 * iter->specials is 2 * # ports
1583 *
1584	 * n = 0..iter->specials-1 are the special qp indices
1585	 *
1586	 * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials-1 are
1587 * the potential hash bucket entries
1588 *
1589 */
1590 for (; n < dev->qp_dev->qp_table_size + iter->specials; n++) {
1591 if (pqp) {
1592 qp = rcu_dereference(pqp->next);
1593 } else {
1594 if (n < iter->specials) {
1595 struct hfi1_pportdata *ppd;
1596 struct hfi1_ibport *ibp;
1597 int pidx;
1598
1599 pidx = n % dev->ibdev.phys_port_cnt;
1600 ppd = &dd_from_dev(dev)->pport[pidx];
1601 ibp = &ppd->ibport_data;
1602
1603 if (!(n & 1))
1604 qp = rcu_dereference(ibp->qp[0]);
1605 else
1606 qp = rcu_dereference(ibp->qp[1]);
1607 } else {
1608 qp = rcu_dereference(
1609 dev->qp_dev->qp_table[
1610 (n - iter->specials)]);
1611 }
1612 }
1613 pqp = qp;
1614 if (qp) {
1615 iter->qp = qp;
1616 iter->n = n;
1617 return 0;
1618 }
1619 }
1620 return ret;
1621}
1622
1623static const char * const qp_type_str[] = {
1624 "SMI", "GSI", "RC", "UC", "UD",
1625};
1626
1627static int qp_idle(struct hfi1_qp *qp)
1628{
1629 return
1630 qp->s_last == qp->s_acked &&
1631 qp->s_acked == qp->s_cur &&
1632 qp->s_cur == qp->s_tail &&
1633 qp->s_tail == qp->s_head;
1634}
1635
1636void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
1637{
1638 struct hfi1_swqe *wqe;
1639 struct hfi1_qp *qp = iter->qp;
1640 struct sdma_engine *sde;
1641
1642 sde = qp_to_sdma_engine(qp, qp->s_sc);
1643 wqe = get_swqe_ptr(qp, qp->s_last);
1644 seq_printf(s,
1645 "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n",
1646 iter->n,
1647 qp_idle(qp) ? "I" : "B",
1648 qp->ibqp.qp_num,
1649 atomic_read(&qp->refcount),
1650 qp_type_str[qp->ibqp.qp_type],
1651 qp->state,
1652 wqe ? wqe->wr.opcode : 0,
1653 qp->s_hdrwords,
1654 qp->s_flags,
1655 atomic_read(&qp->s_iowait.sdma_busy),
1656 !list_empty(&qp->s_iowait.list),
1657 qp->timeout,
1658 wqe ? wqe->ssn : 0,
1659 qp->s_lsn,
1660 qp->s_last_psn,
1661 qp->s_psn, qp->s_next_psn,
1662 qp->s_sending_psn, qp->s_sending_hpsn,
1663 qp->s_last, qp->s_acked, qp->s_cur,
1664 qp->s_tail, qp->s_head, qp->s_size,
1665 qp->remote_qpn,
1666 qp->remote_ah_attr.dlid,
1667 qp->remote_ah_attr.sl,
1668 qp->pmtu,
1669 qp->s_retry_cnt,
1670 qp->timeout,
1671 qp->s_rnr_retry_cnt,
1672 sde,
1673 sde ? sde->this_idx : 0);
1674}
1675
1676void qp_comm_est(struct hfi1_qp *qp)
1677{
1678 qp->r_flags |= HFI1_R_COMM_EST;
1679 if (qp->ibqp.event_handler) {
1680 struct ib_event ev;
1681
1682 ev.device = qp->ibqp.device;
1683 ev.element.qp = &qp->ibqp;
1684 ev.event = IB_EVENT_COMM_EST;
1685 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1686 }
1687}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
new file mode 100644
index 000000000000..6b505859b59c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -0,0 +1,235 @@
1#ifndef _QP_H
2#define _QP_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53#include <linux/hash.h>
54#include "verbs.h"
55
56#define QPN_MAX (1 << 24)
57#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
58
59/*
60 * QPN-map pages start out as NULL; they get allocated upon
61 * first use and are never deallocated. This way,
62 * large bitmaps are not allocated unless large numbers of QPs are used.
63 */
64struct qpn_map {
65 void *page;
66};
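
/*
 * Size sketch (assuming a 4 KiB PAGE_SIZE): QPN_MAX is 2^24 QP numbers and
 * each map page covers PAGE_SIZE * BITS_PER_BYTE = 32768 of them, so
 * QPNMAP_ENTRIES works out to at most 512 lazily allocated pages.
 */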
67
68struct hfi1_qpn_table {
69 spinlock_t lock; /* protect changes in this struct */
70 unsigned flags; /* flags for QP0/1 allocated for each port */
71 u32 last; /* last QP number allocated */
72 u32 nmaps; /* size of the map table */
73 u16 limit;
74 u8 incr;
75 /* bit map of free QP numbers other than 0/1 */
76 struct qpn_map map[QPNMAP_ENTRIES];
77};
78
79struct hfi1_qp_ibdev {
80 u32 qp_table_size;
81 u32 qp_table_bits;
82 struct hfi1_qp __rcu **qp_table;
83 spinlock_t qpt_lock;
84 struct hfi1_qpn_table qpn_table;
85};
86
87static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn)
88{
89 return hash_32(qpn, dev->qp_table_bits);
90}
91
92/**
93 * hfi1_lookup_qpn - return the QP with the given QPN
94 * @ibp: the ibport
95 * @qpn: the QP number to look up
96 *
97 * The caller must hold the rcu_read_lock(), and keep the lock until
98 * the returned qp is no longer in use.
99 */
100static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp,
101 u32 qpn) __must_hold(RCU)
102{
103 struct hfi1_qp *qp = NULL;
104
105 if (unlikely(qpn <= 1)) {
106 qp = rcu_dereference(ibp->qp[qpn]);
107 } else {
108 struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
109 u32 n = qpn_hash(dev->qp_dev, qpn);
110
111 for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp;
112 qp = rcu_dereference(qp->next))
113 if (qp->ibqp.qp_num == qpn)
114 break;
115 }
116 return qp;
117}
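
A minimal usage sketch of the locking contract above (the receive-path
context providing ibp and qpn is assumed):

	struct hfi1_qp *qp;

	rcu_read_lock();
	qp = hfi1_lookup_qpn(ibp, qpn);
	if (qp) {
		/* process the packet against this QP */
	}
	rcu_read_unlock();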
118
119/**
120 * hfi1_error_qp - put a QP into the error state
121 * @qp: the QP to put into the error state
122 * @err: the receive completion error to signal if a RWQE is active
123 *
124 * Flushes both send and receive work queues.
125 * Returns true if last WQE event should be generated.
126 * The QP r_lock and s_lock should be held and interrupts disabled.
127 * If we are already in error state, just return.
128 */
129int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err);
130
131/**
132 * hfi1_modify_qp - modify the attributes of a queue pair
133 * @ibqp: the queue pair whose attributes we're modifying
134 * @attr: the new attributes
135 * @attr_mask: the mask of attributes to modify
136 * @udata: user data for libibverbs.so
137 *
138 * Returns 0 on success, otherwise returns an errno.
139 */
140int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
141 int attr_mask, struct ib_udata *udata);
142
143int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
144 int attr_mask, struct ib_qp_init_attr *init_attr);
145
146/**
147 * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
148 * @qp: the queue pair to compute the AETH for
149 *
150 * Returns the AETH.
151 */
152__be32 hfi1_compute_aeth(struct hfi1_qp *qp);
153
154/**
155 * hfi1_create_qp - create a queue pair for a device
156 * @ibpd: the protection domain whose device we create the queue pair for
157 * @init_attr: the attributes of the queue pair
158 * @udata: user data for libibverbs.so
159 *
160 * Returns the queue pair on success, otherwise returns an errno.
161 *
162 * Called by the ib_create_qp() core verbs function.
163 */
164struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
165 struct ib_qp_init_attr *init_attr,
166 struct ib_udata *udata);
167/**
168 * hfi1_destroy_qp - destroy a queue pair
169 * @ibqp: the queue pair to destroy
170 *
171 * Returns 0 on success.
172 *
173 * Note that this can be called while the QP is actively sending or
174 * receiving!
175 */
176int hfi1_destroy_qp(struct ib_qp *ibqp);
177
178/**
179 * hfi1_get_credit - process the credit field of an incoming AETH
180 * @qp: the qp the AETH applies to
181 * @aeth: the Acknowledge Extended Transport Header
182 *
183 * The QP s_lock should be held.
184 */
185void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth);
186
187/**
188 * hfi1_qp_init - allocate QP tables
189 * @dev: a pointer to the hfi1_ibdev
190 */
191int hfi1_qp_init(struct hfi1_ibdev *dev);
192
193/**
194 * hfi1_qp_exit - free the QP related structures
195 * @dev: a pointer to the hfi1_ibdev
196 */
197void hfi1_qp_exit(struct hfi1_ibdev *dev);
198
199/**
200 * hfi1_qp_wakeup - wake a QP that is waiting on the indicated event
201 * @qp: the QP
202 * @flag: the s_flags bit on which the qp is stalled
203 */
204void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag);
205
206struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5);
207
208struct qp_iter;
209
210/**
211 * qp_iter_init - initialize an iterator over the device's QPs
212 * @dev: the hfi1_ibdev
213 */
214struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
215
216/**
217 * qp_iter_next - advance the iterator to the next QP
218 * @iter: the iterator for the qp hash list
219 */
220int qp_iter_next(struct qp_iter *iter);
221
222/**
223 * qp_iter_print - print a QP's state to a seq_file
224 * @s: the seq_file to emit the qp information on
225 * @iter: the iterator for the qp hash list
226 */
227void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
228
229/**
230 * qp_comm_est - handle trap with QP established
231 * @qp: the QP
232 */
233void qp_comm_est(struct hfi1_qp *qp);
234
235#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
new file mode 100644
index 000000000000..3138936157db
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -0,0 +1,546 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/delay.h>
52#include <linux/pci.h>
53#include <linux/vmalloc.h>
54
55#include "hfi.h"
56#include "twsi.h"
57
58/*
59 * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
60 * in twsi.c
61 */
62#define I2C_MAX_RETRY 4
63
64/*
65 * Unlocked i2c write. Must hold dd->qsfp_i2c_mutex.
66 */
67static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
68 int offset, void *bp, int len)
69{
70 struct hfi1_devdata *dd = ppd->dd;
71 int ret, cnt;
72 u8 *buff = bp;
73
74 /* Make sure TWSI bus is in sane state. */
75 ret = hfi1_twsi_reset(dd, target);
76 if (ret) {
77 hfi1_dev_porterr(dd, ppd->port,
78 "I2C interface Reset for write failed\n");
79 return -EIO;
80 }
81
82 cnt = 0;
83 while (cnt < len) {
84 int wlen = len - cnt;
85
86 ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
87 buff + cnt, wlen);
88 if (ret) {
89 /* hfi1_twsi_blk_wr() 1 for error, else 0 */
90 return -EIO;
91 }
92 offset += wlen;
93 cnt += wlen;
94 }
95
96 /* Must wait min 20us between qsfp i2c transactions */
97 udelay(20);
98
99 return cnt;
100}
101
102int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
103 void *bp, int len)
104{
105 struct hfi1_devdata *dd = ppd->dd;
106 int ret;
107
108 ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
109 if (!ret) {
110 ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
111 mutex_unlock(&dd->qsfp_i2c_mutex);
112 }
113
114 return ret;
115}
116
117/*
118 * Unlocked i2c read. Must hold dd->qsfp_i2c_mutex.
119 */
120static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
121 int offset, void *bp, int len)
122{
123 struct hfi1_devdata *dd = ppd->dd;
124 int ret, cnt, pass = 0;
125 int stuck = 0;
126 u8 *buff = bp;
127
128 /* Make sure TWSI bus is in sane state. */
129 ret = hfi1_twsi_reset(dd, target);
130 if (ret) {
131 hfi1_dev_porterr(dd, ppd->port,
132 "I2C interface Reset for read failed\n");
133 ret = -EIO;
134 stuck = 1;
135 goto exit;
136 }
137
138 cnt = 0;
139 while (cnt < len) {
140 int rlen = len - cnt;
141
142 ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
143 buff + cnt, rlen);
144		/* Some QSFPs fail the first try. Retry as an experiment */
145 if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
146 continue;
147 if (ret) {
148 /* hfi1_twsi_blk_rd() 1 for error, else 0 */
149 ret = -EIO;
150 goto exit;
151 }
152 offset += rlen;
153 cnt += rlen;
154 }
155
156 ret = cnt;
157
158exit:
159 if (stuck)
160 dd_dev_err(dd, "I2C interface bus stuck non-idle\n");
161
162 if (pass >= I2C_MAX_RETRY && ret)
163 hfi1_dev_porterr(dd, ppd->port,
164 "I2C failed even retrying\n");
165 else if (pass)
166 hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass);
167
168 /* Must wait min 20us between qsfp i2c transactions */
169 udelay(20);
170
171 return ret;
172}
173
174int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
175 void *bp, int len)
176{
177 struct hfi1_devdata *dd = ppd->dd;
178 int ret;
179
180 ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
181 if (!ret) {
182 ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
183 mutex_unlock(&dd->qsfp_i2c_mutex);
184 }
185
186 return ret;
187}
188
189int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
190 int len)
191{
192 int count = 0;
193 int offset;
194 int nwrite;
195 int ret;
196 u8 page;
197
198 ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
199 if (ret)
200 return ret;
201
202 while (count < len) {
203 /*
204		 * Set the qsfp page based on a zero-based address
205 * and a page size of QSFP_PAGESIZE bytes.
206 */
207 page = (u8)(addr / QSFP_PAGESIZE);
208
209 ret = __i2c_write(ppd, target, QSFP_DEV,
210 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
211 if (ret != 1) {
212 hfi1_dev_porterr(
213 ppd->dd,
214 ppd->port,
215 "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
216 ret = -EIO;
217 break;
218 }
219
220 /* truncate write to end of page if crossing page boundary */
221 offset = addr % QSFP_PAGESIZE;
222 nwrite = len - count;
223 if ((offset + nwrite) > QSFP_PAGESIZE)
224 nwrite = QSFP_PAGESIZE - offset;
225
226 ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count,
227 nwrite);
228		if (ret <= 0) /* stop on error or nothing written */
229 break;
230
231 count += ret;
232 addr += ret;
233 }
234
235 mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
236
237 if (ret < 0)
238 return ret;
239 return count;
240}
241
242int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
243 int len)
244{
245 int count = 0;
246 int offset;
247 int nread;
248 int ret;
249 u8 page;
250
251 ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
252 if (ret)
253 return ret;
254
255 while (count < len) {
256 /*
257 * Set the qsfp page based on a zero-based address
258 * and a page size of QSFP_PAGESIZE bytes.
259 */
260 page = (u8)(addr / QSFP_PAGESIZE);
261 ret = __i2c_write(ppd, target, QSFP_DEV,
262 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
263 if (ret != 1) {
264 hfi1_dev_porterr(
265 ppd->dd,
266 ppd->port,
267 "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
268 ret = -EIO;
269 break;
270 }
271
272 /* truncate read to end of page if crossing page boundary */
273 offset = addr % QSFP_PAGESIZE;
274 nread = len - count;
275 if ((offset + nread) > QSFP_PAGESIZE)
276 nread = QSFP_PAGESIZE - offset;
277
278 ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count,
279 nread);
280 if (ret <= 0) /* stop on error or nothing read */
281 break;
282
283 count += ret;
284 addr += ret;
285 }
286
287 mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
288
289 if (ret < 0)
290 return ret;
291 return count;
292}
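
/*
 * Worked example of the paging arithmetic above (illustrative only):
 * qsfp_read(ppd, target, 200, buf, 100) starts in page 0 (200 / 256) at
 * offset 200 and truncates the first transfer to 56 bytes so it stops at
 * the page boundary; the loop then advances to addr 256, selects page 1,
 * and reads the remaining 44 bytes from offset 0.
 */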
293
294/*
295 * This function caches the QSFP memory range in 128 byte chunks.
296 * As an example, the next byte after address 255 is byte 128 from
297 * upper page 01H (if existing) rather than byte 0 from lower page 00H.
298 */
299int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
300{
301 u32 target = ppd->dd->hfi1_id;
302 int ret;
303 unsigned long flags;
304 u8 *cache = &cp->cache[0];
305
306 /* ensure sane contents on invalid reads, for cable swaps */
307 memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
308 dd_dev_info(ppd->dd, "%s: called\n", __func__);
309 if (!qsfp_mod_present(ppd)) {
310 ret = -ENODEV;
311 goto bail;
312 }
313
314 ret = qsfp_read(ppd, target, 0, cache, 256);
315 if (ret != 256) {
316 dd_dev_info(ppd->dd,
317 "%s: Read of pages 00H failed, expected 256, got %d\n",
318 __func__, ret);
319 goto bail;
320 }
321
322 if (cache[0] != 0x0C && cache[0] != 0x0D)
323 goto bail;
324
325	/* Is paging enabled (byte 2 "Flat Mem" bit clear)? */
326 if (!(cache[2] & 4)) {
327
328 /* Paging enabled, page 03 required */
329 if ((cache[195] & 0xC0) == 0xC0) {
330 /* all */
331 ret = qsfp_read(ppd, target, 384, cache + 256, 128);
332 if (ret <= 0 || ret != 128) {
333 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
334 goto bail;
335 }
336 ret = qsfp_read(ppd, target, 640, cache + 384, 128);
337 if (ret <= 0 || ret != 128) {
338 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
339 goto bail;
340 }
341 ret = qsfp_read(ppd, target, 896, cache + 512, 128);
342 if (ret <= 0 || ret != 128) {
343 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
344 goto bail;
345 }
346 } else if ((cache[195] & 0x80) == 0x80) {
347 /* only page 2 and 3 */
348 ret = qsfp_read(ppd, target, 640, cache + 384, 128);
349 if (ret <= 0 || ret != 128) {
350 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
351 goto bail;
352 }
353 ret = qsfp_read(ppd, target, 896, cache + 512, 128);
354 if (ret <= 0 || ret != 128) {
355 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
356 goto bail;
357 }
358 } else if ((cache[195] & 0x40) == 0x40) {
359 /* only page 1 and 3 */
360 ret = qsfp_read(ppd, target, 384, cache + 256, 128);
361 if (ret <= 0 || ret != 128) {
362 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
363 goto bail;
364 }
365 ret = qsfp_read(ppd, target, 896, cache + 512, 128);
366 if (ret <= 0 || ret != 128) {
367 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
368 goto bail;
369 }
370 } else {
371 /* only page 3 */
372 ret = qsfp_read(ppd, target, 896, cache + 512, 128);
373 if (ret <= 0 || ret != 128) {
374 dd_dev_info(ppd->dd, "%s: failed\n", __func__);
375 goto bail;
376 }
377 }
378 }
379
380 spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
381 ppd->qsfp_info.cache_valid = 1;
382 ppd->qsfp_info.cache_refresh_required = 0;
383 spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
384
385 return 0;
386
387bail:
388 memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
389 return ret;
390}
391
392const char * const hfi1_qsfp_devtech[16] = {
393 "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
394 "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
395 "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
396 "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
397};
398
399#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
400#define QSFP_DEFAULT_HDR_CNT 224
401
402static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
403
404int qsfp_mod_present(struct hfi1_pportdata *ppd)
405{
406 if (HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
407 struct hfi1_devdata *dd = ppd->dd;
408 u64 reg;
409
410 reg = read_csr(dd,
411 dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
412 return !(reg & QSFP_HFI0_MODPRST_N);
413 }
414 /* always return cable present */
415 return 1;
416}
417
418/*
419 * This function maps QSFP memory addresses in 128 byte chunks in the following
420 * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
421 * spec
422 * For addr 000-127, lower page 00h
423 * For addr 128-255, upper page 00h
424 * For addr 256-383, upper page 01h
425 * For addr 384-511, upper page 02h
426 * For addr 512-639, upper page 03h
427 *
428 * For addresses beyond this range, the corresponding portion of the data
429 * buffer is returned set to 0.
430 * For upper pages that are optional and not valid, the corresponding range
431 * of bytes in the data buffer is likewise returned set to 0.
432 */
433int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
434 u8 *data)
435{
436 struct hfi1_pportdata *ppd;
437 u32 excess_len = 0;
438 int ret = 0;
439
440 if (port_num > dd->num_pports || port_num < 1) {
441 dd_dev_info(dd, "%s: Invalid port number %d\n",
442 __func__, port_num);
443 ret = -EINVAL;
444 goto set_zeroes;
445 }
446
447 ppd = dd->pport + (port_num - 1);
448 if (!qsfp_mod_present(ppd)) {
449 ret = -ENODEV;
450 goto set_zeroes;
451 }
452
453 if (!ppd->qsfp_info.cache_valid) {
454 ret = -EINVAL;
455 goto set_zeroes;
456 }
457
458 if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
459 ret = -ERANGE;
460 goto set_zeroes;
461 }
462
463 if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
464 excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
465 memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
466 data += (len - excess_len);
467 goto set_zeroes;
468 }
469
470 memcpy(data, &ppd->qsfp_info.cache[addr], len);
471 return 0;
472
473set_zeroes:
474 memset(data, 0, excess_len);
475 return ret;
476}
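
/*
 * Worked example of the clamping above (illustrative only): with
 * QSFP_MAX_NUM_PAGES = 5 the cache spans addresses 0..639, so a request
 * for addr = 600, len = 100 copies the 40 cached bytes and zero-fills
 * the remaining 60 bytes of the caller's buffer before returning 0.
 */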
477
478int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
479{
480 u8 *cache = &ppd->qsfp_info.cache[0];
481 u8 bin_buff[QSFP_DUMP_CHUNK];
482 char lenstr[6];
483 int sofar, ret;
484 int bidx = 0;
485 u8 *atten = &cache[QSFP_ATTEN_OFFS];
486 u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
487
488 sofar = 0;
489 lenstr[0] = ' ';
490 lenstr[1] = '\0';
491
492 if (ppd->qsfp_info.cache_valid) {
493
494 if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
495 sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
496
497 sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
498 pwr_codes +
499 (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
500
501 sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
502 lenstr,
503 hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
504
505 sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
506 QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
507
508 sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
509 QSFP_OUI(vendor_oui));
510
511 sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
512 QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
513
514 sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
515 QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
516
517 if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
518 sofar += scnprintf(buf + sofar, len - sofar,
519 "Atten:%d, %d\n",
520 QSFP_ATTEN_SDR(atten),
521 QSFP_ATTEN_DDR(atten));
522
523 sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
524 QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
525
526 sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
527 QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
528
529 sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
530 QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
531
532 while (bidx < QSFP_DEFAULT_HDR_CNT) {
533 int iidx;
534
535 memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
536 for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
537 sofar += scnprintf(buf + sofar, len-sofar,
538 " %02X", bin_buff[iidx]);
539 }
540 sofar += scnprintf(buf + sofar, len - sofar, "\n");
541 bidx += QSFP_DUMP_CHUNK;
542 }
543 }
544 ret = sofar;
545 return ret;
546}
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
new file mode 100644
index 000000000000..d30c2a6baa0b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -0,0 +1,222 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50/* QSFP support common definitions, for hfi driver */
51
52#define QSFP_DEV 0xA0
53#define QSFP_PWR_LAG_MSEC 2000
54#define QSFP_MODPRS_LAG_MSEC 20
55/* 128 byte pages, per SFF 8636 rev 2.4 */
56#define QSFP_MAX_NUM_PAGES 5
57
58/*
59 * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1.
60 * _N means asserted low
61 */
62#define QSFP_HFI0_I2CCLK (1 << 0)
63#define QSFP_HFI0_I2CDAT (1 << 1)
64#define QSFP_HFI0_RESET_N (1 << 2)
65#define QSFP_HFI0_INT_N (1 << 3)
66#define QSFP_HFI0_MODPRST_N (1 << 4)
67
68/* QSFP is paged at 256 bytes */
69#define QSFP_PAGESIZE 256
70
71/* Defined fields that Intel requires of qualified cables */
72/* Byte 0 is Identifier, not checked */
73/* Byte 1 is reserved "status MSB" */
74/* Byte 2 is "status LSB". We only care that D2 "Flat Mem" is set. */
75/*
76 * Rest of first 128 not used, although 127 is reserved for page select
77 * if module is not "Flat memory".
78 */
79#define QSFP_PAGE_SELECT_BYTE_OFFS 127
80/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
81#define QSFP_MOD_ID_OFFS 128
82/*
83 * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
84 * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
85 */
86#define QSFP_MOD_PWR_OFFS 129
87/* Byte 130 is Connector type. Not Intel req'd */
88/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
89/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
90/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */
91/* Byte 141 is Extended Rate Select. Not Intel req'd */
92/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
93/* Byte 146 is length for Copper. Units of 1 meter */
94#define QSFP_MOD_LEN_OFFS 146
95/*
96 * Byte 147 is Device technology. D0..3 not Intel req'd
97 * D4..7 select from 15 choices, translated by table:
98 */
99#define QSFP_MOD_TECH_OFFS 147
100extern const char *const hfi1_qsfp_devtech[16];
101/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
102#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
103/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
104#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
105/* Attenuation should be valid for copper other than full/near Eq */
106#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
107/* Length is only valid if technology is "copper" */
108#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
109#define QSFP_TECH_1490 9
110
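The QSFP_IS_*() macros above treat their 16-bit constant as a lookup table indexed by the high nibble of byte 147 (the device technology code). Purely as an illustration, not part of this patch, the stand-alone program below reuses the same masks to print how each technology nibble is classified; it assumes nothing beyond the defines shown above.

#include <stdio.h>

#define QSFP_IS_ACTIVE(tech)     ((0xA2FF >> ((tech) >> 4)) & 1)
#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
#define QSFP_HAS_ATTEN(tech)     ((0x4D00 >> ((tech) >> 4)) & 1)
#define QSFP_IS_CU(tech)         ((0xED00 >> ((tech) >> 4)) & 1)

int main(void)
{
	unsigned nibble;

	for (nibble = 0; nibble < 16; nibble++) {
		unsigned tech = nibble << 4;	/* value as stored in byte 147 */

		printf("tech 0x%X: active=%d active_far=%d atten=%d copper=%d\n",
		       nibble, QSFP_IS_ACTIVE(tech), QSFP_IS_ACTIVE_FAR(tech),
		       QSFP_HAS_ATTEN(tech), QSFP_IS_CU(tech));
	}
	return 0;
}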
111#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
112 oui[2])
113#define QSFP_OUI_AMPHENOL 0x415048
114#define QSFP_OUI_FINISAR 0x009065
115#define QSFP_OUI_GORE 0x002177
116
117/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
118#define QSFP_VEND_OFFS 148
119#define QSFP_VEND_LEN 16
120/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
121#define QSFP_IBXCV_OFFS 164
122/* Bytes 165..167 are Vendor OUI number */
123#define QSFP_VOUI_OFFS 165
124#define QSFP_VOUI_LEN 3
125/* Bytes 168..183 are Vendor Part Number, string */
126#define QSFP_PN_OFFS 168
127#define QSFP_PN_LEN 16
128/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
129#define QSFP_REV_OFFS 184
130#define QSFP_REV_LEN 2
131/*
132 * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
133 * If copper, they are attenuation in dB:
134 * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
135 */
136#define QSFP_ATTEN_OFFS 186
137#define QSFP_ATTEN_LEN 2
138/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */
139/* Byte 190 is Max Case Temp. Not Intel req'd */
140/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
141#define QSFP_CC_OFFS 191
142/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */
143/* Bytes 196..211 are Serial Number, String */
144#define QSFP_SN_OFFS 196
145#define QSFP_SN_LEN 16
146/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
147#define QSFP_DATE_OFFS 212
148#define QSFP_DATE_LEN 6
149/* Bytes 218,219 are optional lot-code, string */
150#define QSFP_LOT_OFFS 218
151#define QSFP_LOT_LEN 2
152/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
153/* Byte 223 is LSB of sum of bytes 192..222 */
154#define QSFP_CC_EXT_OFFS 223
155
156/*
157 * Interrupt flag masks
158 */
159#define QSFP_DATA_NOT_READY 0x01
160
161#define QSFP_HIGH_TEMP_ALARM 0x80
162#define QSFP_LOW_TEMP_ALARM 0x40
163#define QSFP_HIGH_TEMP_WARNING 0x20
164#define QSFP_LOW_TEMP_WARNING 0x10
165
166#define QSFP_HIGH_VCC_ALARM 0x80
167#define QSFP_LOW_VCC_ALARM 0x40
168#define QSFP_HIGH_VCC_WARNING 0x20
169#define QSFP_LOW_VCC_WARNING 0x10
170
171#define QSFP_HIGH_POWER_ALARM 0x88
172#define QSFP_LOW_POWER_ALARM 0x44
173#define QSFP_HIGH_POWER_WARNING 0x22
174#define QSFP_LOW_POWER_WARNING 0x11
175
176#define QSFP_HIGH_BIAS_ALARM 0x88
177#define QSFP_LOW_BIAS_ALARM 0x44
178#define QSFP_HIGH_BIAS_WARNING 0x22
179#define QSFP_LOW_BIAS_WARNING 0x11
180
181/*
182 * struct qsfp_data encapsulates state of QSFP device for one port.
183 * It will be part of the port-specific data if a board supports QSFP.
184 *
185 * Since multiple board types use QSFP, and their pport_data structs
186 * differ (in the chip-specific section), we keep a pointer back to the owning pport_data.
187 *
188 * Avoiding premature optimization, we will have one work_struct per port,
189 * and let the qsfp_lock arbitrate access to common resources.
190 *
191 */
192
193#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
194#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
195#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
196
197struct qsfp_data {
198 /* Helps to find our way */
199 struct hfi1_pportdata *ppd;
200 struct work_struct qsfp_work;
201 u8 cache[QSFP_MAX_NUM_PAGES*128];
202 spinlock_t qsfp_lock;
203 u8 check_interrupt_flags;
204 u8 qsfp_interrupt_functional;
205 u8 cache_valid;
206 u8 cache_refresh_required;
207};
208
209int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
210 struct qsfp_data *cp);
211int qsfp_mod_present(struct hfi1_pportdata *ppd);
212int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
213 u32 len, u8 *data);
214
215int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
216 int offset, void *bp, int len);
217int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
218 int offset, void *bp, int len);
219int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
220 int len);
221int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
222 int len);
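For illustration only (not part of this patch): the QSFP_*_OFFS/LEN constants above index directly into the flat qsfp_data.cache[] copy of module memory, so a consumer could pull the identity fields out as sketched below. The qsfp_identity structure and qsfp_parse_identity() helper are hypothetical; only the offsets, lengths and QSFP_OUI() come from this header.

#include <string.h>

struct qsfp_identity {
	char vendor[QSFP_VEND_LEN + 1];
	char part[QSFP_PN_LEN + 1];
	char serial[QSFP_SN_LEN + 1];
	unsigned oui;
};

static void qsfp_parse_identity(const unsigned char *cache,
				struct qsfp_identity *id)
{
	const unsigned char *voui = &cache[QSFP_VOUI_OFFS];

	/* identity strings are left-justified, blank-filled, not NUL-terminated */
	memcpy(id->vendor, &cache[QSFP_VEND_OFFS], QSFP_VEND_LEN);
	id->vendor[QSFP_VEND_LEN] = '\0';
	memcpy(id->part, &cache[QSFP_PN_OFFS], QSFP_PN_LEN);
	id->part[QSFP_PN_LEN] = '\0';
	memcpy(id->serial, &cache[QSFP_SN_OFFS], QSFP_SN_LEN);
	id->serial[QSFP_SN_LEN] = '\0';
	/* QSFP_OUI() packs the three vendor OUI bytes into one 24-bit value */
	id->oui = QSFP_OUI(voui);
}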
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
new file mode 100644
index 000000000000..632dd5ba7dfd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -0,0 +1,2426 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/io.h>
52
53#include "hfi.h"
54#include "qp.h"
55#include "sdma.h"
56#include "trace.h"
57
58/* cut down ridiculously long IB macro names */
59#define OP(x) IB_OPCODE_RC_##x
60
61static void rc_timeout(unsigned long arg);
62
63static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe,
64 u32 psn, u32 pmtu)
65{
66 u32 len;
67
68 len = delta_psn(psn, wqe->psn) * pmtu;
69 ss->sge = wqe->sg_list[0];
70 ss->sg_list = wqe->sg_list + 1;
71 ss->num_sge = wqe->wr.num_sge;
72 ss->total_len = wqe->length;
73 hfi1_skip_sge(ss, len, 0);
74 return wqe->length - len;
75}
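restart_sge() converts a PSN distance back into a byte offset, one pmtu of payload per PSN. delta_psn(), cmp_psn() and mask_psn() are defined elsewhere in the driver; assuming they implement the usual 24-bit IB PSN wrap-around arithmetic, a minimal user-space model of the calculation looks like this (illustrative only, not code from this patch):

#include <stdint.h>

#define MODEL_PSN_MASK 0xFFFFFF			/* PSNs are 24 bits wide */

static inline uint32_t model_mask_psn(uint32_t psn)
{
	return psn & MODEL_PSN_MASK;
}

/* signed distance a - b, wrapping at 2^24 */
static inline int32_t model_delta_psn(uint32_t a, uint32_t b)
{
	return ((int32_t)((a - b) << 8)) >> 8;
}

/* bytes already covered before the restart point, as in restart_sge() */
static inline uint32_t model_restart_offset(uint32_t psn, uint32_t wqe_psn,
					    uint32_t pmtu)
{
	return (uint32_t)model_delta_psn(psn, wqe_psn) * pmtu;
}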
76
77static void start_timer(struct hfi1_qp *qp)
78{
79 qp->s_flags |= HFI1_S_TIMER;
80 qp->s_timer.function = rc_timeout;
81 /* 4.096 usec. * (1 << qp->timeout) */
82 qp->s_timer.expires = jiffies + qp->timeout_jiffies;
83 add_timer(&qp->s_timer);
84}
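The comment in start_timer() refers to the IB local ACK timeout encoding, in which a 5-bit timeout value T corresponds to 4.096 usec * 2^T; qp->timeout_jiffies is pre-computed from it elsewhere in the driver. A stand-alone sketch of that formula (an assumption about the pre-computation, not code from this patch):

static inline unsigned long long model_timeout_usecs(unsigned int timeout)
{
	/* 4.096 usec == 4096 nsec; keep the math integral by working in nsec */
	unsigned long long nsec = 4096ULL << (timeout & 0x1f);

	return nsec / 1000;	/* truncated to whole microseconds */
}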
85
86/**
87 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
88 * @dev: the device for this QP
89 * @qp: a pointer to the QP
90 * @ohdr: a pointer to the IB header being constructed
91 * @pmtu: the path MTU
92 *
93 * Return 1 if constructed; otherwise, return 0.
94 * Note that we are on the responder side of the QP context.
95 * Note the QP s_lock must be held.
96 */
97static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp,
98 struct hfi1_other_headers *ohdr, u32 pmtu)
99{
100 struct hfi1_ack_entry *e;
101 u32 hwords;
102 u32 len;
103 u32 bth0;
104 u32 bth2;
105 int middle = 0;
106
107 /* Don't send an ACK if we aren't supposed to. */
108 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
109 goto bail;
110
111 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
112 hwords = 5;
113
114 switch (qp->s_ack_state) {
115 case OP(RDMA_READ_RESPONSE_LAST):
116 case OP(RDMA_READ_RESPONSE_ONLY):
117 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
118 if (e->rdma_sge.mr) {
119 hfi1_put_mr(e->rdma_sge.mr);
120 e->rdma_sge.mr = NULL;
121 }
122 /* FALLTHROUGH */
123 case OP(ATOMIC_ACKNOWLEDGE):
124 /*
125 * We can increment the tail pointer now that the last
126 * response has been sent instead of only being
127 * constructed.
128 */
129 if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
130 qp->s_tail_ack_queue = 0;
131 /* FALLTHROUGH */
132 case OP(SEND_ONLY):
133 case OP(ACKNOWLEDGE):
134 /* Check for no next entry in the queue. */
135 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
136 if (qp->s_flags & HFI1_S_ACK_PENDING)
137 goto normal;
138 goto bail;
139 }
140
141 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
142 if (e->opcode == OP(RDMA_READ_REQUEST)) {
143 /*
144 * If an RDMA read response is being resent and
145 * we haven't seen the duplicate request yet,
146 * then stop sending the remaining responses
147 * until the requester resends the read request.
148 */
149 len = e->rdma_sge.sge_length;
150 if (len && !e->rdma_sge.mr) {
151 qp->s_tail_ack_queue = qp->r_head_ack_queue;
152 goto bail;
153 }
154 /* Copy SGE state in case we need to resend */
155 qp->s_rdma_mr = e->rdma_sge.mr;
156 if (qp->s_rdma_mr)
157 hfi1_get_mr(qp->s_rdma_mr);
158 qp->s_ack_rdma_sge.sge = e->rdma_sge;
159 qp->s_ack_rdma_sge.num_sge = 1;
160 qp->s_cur_sge = &qp->s_ack_rdma_sge;
161 if (len > pmtu) {
162 len = pmtu;
163 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
164 } else {
165 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
166 e->sent = 1;
167 }
168 ohdr->u.aeth = hfi1_compute_aeth(qp);
169 hwords++;
170 qp->s_ack_rdma_psn = e->psn;
171 bth2 = mask_psn(qp->s_ack_rdma_psn++);
172 } else {
173 /* COMPARE_SWAP or FETCH_ADD */
174 qp->s_cur_sge = NULL;
175 len = 0;
176 qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
177 ohdr->u.at.aeth = hfi1_compute_aeth(qp);
178 ohdr->u.at.atomic_ack_eth[0] =
179 cpu_to_be32(e->atomic_data >> 32);
180 ohdr->u.at.atomic_ack_eth[1] =
181 cpu_to_be32(e->atomic_data);
182 hwords += sizeof(ohdr->u.at) / sizeof(u32);
183 bth2 = mask_psn(e->psn);
184 e->sent = 1;
185 }
186 bth0 = qp->s_ack_state << 24;
187 break;
188
189 case OP(RDMA_READ_RESPONSE_FIRST):
190 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
191 /* FALLTHROUGH */
192 case OP(RDMA_READ_RESPONSE_MIDDLE):
193 qp->s_cur_sge = &qp->s_ack_rdma_sge;
194 qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
195 if (qp->s_rdma_mr)
196 hfi1_get_mr(qp->s_rdma_mr);
197 len = qp->s_ack_rdma_sge.sge.sge_length;
198 if (len > pmtu) {
199 len = pmtu;
200 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
201 } else {
202 ohdr->u.aeth = hfi1_compute_aeth(qp);
203 hwords++;
204 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
205 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
206 e->sent = 1;
207 }
208 bth0 = qp->s_ack_state << 24;
209 bth2 = mask_psn(qp->s_ack_rdma_psn++);
210 break;
211
212 default:
213normal:
214 /*
215 * Send a regular ACK.
216 * Set the s_ack_state so we wait until after sending
217 * the ACK before setting s_ack_state to ACKNOWLEDGE
218 * (see above).
219 */
220 qp->s_ack_state = OP(SEND_ONLY);
221 qp->s_flags &= ~HFI1_S_ACK_PENDING;
222 qp->s_cur_sge = NULL;
223 if (qp->s_nak_state)
224 ohdr->u.aeth =
225 cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
226 (qp->s_nak_state <<
227 HFI1_AETH_CREDIT_SHIFT));
228 else
229 ohdr->u.aeth = hfi1_compute_aeth(qp);
230 hwords++;
231 len = 0;
232 bth0 = OP(ACKNOWLEDGE) << 24;
233 bth2 = mask_psn(qp->s_ack_psn);
234 }
235 qp->s_rdma_ack_cnt++;
236 qp->s_hdrwords = hwords;
237 qp->s_cur_size = len;
238 hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle);
239 return 1;
240
241bail:
242 qp->s_ack_state = OP(ACKNOWLEDGE);
243 /*
244 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
245 * HFI1_S_RESP_PENDING
246 */
247 smp_wmb();
248 qp->s_flags &= ~(HFI1_S_RESP_PENDING
249 | HFI1_S_ACK_PENDING
250 | HFI1_S_AHG_VALID);
251 return 0;
252}
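make_rc_ack() advances s_tail_ack_queue with a "> HFI1_MAX_RDMA_ATOMIC" wrap, i.e. the ack queue holds HFI1_MAX_RDMA_ATOMIC + 1 slots and "head == tail" means the queue is empty (see the check above). A small helper restating just that wrap, for illustration only:

static inline unsigned ack_queue_next(unsigned idx, unsigned max_rdma_atomic)
{
	/* queue size is max_rdma_atomic + 1 entries, valid indices 0..max */
	return (idx >= max_rdma_atomic) ? 0 : idx + 1;
}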
253
254/**
255 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
256 * @qp: a pointer to the QP
257 *
258 * Return 1 if constructed; otherwise, return 0.
259 */
260int hfi1_make_rc_req(struct hfi1_qp *qp)
261{
262 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
263 struct hfi1_other_headers *ohdr;
264 struct hfi1_sge_state *ss;
265 struct hfi1_swqe *wqe;
266 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
267 u32 hwords = 5;
268 u32 len;
269 u32 bth0 = 0;
270 u32 bth2;
271 u32 pmtu = qp->pmtu;
272 char newreq;
273 unsigned long flags;
274 int ret = 0;
275 int middle = 0;
276 int delta;
277
278 ohdr = &qp->s_hdr->ibh.u.oth;
279 if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
280 ohdr = &qp->s_hdr->ibh.u.l.oth;
281
282 /*
283 * The lock is needed to synchronize between the sending tasklet,
284 * the receive interrupt handler, and timeout re-sends.
285 */
286 spin_lock_irqsave(&qp->s_lock, flags);
287
288 /* Sending responses has higher priority over sending requests. */
289 if ((qp->s_flags & HFI1_S_RESP_PENDING) &&
290 make_rc_ack(dev, qp, ohdr, pmtu))
291 goto done;
292
293 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
294 if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
295 goto bail;
296 /* We are in the error state, flush the work request. */
297 if (qp->s_last == qp->s_head)
298 goto bail;
299 /* If DMAs are in progress, we can't flush immediately. */
300 if (atomic_read(&qp->s_iowait.sdma_busy)) {
301 qp->s_flags |= HFI1_S_WAIT_DMA;
302 goto bail;
303 }
304 clear_ahg(qp);
305 wqe = get_swqe_ptr(qp, qp->s_last);
306 hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
307 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
308 /* will get called again */
309 goto done;
310 }
311
312 if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK))
313 goto bail;
314
315 if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
316 if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
317 qp->s_flags |= HFI1_S_WAIT_PSN;
318 goto bail;
319 }
320 qp->s_sending_psn = qp->s_psn;
321 qp->s_sending_hpsn = qp->s_psn - 1;
322 }
323
324 /* Send a request. */
325 wqe = get_swqe_ptr(qp, qp->s_cur);
326 switch (qp->s_state) {
327 default:
328 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK))
329 goto bail;
330 /*
331 * Resend an old request or start a new one.
332 *
333 * We keep track of the current SWQE so that
334 * we don't reset the "furthest progress" state
335 * if we need to back up.
336 */
337 newreq = 0;
338 if (qp->s_cur == qp->s_tail) {
339 /* Check if send work queue is empty. */
340 if (qp->s_tail == qp->s_head) {
341 clear_ahg(qp);
342 goto bail;
343 }
344 /*
345 * If a fence is requested, wait for previous
346 * RDMA read and atomic operations to finish.
347 */
348 if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
349 qp->s_num_rd_atomic) {
350 qp->s_flags |= HFI1_S_WAIT_FENCE;
351 goto bail;
352 }
353 wqe->psn = qp->s_next_psn;
354 newreq = 1;
355 }
356 /*
357 * Note that we have to be careful not to modify the
358 * original work request since we may need to resend
359 * it.
360 */
361 len = wqe->length;
362 ss = &qp->s_sge;
363 bth2 = mask_psn(qp->s_psn);
364 switch (wqe->wr.opcode) {
365 case IB_WR_SEND:
366 case IB_WR_SEND_WITH_IMM:
367 /* If no credit, return. */
368 if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
369 cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
370 qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
371 goto bail;
372 }
373 wqe->lpsn = wqe->psn;
374 if (len > pmtu) {
375 wqe->lpsn += (len - 1) / pmtu;
376 qp->s_state = OP(SEND_FIRST);
377 len = pmtu;
378 break;
379 }
380 if (wqe->wr.opcode == IB_WR_SEND)
381 qp->s_state = OP(SEND_ONLY);
382 else {
383 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
384 /* Immediate data comes after the BTH */
385 ohdr->u.imm_data = wqe->wr.ex.imm_data;
386 hwords += 1;
387 }
388 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
389 bth0 |= IB_BTH_SOLICITED;
390 bth2 |= IB_BTH_REQ_ACK;
391 if (++qp->s_cur == qp->s_size)
392 qp->s_cur = 0;
393 break;
394
395 case IB_WR_RDMA_WRITE:
396 if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
397 qp->s_lsn++;
398 /* FALLTHROUGH */
399 case IB_WR_RDMA_WRITE_WITH_IMM:
400 /* If no credit, return. */
401 if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
402 cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
403 qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
404 goto bail;
405 }
406 ohdr->u.rc.reth.vaddr =
407 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
408 ohdr->u.rc.reth.rkey =
409 cpu_to_be32(wqe->wr.wr.rdma.rkey);
410 ohdr->u.rc.reth.length = cpu_to_be32(len);
411 hwords += sizeof(struct ib_reth) / sizeof(u32);
412 wqe->lpsn = wqe->psn;
413 if (len > pmtu) {
414 wqe->lpsn += (len - 1) / pmtu;
415 qp->s_state = OP(RDMA_WRITE_FIRST);
416 len = pmtu;
417 break;
418 }
419 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
420 qp->s_state = OP(RDMA_WRITE_ONLY);
421 else {
422 qp->s_state =
423 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
424 /* Immediate data comes after RETH */
425 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
426 hwords += 1;
427 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
428 bth0 |= IB_BTH_SOLICITED;
429 }
430 bth2 |= IB_BTH_REQ_ACK;
431 if (++qp->s_cur == qp->s_size)
432 qp->s_cur = 0;
433 break;
434
435 case IB_WR_RDMA_READ:
436 /*
437 * Don't allow more operations to be started
438 * than the QP limits allow.
439 */
440 if (newreq) {
441 if (qp->s_num_rd_atomic >=
442 qp->s_max_rd_atomic) {
443 qp->s_flags |= HFI1_S_WAIT_RDMAR;
444 goto bail;
445 }
446 qp->s_num_rd_atomic++;
447 if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
448 qp->s_lsn++;
449 /*
450 * Adjust s_next_psn to count the
451 * expected number of responses.
452 */
453 if (len > pmtu)
454 qp->s_next_psn += (len - 1) / pmtu;
455 wqe->lpsn = qp->s_next_psn++;
456 }
457 ohdr->u.rc.reth.vaddr =
458 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
459 ohdr->u.rc.reth.rkey =
460 cpu_to_be32(wqe->wr.wr.rdma.rkey);
461 ohdr->u.rc.reth.length = cpu_to_be32(len);
462 qp->s_state = OP(RDMA_READ_REQUEST);
463 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
464 ss = NULL;
465 len = 0;
466 bth2 |= IB_BTH_REQ_ACK;
467 if (++qp->s_cur == qp->s_size)
468 qp->s_cur = 0;
469 break;
470
471 case IB_WR_ATOMIC_CMP_AND_SWP:
472 case IB_WR_ATOMIC_FETCH_AND_ADD:
473 /*
474 * Don't allow more operations to be started
475 * than the QP limits allow.
476 */
477 if (newreq) {
478 if (qp->s_num_rd_atomic >=
479 qp->s_max_rd_atomic) {
480 qp->s_flags |= HFI1_S_WAIT_RDMAR;
481 goto bail;
482 }
483 qp->s_num_rd_atomic++;
484 if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
485 qp->s_lsn++;
486 wqe->lpsn = wqe->psn;
487 }
488 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
489 qp->s_state = OP(COMPARE_SWAP);
490 ohdr->u.atomic_eth.swap_data = cpu_to_be64(
491 wqe->wr.wr.atomic.swap);
492 ohdr->u.atomic_eth.compare_data = cpu_to_be64(
493 wqe->wr.wr.atomic.compare_add);
494 } else {
495 qp->s_state = OP(FETCH_ADD);
496 ohdr->u.atomic_eth.swap_data = cpu_to_be64(
497 wqe->wr.wr.atomic.compare_add);
498 ohdr->u.atomic_eth.compare_data = 0;
499 }
500 ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
501 wqe->wr.wr.atomic.remote_addr >> 32);
502 ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
503 wqe->wr.wr.atomic.remote_addr);
504 ohdr->u.atomic_eth.rkey = cpu_to_be32(
505 wqe->wr.wr.atomic.rkey);
506 hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
507 ss = NULL;
508 len = 0;
509 bth2 |= IB_BTH_REQ_ACK;
510 if (++qp->s_cur == qp->s_size)
511 qp->s_cur = 0;
512 break;
513
514 default:
515 goto bail;
516 }
517 qp->s_sge.sge = wqe->sg_list[0];
518 qp->s_sge.sg_list = wqe->sg_list + 1;
519 qp->s_sge.num_sge = wqe->wr.num_sge;
520 qp->s_sge.total_len = wqe->length;
521 qp->s_len = wqe->length;
522 if (newreq) {
523 qp->s_tail++;
524 if (qp->s_tail >= qp->s_size)
525 qp->s_tail = 0;
526 }
527 if (wqe->wr.opcode == IB_WR_RDMA_READ)
528 qp->s_psn = wqe->lpsn + 1;
529 else {
530 qp->s_psn++;
531 if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
532 qp->s_next_psn = qp->s_psn;
533 }
534 break;
535
536 case OP(RDMA_READ_RESPONSE_FIRST):
537 /*
538 * qp->s_state is normally set to the opcode of the
539 * last packet constructed for new requests and therefore
540 * is never set to RDMA read response.
541 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
542 * thread to indicate a SEND needs to be restarted from an
543 * earlier PSN without interfering with the sending thread.
544 * See restart_rc().
545 */
546 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
547 /* FALLTHROUGH */
548 case OP(SEND_FIRST):
549 qp->s_state = OP(SEND_MIDDLE);
550 /* FALLTHROUGH */
551 case OP(SEND_MIDDLE):
552 bth2 = mask_psn(qp->s_psn++);
553 if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
554 qp->s_next_psn = qp->s_psn;
555 ss = &qp->s_sge;
556 len = qp->s_len;
557 if (len > pmtu) {
558 len = pmtu;
559 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
560 break;
561 }
562 if (wqe->wr.opcode == IB_WR_SEND)
563 qp->s_state = OP(SEND_LAST);
564 else {
565 qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
566 /* Immediate data comes after the BTH */
567 ohdr->u.imm_data = wqe->wr.ex.imm_data;
568 hwords += 1;
569 }
570 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
571 bth0 |= IB_BTH_SOLICITED;
572 bth2 |= IB_BTH_REQ_ACK;
573 qp->s_cur++;
574 if (qp->s_cur >= qp->s_size)
575 qp->s_cur = 0;
576 break;
577
578 case OP(RDMA_READ_RESPONSE_LAST):
579 /*
580 * qp->s_state is normally set to the opcode of the
581 * last packet constructed for new requests and therefore
582 * is never set to RDMA read response.
583 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
584 * thread to indicate an RDMA write needs to be restarted from
585 * an earlier PSN without interfering with the sending thread.
586 * See restart_rc().
587 */
588 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
589 /* FALLTHROUGH */
590 case OP(RDMA_WRITE_FIRST):
591 qp->s_state = OP(RDMA_WRITE_MIDDLE);
592 /* FALLTHROUGH */
593 case OP(RDMA_WRITE_MIDDLE):
594 bth2 = mask_psn(qp->s_psn++);
595 if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
596 qp->s_next_psn = qp->s_psn;
597 ss = &qp->s_sge;
598 len = qp->s_len;
599 if (len > pmtu) {
600 len = pmtu;
601 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
602 break;
603 }
604 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
605 qp->s_state = OP(RDMA_WRITE_LAST);
606 else {
607 qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
608 /* Immediate data comes after the BTH */
609 ohdr->u.imm_data = wqe->wr.ex.imm_data;
610 hwords += 1;
611 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
612 bth0 |= IB_BTH_SOLICITED;
613 }
614 bth2 |= IB_BTH_REQ_ACK;
615 qp->s_cur++;
616 if (qp->s_cur >= qp->s_size)
617 qp->s_cur = 0;
618 break;
619
620 case OP(RDMA_READ_RESPONSE_MIDDLE):
621 /*
622 * qp->s_state is normally set to the opcode of the
623 * last packet constructed for new requests and therefore
624 * is never set to RDMA read response.
625 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
626 * thread to indicate an RDMA read needs to be restarted from
627 * an earlier PSN without interfering with the sending thread.
628 * See restart_rc().
629 */
630 len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
631 ohdr->u.rc.reth.vaddr =
632 cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
633 ohdr->u.rc.reth.rkey =
634 cpu_to_be32(wqe->wr.wr.rdma.rkey);
635 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
636 qp->s_state = OP(RDMA_READ_REQUEST);
637 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
638 bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
639 qp->s_psn = wqe->lpsn + 1;
640 ss = NULL;
641 len = 0;
642 qp->s_cur++;
643 if (qp->s_cur == qp->s_size)
644 qp->s_cur = 0;
645 break;
646 }
647 qp->s_sending_hpsn = bth2;
648 delta = delta_psn(bth2, wqe->psn);
649 if (delta && delta % HFI1_PSN_CREDIT == 0)
650 bth2 |= IB_BTH_REQ_ACK;
651 if (qp->s_flags & HFI1_S_SEND_ONE) {
652 qp->s_flags &= ~HFI1_S_SEND_ONE;
653 qp->s_flags |= HFI1_S_WAIT_ACK;
654 bth2 |= IB_BTH_REQ_ACK;
655 }
656 qp->s_len -= len;
657 qp->s_hdrwords = hwords;
658 qp->s_cur_sge = ss;
659 qp->s_cur_size = len;
660 hfi1_make_ruc_header(
661 qp,
662 ohdr,
663 bth0 | (qp->s_state << 24),
664 bth2,
665 middle);
666done:
667 ret = 1;
668 goto unlock;
669
670bail:
671 qp->s_flags &= ~HFI1_S_BUSY;
672unlock:
673 spin_unlock_irqrestore(&qp->s_lock, flags);
674 return ret;
675}
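In the request builder above, a payload of len bytes occupies (len - 1) / pmtu PSNs beyond the first one (a request always consumes at least one PSN), which is how wqe->lpsn is derived from wqe->psn and how s_next_psn is advanced to cover RDMA read responses. Restated as a stand-alone helper (illustrative, not from this patch):

#include <stdint.h>

static inline uint32_t model_last_psn(uint32_t first_psn, uint32_t len,
				      uint32_t pmtu)
{
	uint32_t extra = (len > pmtu) ? (len - 1) / pmtu : 0;

	/* wrap-around is left to the 24-bit cmp_psn()/mask_psn() helpers */
	return first_psn + extra;
}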
676
677/**
678 * hfi1_send_rc_ack - Construct an ACK packet and send it
679 * @qp: a pointer to the QP
680 *
681 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
682 * Note that RDMA reads and atomics are handled in the
683 * send side QP state and tasklet.
684 */
685void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp,
686 int is_fecn)
687{
688 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
689 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
690 u64 pbc, pbc_flags = 0;
691 u16 lrh0;
692 u16 sc5;
693 u32 bth0;
694 u32 hwords;
695 u32 vl, plen;
696 struct send_context *sc;
697 struct pio_buf *pbuf;
698 struct hfi1_ib_header hdr;
699 struct hfi1_other_headers *ohdr;
700
701	/* Don't send ACK or NAK if an RDMA read or atomic is pending. */
702 if (qp->s_flags & HFI1_S_RESP_PENDING)
703 goto queue_ack;
704
705 /* Ensure s_rdma_ack_cnt changes are committed */
706 smp_read_barrier_depends();
707 if (qp->s_rdma_ack_cnt)
708 goto queue_ack;
709
710 /* Construct the header */
711 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
712 hwords = 6;
713 if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
714 hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
715 &qp->remote_ah_attr.grh, hwords, 0);
716 ohdr = &hdr.u.l.oth;
717 lrh0 = HFI1_LRH_GRH;
718 } else {
719 ohdr = &hdr.u.oth;
720 lrh0 = HFI1_LRH_BTH;
721 }
722	/* read pkey_index w/o lock (it's atomic) */
723 bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
724 if (qp->s_mig_state == IB_MIG_MIGRATED)
725 bth0 |= IB_BTH_MIG_REQ;
726 if (qp->r_nak_state)
727 ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
728 (qp->r_nak_state <<
729 HFI1_AETH_CREDIT_SHIFT));
730 else
731 ohdr->u.aeth = hfi1_compute_aeth(qp);
732 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
733 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
734 pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
735 lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
736 hdr.lrh[0] = cpu_to_be16(lrh0);
737 hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
738 hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
739 hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
740 ohdr->bth[0] = cpu_to_be32(bth0);
741 ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
742 ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
743 ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
744
745 /* Don't try to send ACKs if the link isn't ACTIVE */
746 if (driver_lstate(ppd) != IB_PORT_ACTIVE)
747 return;
748
749 sc = rcd->sc;
750 plen = 2 /* PBC */ + hwords;
751 vl = sc_to_vlt(ppd->dd, sc5);
752 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
753
754 pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
755 if (!pbuf) {
756 /*
757 * We have no room to send at the moment. Pass
758 * responsibility for sending the ACK to the send tasklet
759 * so that when enough buffer space becomes available,
760 * the ACK is sent ahead of other outgoing packets.
761 */
762 goto queue_ack;
763 }
764
765 trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
766
767 /* write the pbc and data */
768 ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
769
770 return;
771
772queue_ack:
773 this_cpu_inc(*ibp->rc_qacks);
774 spin_lock(&qp->s_lock);
775 qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING;
776 qp->s_nak_state = qp->r_nak_state;
777 qp->s_ack_psn = qp->r_ack_psn;
778 if (is_fecn)
779 qp->s_flags |= HFI1_S_ECN;
780
781 /* Schedule the send tasklet. */
782 hfi1_schedule_send(qp);
783 spin_unlock(&qp->s_lock);
784}
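hfi1_send_rc_ack() splits the 5-bit SC across the packet: SC[3:0] goes into LRH word 0 bits 15:12 next to the SL in bits 7:4, while SC[4] travels separately in the PBC via PBC_DC_INFO_SHIFT, which is defined elsewhere. A sketch of just that bit packing, with the PBC shift value treated as an assumption:

#include <stdint.h>

static inline uint16_t model_lrh0(uint16_t lrh0_base, uint8_t sc5, uint8_t sl)
{
	return lrh0_base | ((sc5 & 0xf) << 12) | ((sl & 0xf) << 4);
}

static inline uint64_t model_pbc_dc_info(uint8_t sc5, unsigned dc_info_shift)
{
	/* mirror SC[4] (the "DC info" bit) into the PBC flags */
	return ((uint64_t)!!(sc5 & 0x10)) << dc_info_shift;
}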
785
786/**
787 * reset_psn - reset the QP state to send starting from PSN
788 * @qp: the QP
789 * @psn: the packet sequence number to restart at
790 *
791 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
792 * for the given QP.
793 * Called at interrupt level with the QP s_lock held.
794 */
795static void reset_psn(struct hfi1_qp *qp, u32 psn)
796{
797 u32 n = qp->s_acked;
798 struct hfi1_swqe *wqe = get_swqe_ptr(qp, n);
799 u32 opcode;
800
801 qp->s_cur = n;
802
803 /*
804 * If we are starting the request from the beginning,
805 * let the normal send code handle initialization.
806 */
807 if (cmp_psn(psn, wqe->psn) <= 0) {
808 qp->s_state = OP(SEND_LAST);
809 goto done;
810 }
811
812 /* Find the work request opcode corresponding to the given PSN. */
813 opcode = wqe->wr.opcode;
814 for (;;) {
815 int diff;
816
817 if (++n == qp->s_size)
818 n = 0;
819 if (n == qp->s_tail)
820 break;
821 wqe = get_swqe_ptr(qp, n);
822 diff = cmp_psn(psn, wqe->psn);
823 if (diff < 0)
824 break;
825 qp->s_cur = n;
826 /*
827 * If we are starting the request from the beginning,
828 * let the normal send code handle initialization.
829 */
830 if (diff == 0) {
831 qp->s_state = OP(SEND_LAST);
832 goto done;
833 }
834 opcode = wqe->wr.opcode;
835 }
836
837 /*
838 * Set the state to restart in the middle of a request.
839 * Don't change the s_sge, s_cur_sge, or s_cur_size.
840 * See hfi1_make_rc_req().
841 */
842 switch (opcode) {
843 case IB_WR_SEND:
844 case IB_WR_SEND_WITH_IMM:
845 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
846 break;
847
848 case IB_WR_RDMA_WRITE:
849 case IB_WR_RDMA_WRITE_WITH_IMM:
850 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
851 break;
852
853 case IB_WR_RDMA_READ:
854 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
855 break;
856
857 default:
858 /*
859		 * This case shouldn't happen since there is only
860		 * one PSN per request.
861 */
862 qp->s_state = OP(SEND_LAST);
863 }
864done:
865 qp->s_psn = psn;
866 /*
867 * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer
868 * asynchronously before the send tasklet can get scheduled.
869 * Doing it in hfi1_make_rc_req() is too late.
870 */
871 if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
872 (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
873 qp->s_flags |= HFI1_S_WAIT_PSN;
874 qp->s_flags &= ~HFI1_S_AHG_VALID;
875}
876
877/*
878 * Back up requester to resend the last un-ACKed request.
879 * The QP r_lock and s_lock should be held and interrupts disabled.
880 */
881static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait)
882{
883 struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
884 struct hfi1_ibport *ibp;
885
886 if (qp->s_retry == 0) {
887 if (qp->s_mig_state == IB_MIG_ARMED) {
888 hfi1_migrate_qp(qp);
889 qp->s_retry = qp->s_retry_cnt;
890 } else if (qp->s_last == qp->s_acked) {
891 hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
892 hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
893 return;
894 } else /* need to handle delayed completion */
895 return;
896 } else
897 qp->s_retry--;
898
899 ibp = to_iport(qp->ibqp.device, qp->port_num);
900 if (wqe->wr.opcode == IB_WR_RDMA_READ)
901 ibp->n_rc_resends++;
902 else
903 ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
904
905 qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR |
906 HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN |
907 HFI1_S_WAIT_ACK);
908 if (wait)
909 qp->s_flags |= HFI1_S_SEND_ONE;
910 reset_psn(qp, psn);
911}
912
913/*
914 * This is called from s_timer for missing responses.
915 */
916static void rc_timeout(unsigned long arg)
917{
918 struct hfi1_qp *qp = (struct hfi1_qp *)arg;
919 struct hfi1_ibport *ibp;
920 unsigned long flags;
921
922 spin_lock_irqsave(&qp->r_lock, flags);
923 spin_lock(&qp->s_lock);
924 if (qp->s_flags & HFI1_S_TIMER) {
925 ibp = to_iport(qp->ibqp.device, qp->port_num);
926 ibp->n_rc_timeouts++;
927 qp->s_flags &= ~HFI1_S_TIMER;
928 del_timer(&qp->s_timer);
929 restart_rc(qp, qp->s_last_psn + 1, 1);
930 hfi1_schedule_send(qp);
931 }
932 spin_unlock(&qp->s_lock);
933 spin_unlock_irqrestore(&qp->r_lock, flags);
934}
935
936/*
937 * This is called from s_timer for RNR timeouts.
938 */
939void hfi1_rc_rnr_retry(unsigned long arg)
940{
941 struct hfi1_qp *qp = (struct hfi1_qp *)arg;
942 unsigned long flags;
943
944 spin_lock_irqsave(&qp->s_lock, flags);
945 if (qp->s_flags & HFI1_S_WAIT_RNR) {
946 qp->s_flags &= ~HFI1_S_WAIT_RNR;
947 del_timer(&qp->s_timer);
948 hfi1_schedule_send(qp);
949 }
950 spin_unlock_irqrestore(&qp->s_lock, flags);
951}
952
953/*
954 * Set qp->s_sending_psn to the next PSN after the given one.
955 * This would be psn+1 except when RDMA reads are present.
956 */
957static void reset_sending_psn(struct hfi1_qp *qp, u32 psn)
958{
959 struct hfi1_swqe *wqe;
960 u32 n = qp->s_last;
961
962 /* Find the work request corresponding to the given PSN. */
963 for (;;) {
964 wqe = get_swqe_ptr(qp, n);
965 if (cmp_psn(psn, wqe->lpsn) <= 0) {
966 if (wqe->wr.opcode == IB_WR_RDMA_READ)
967 qp->s_sending_psn = wqe->lpsn + 1;
968 else
969 qp->s_sending_psn = psn + 1;
970 break;
971 }
972 if (++n == qp->s_size)
973 n = 0;
974 if (n == qp->s_tail)
975 break;
976 }
977}
978
979/*
980 * This should be called with the QP s_lock held and interrupts disabled.
981 */
982void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr)
983{
984 struct hfi1_other_headers *ohdr;
985 struct hfi1_swqe *wqe;
986 struct ib_wc wc;
987 unsigned i;
988 u32 opcode;
989 u32 psn;
990
991 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
992 return;
993
994 /* Find out where the BTH is */
995 if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
996 ohdr = &hdr->u.oth;
997 else
998 ohdr = &hdr->u.l.oth;
999
1000 opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
1001 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1002 opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1003 WARN_ON(!qp->s_rdma_ack_cnt);
1004 qp->s_rdma_ack_cnt--;
1005 return;
1006 }
1007
1008 psn = be32_to_cpu(ohdr->bth[2]);
1009 reset_sending_psn(qp, psn);
1010
1011 /*
1012 * Start timer after a packet requesting an ACK has been sent and
1013 * there are still requests that haven't been acked.
1014 */
1015 if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1016 !(qp->s_flags &
1017 (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) &&
1018 (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
1019 start_timer(qp);
1020
1021 while (qp->s_last != qp->s_acked) {
1022 wqe = get_swqe_ptr(qp, qp->s_last);
1023 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1024 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1025 break;
1026 for (i = 0; i < wqe->wr.num_sge; i++) {
1027 struct hfi1_sge *sge = &wqe->sg_list[i];
1028
1029 hfi1_put_mr(sge->mr);
1030 }
1031 /* Post a send completion queue entry if requested. */
1032 if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
1033 (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1034 memset(&wc, 0, sizeof(wc));
1035 wc.wr_id = wqe->wr.wr_id;
1036 wc.status = IB_WC_SUCCESS;
1037 wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
1038 wc.byte_len = wqe->length;
1039 wc.qp = &qp->ibqp;
1040 hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1041 }
1042 if (++qp->s_last >= qp->s_size)
1043 qp->s_last = 0;
1044 }
1045 /*
1046 * If we were waiting for sends to complete before re-sending,
1047 * and they are now complete, restart sending.
1048 */
1049 trace_hfi1_rc_sendcomplete(qp, psn);
1050 if (qp->s_flags & HFI1_S_WAIT_PSN &&
1051 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1052 qp->s_flags &= ~HFI1_S_WAIT_PSN;
1053 qp->s_sending_psn = qp->s_psn;
1054 qp->s_sending_hpsn = qp->s_psn - 1;
1055 hfi1_schedule_send(qp);
1056 }
1057}
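The "psn & IB_BTH_REQ_ACK" test above works because the AckReq flag shares BTH word 2 with the 24-bit PSN, so the same value that the request path builds as "mask_psn(psn) | IB_BTH_REQ_ACK" can be tested directly on the word read back from the header. A stand-alone model, with the flag's bit position treated as an assumption:

#include <stdbool.h>
#include <stdint.h>

#define MODEL_PSN_MASK     0x00FFFFFFu
#define MODEL_BTH_REQ_ACK  (1u << 31)	/* assumed position of the AckReq bit */

static inline uint32_t model_build_bth2(uint32_t psn, bool want_ack)
{
	return (psn & MODEL_PSN_MASK) | (want_ack ? MODEL_BTH_REQ_ACK : 0);
}

static inline bool model_ack_requested(uint32_t bth2)
{
	return (bth2 & MODEL_BTH_REQ_ACK) != 0;
}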
1058
1059static inline void update_last_psn(struct hfi1_qp *qp, u32 psn)
1060{
1061 qp->s_last_psn = psn;
1062}
1063
1064/*
1065 * Generate a SWQE completion.
1066 * This is similar to hfi1_send_complete but has to check to be sure
1067 * that the SGEs are not being referenced if the SWQE is being resent.
1068 */
1069static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp,
1070 struct hfi1_swqe *wqe,
1071 struct hfi1_ibport *ibp)
1072{
1073 struct ib_wc wc;
1074 unsigned i;
1075
1076 /*
1077 * Don't decrement refcount and don't generate a
1078 * completion if the SWQE is being resent until the send
1079 * is finished.
1080 */
1081 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1082 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1083 for (i = 0; i < wqe->wr.num_sge; i++) {
1084 struct hfi1_sge *sge = &wqe->sg_list[i];
1085
1086 hfi1_put_mr(sge->mr);
1087 }
1088 /* Post a send completion queue entry if requested. */
1089 if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
1090 (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1091 memset(&wc, 0, sizeof(wc));
1092 wc.wr_id = wqe->wr.wr_id;
1093 wc.status = IB_WC_SUCCESS;
1094 wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
1095 wc.byte_len = wqe->length;
1096 wc.qp = &qp->ibqp;
1097 hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1098 }
1099 if (++qp->s_last >= qp->s_size)
1100 qp->s_last = 0;
1101 } else {
1102 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1103
1104 this_cpu_inc(*ibp->rc_delayed_comp);
1105 /*
1106		 * If send progress is not running, attempt to progress
1107		 * the SDMA queue.
1108 */
1109 if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1110 struct sdma_engine *engine;
1111 u8 sc5;
1112
1113 /* For now use sc to find engine */
1114 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
1115 engine = qp_to_sdma_engine(qp, sc5);
1116 sdma_engine_progress_schedule(engine);
1117 }
1118 }
1119
1120 qp->s_retry = qp->s_retry_cnt;
1121 update_last_psn(qp, wqe->lpsn);
1122
1123 /*
1124 * If we are completing a request which is in the process of
1125 * being resent, we can stop re-sending it since we know the
1126 * responder has already seen it.
1127 */
1128 if (qp->s_acked == qp->s_cur) {
1129 if (++qp->s_cur >= qp->s_size)
1130 qp->s_cur = 0;
1131 qp->s_acked = qp->s_cur;
1132 wqe = get_swqe_ptr(qp, qp->s_cur);
1133 if (qp->s_acked != qp->s_tail) {
1134 qp->s_state = OP(SEND_LAST);
1135 qp->s_psn = wqe->psn;
1136 }
1137 } else {
1138 if (++qp->s_acked >= qp->s_size)
1139 qp->s_acked = 0;
1140 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1141 qp->s_draining = 0;
1142 wqe = get_swqe_ptr(qp, qp->s_acked);
1143 }
1144 return wqe;
1145}
1146
1147/**
1148 * do_rc_ack - process an incoming RC ACK
1149 * @qp: the QP the ACK came in on
1150 * @psn: the packet sequence number of the ACK
1151 * @opcode: the opcode of the request that resulted in the ACK
1152 *
1153 * This is called from rc_rcv_resp() to process an incoming RC ACK
1154 * for the given QP.
1155 * Called at interrupt level with the QP s_lock held.
1156 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1157 */
1158static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode,
1159 u64 val, struct hfi1_ctxtdata *rcd)
1160{
1161 struct hfi1_ibport *ibp;
1162 enum ib_wc_status status;
1163 struct hfi1_swqe *wqe;
1164 int ret = 0;
1165 u32 ack_psn;
1166 int diff;
1167
1168 /* Remove QP from retry timer */
1169 if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
1170 qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
1171 del_timer(&qp->s_timer);
1172 }
1173
1174 /*
1175 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1176 * requests and implicitly NAK RDMA read and atomic requests issued
1177 * before the NAK'ed request. The MSN won't include the NAK'ed
1178 * request but will include an ACK'ed request(s).
1179 */
1180 ack_psn = psn;
1181 if (aeth >> 29)
1182 ack_psn--;
1183 wqe = get_swqe_ptr(qp, qp->s_acked);
1184 ibp = to_iport(qp->ibqp.device, qp->port_num);
1185
1186 /*
1187 * The MSN might be for a later WQE than the PSN indicates so
1188 * only complete WQEs that the PSN finishes.
1189 */
1190 while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1191 /*
1192 * RDMA_READ_RESPONSE_ONLY is a special case since
1193 * we want to generate completion events for everything
1194 * before the RDMA read, copy the data, then generate
1195 * the completion for the read.
1196 */
1197 if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1198 opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1199 diff == 0) {
1200 ret = 1;
1201 goto bail;
1202 }
1203 /*
1204		 * If this request is an RDMA read or atomic, and the ACK is
1205		 * for a later operation, this ACK NAKs the RDMA read or
1206		 * atomic. In other words, only an RDMA_READ_LAST or ONLY
1207		 * can ACK an RDMA read, and likewise for atomic ops. Note
1208 * that the NAK case can only happen if relaxed ordering is
1209 * used and requests are sent after an RDMA read or atomic
1210 * is sent but before the response is received.
1211 */
1212 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1213 (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1214 ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1215 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1216 (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1217 /* Retry this request. */
1218 if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) {
1219 qp->r_flags |= HFI1_R_RDMAR_SEQ;
1220 restart_rc(qp, qp->s_last_psn + 1, 0);
1221 if (list_empty(&qp->rspwait)) {
1222 qp->r_flags |= HFI1_R_RSP_SEND;
1223 atomic_inc(&qp->refcount);
1224 list_add_tail(&qp->rspwait,
1225 &rcd->qp_wait_list);
1226 }
1227 }
1228 /*
1229 * No need to process the ACK/NAK since we are
1230 * restarting an earlier request.
1231 */
1232 goto bail;
1233 }
1234 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1235 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1236 u64 *vaddr = wqe->sg_list[0].vaddr;
1237 *vaddr = val;
1238 }
1239 if (qp->s_num_rd_atomic &&
1240 (wqe->wr.opcode == IB_WR_RDMA_READ ||
1241 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1242 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1243 qp->s_num_rd_atomic--;
1244 /* Restart sending task if fence is complete */
1245 if ((qp->s_flags & HFI1_S_WAIT_FENCE) &&
1246 !qp->s_num_rd_atomic) {
1247 qp->s_flags &= ~(HFI1_S_WAIT_FENCE |
1248 HFI1_S_WAIT_ACK);
1249 hfi1_schedule_send(qp);
1250 } else if (qp->s_flags & HFI1_S_WAIT_RDMAR) {
1251 qp->s_flags &= ~(HFI1_S_WAIT_RDMAR |
1252 HFI1_S_WAIT_ACK);
1253 hfi1_schedule_send(qp);
1254 }
1255 }
1256 wqe = do_rc_completion(qp, wqe, ibp);
1257 if (qp->s_acked == qp->s_tail)
1258 break;
1259 }
1260
1261 switch (aeth >> 29) {
1262 case 0: /* ACK */
1263 this_cpu_inc(*ibp->rc_acks);
1264 if (qp->s_acked != qp->s_tail) {
1265 /*
1266 * We are expecting more ACKs so
1267 * reset the re-transmit timer.
1268 */
1269 start_timer(qp);
1270 /*
1271 * We can stop re-sending the earlier packets and
1272 * continue with the next packet the receiver wants.
1273 */
1274 if (cmp_psn(qp->s_psn, psn) <= 0)
1275 reset_psn(qp, psn + 1);
1276 } else if (cmp_psn(qp->s_psn, psn) <= 0) {
1277 qp->s_state = OP(SEND_LAST);
1278 qp->s_psn = psn + 1;
1279 }
1280 if (qp->s_flags & HFI1_S_WAIT_ACK) {
1281 qp->s_flags &= ~HFI1_S_WAIT_ACK;
1282 hfi1_schedule_send(qp);
1283 }
1284 hfi1_get_credit(qp, aeth);
1285 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1286 qp->s_retry = qp->s_retry_cnt;
1287 update_last_psn(qp, psn);
1288 ret = 1;
1289 goto bail;
1290
1291 case 1: /* RNR NAK */
1292 ibp->n_rnr_naks++;
1293 if (qp->s_acked == qp->s_tail)
1294 goto bail;
1295 if (qp->s_flags & HFI1_S_WAIT_RNR)
1296 goto bail;
1297 if (qp->s_rnr_retry == 0) {
1298 status = IB_WC_RNR_RETRY_EXC_ERR;
1299 goto class_b;
1300 }
1301 if (qp->s_rnr_retry_cnt < 7)
1302 qp->s_rnr_retry--;
1303
1304 /* The last valid PSN is the previous PSN. */
1305 update_last_psn(qp, psn - 1);
1306
1307 ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
1308
1309 reset_psn(qp, psn);
1310
1311 qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK);
1312 qp->s_flags |= HFI1_S_WAIT_RNR;
1313 qp->s_timer.function = hfi1_rc_rnr_retry;
1314 qp->s_timer.expires = jiffies + usecs_to_jiffies(
1315 ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
1316 HFI1_AETH_CREDIT_MASK]);
1317 add_timer(&qp->s_timer);
1318 goto bail;
1319
1320 case 3: /* NAK */
1321 if (qp->s_acked == qp->s_tail)
1322 goto bail;
1323 /* The last valid PSN is the previous PSN. */
1324 update_last_psn(qp, psn - 1);
1325 switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
1326 HFI1_AETH_CREDIT_MASK) {
1327 case 0: /* PSN sequence error */
1328 ibp->n_seq_naks++;
1329 /*
1330 * Back up to the responder's expected PSN.
1331 * Note that we might get a NAK in the middle of an
1332 * RDMA READ response which terminates the RDMA
1333 * READ.
1334 */
1335 restart_rc(qp, psn, 0);
1336 hfi1_schedule_send(qp);
1337 break;
1338
1339 case 1: /* Invalid Request */
1340 status = IB_WC_REM_INV_REQ_ERR;
1341 ibp->n_other_naks++;
1342 goto class_b;
1343
1344 case 2: /* Remote Access Error */
1345 status = IB_WC_REM_ACCESS_ERR;
1346 ibp->n_other_naks++;
1347 goto class_b;
1348
1349 case 3: /* Remote Operation Error */
1350 status = IB_WC_REM_OP_ERR;
1351 ibp->n_other_naks++;
1352class_b:
1353 if (qp->s_last == qp->s_acked) {
1354 hfi1_send_complete(qp, wqe, status);
1355 hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1356 }
1357 break;
1358
1359 default:
1360 /* Ignore other reserved NAK error codes */
1361 goto reserved;
1362 }
1363 qp->s_retry = qp->s_retry_cnt;
1364 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1365 goto bail;
1366
1367 default: /* 2: reserved */
1368reserved:
1369 /* Ignore reserved NAK codes. */
1370 goto bail;
1371 }
1372
1373bail:
1374 return ret;
1375}
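do_rc_ack() classifies the AETH by its top three bits (0 = ACK, 1 = RNR NAK, 3 = NAK, 2 = reserved) and extracts the credit/NAK-code sub-field with HFI1_AETH_CREDIT_SHIFT and HFI1_AETH_CREDIT_MASK, whose values are defined elsewhere in the driver. A stand-alone decoder with assumed field widths (a 5-bit credit field above a 24-bit MSN), for illustration only:

#include <stdint.h>

#define MODEL_AETH_CREDIT_SHIFT 24		/* assumed */
#define MODEL_AETH_CREDIT_MASK  0x1F		/* assumed */
#define MODEL_MSN_MASK          0xFFFFFF	/* assumed */

enum model_aeth_type { MODEL_ACK = 0, MODEL_RNR_NAK = 1, MODEL_NAK = 3 };

struct model_aeth {
	enum model_aeth_type type;
	uint32_t credit;	/* credit count for ACKs, NAK code for NAKs */
	uint32_t msn;
};

static inline struct model_aeth model_decode_aeth(uint32_t aeth)
{
	struct model_aeth out;

	out.type   = (enum model_aeth_type)(aeth >> 29);
	out.credit = (aeth >> MODEL_AETH_CREDIT_SHIFT) & MODEL_AETH_CREDIT_MASK;
	out.msn    = aeth & MODEL_MSN_MASK;
	return out;
}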
1376
1377/*
1378 * We have seen an out of sequence RDMA read middle or last packet.
1379 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1380 */
1381static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn,
1382 struct hfi1_ctxtdata *rcd)
1383{
1384 struct hfi1_swqe *wqe;
1385
1386 /* Remove QP from retry timer */
1387 if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
1388 qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
1389 del_timer(&qp->s_timer);
1390 }
1391
1392 wqe = get_swqe_ptr(qp, qp->s_acked);
1393
1394 while (cmp_psn(psn, wqe->lpsn) > 0) {
1395 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1396 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1397 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1398 break;
1399 wqe = do_rc_completion(qp, wqe, ibp);
1400 }
1401
1402 ibp->n_rdma_seq++;
1403 qp->r_flags |= HFI1_R_RDMAR_SEQ;
1404 restart_rc(qp, qp->s_last_psn + 1, 0);
1405 if (list_empty(&qp->rspwait)) {
1406 qp->r_flags |= HFI1_R_RSP_SEND;
1407 atomic_inc(&qp->refcount);
1408 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1409 }
1410}
1411
1412/**
1413 * rc_rcv_resp - process an incoming RC response packet
1414 * @ibp: the port this packet came in on
1415 * @ohdr: the other headers for this packet
1416 * @data: the packet data
1417 * @tlen: the packet length
1418 * @qp: the QP for this packet
1419 * @opcode: the opcode for this packet
1420 * @psn: the packet sequence number for this packet
1421 * @hdrsize: the header length
1422 * @pmtu: the path MTU
1423 *
1424 * This is called from hfi1_rc_rcv() to process an incoming RC response
1425 * packet for the given QP.
1426 * Called at interrupt level.
1427 */
1428static void rc_rcv_resp(struct hfi1_ibport *ibp,
1429 struct hfi1_other_headers *ohdr,
1430 void *data, u32 tlen, struct hfi1_qp *qp,
1431 u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
1432 struct hfi1_ctxtdata *rcd)
1433{
1434 struct hfi1_swqe *wqe;
1435 enum ib_wc_status status;
1436 unsigned long flags;
1437 int diff;
1438 u32 pad;
1439 u32 aeth;
1440 u64 val;
1441
1442 spin_lock_irqsave(&qp->s_lock, flags);
1443
1444 /* Ignore invalid responses. */
1445 if (cmp_psn(psn, qp->s_next_psn) >= 0)
1446 goto ack_done;
1447
1448 /* Ignore duplicate responses. */
1449 diff = cmp_psn(psn, qp->s_last_psn);
1450 if (unlikely(diff <= 0)) {
1451 /* Update credits for "ghost" ACKs */
1452 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1453 aeth = be32_to_cpu(ohdr->u.aeth);
1454 if ((aeth >> 29) == 0)
1455 hfi1_get_credit(qp, aeth);
1456 }
1457 goto ack_done;
1458 }
1459
1460 /*
1461 * Skip everything other than the PSN we expect, if we are waiting
1462 * for a reply to a restarted RDMA read or atomic op.
1463 */
1464 if (qp->r_flags & HFI1_R_RDMAR_SEQ) {
1465 if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
1466 goto ack_done;
1467 qp->r_flags &= ~HFI1_R_RDMAR_SEQ;
1468 }
1469
1470 if (unlikely(qp->s_acked == qp->s_tail))
1471 goto ack_done;
1472 wqe = get_swqe_ptr(qp, qp->s_acked);
1473 status = IB_WC_SUCCESS;
1474
1475 switch (opcode) {
1476 case OP(ACKNOWLEDGE):
1477 case OP(ATOMIC_ACKNOWLEDGE):
1478 case OP(RDMA_READ_RESPONSE_FIRST):
1479 aeth = be32_to_cpu(ohdr->u.aeth);
1480 if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
1481 __be32 *p = ohdr->u.at.atomic_ack_eth;
1482
1483 val = ((u64) be32_to_cpu(p[0]) << 32) |
1484 be32_to_cpu(p[1]);
1485 } else
1486 val = 0;
1487 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1488 opcode != OP(RDMA_READ_RESPONSE_FIRST))
1489 goto ack_done;
1490 wqe = get_swqe_ptr(qp, qp->s_acked);
1491 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1492 goto ack_op_err;
1493 /*
1494 * If this is a response to a resent RDMA read, we
1495 * have to be careful to copy the data to the right
1496 * location.
1497 */
1498 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1499 wqe, psn, pmtu);
1500 goto read_middle;
1501
1502 case OP(RDMA_READ_RESPONSE_MIDDLE):
1503 /* no AETH, no ACK */
1504 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1505 goto ack_seq_err;
1506 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1507 goto ack_op_err;
1508read_middle:
1509 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1510 goto ack_len_err;
1511 if (unlikely(pmtu >= qp->s_rdma_read_len))
1512 goto ack_len_err;
1513
1514 /*
1515 * We got a response so update the timeout.
1516 * 4.096 usec. * (1 << qp->timeout)
1517 */
1518 qp->s_flags |= HFI1_S_TIMER;
1519 mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
1520 if (qp->s_flags & HFI1_S_WAIT_ACK) {
1521 qp->s_flags &= ~HFI1_S_WAIT_ACK;
1522 hfi1_schedule_send(qp);
1523 }
1524
1525 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1526 qp->s_retry = qp->s_retry_cnt;
1527
1528 /*
1529 * Update the RDMA receive state but do the copy w/o
1530 * holding the locks and blocking interrupts.
1531 */
1532 qp->s_rdma_read_len -= pmtu;
1533 update_last_psn(qp, psn);
1534 spin_unlock_irqrestore(&qp->s_lock, flags);
1535 hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
1536 goto bail;
1537
1538 case OP(RDMA_READ_RESPONSE_ONLY):
1539 aeth = be32_to_cpu(ohdr->u.aeth);
1540 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1541 goto ack_done;
1542 /* Get the number of bytes the message was padded by. */
1543 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1544 /*
1545 * Check that the data size is >= 0 && <= pmtu.
1546 * Remember to account for ICRC (4).
1547 */
1548 if (unlikely(tlen < (hdrsize + pad + 4)))
1549 goto ack_len_err;
1550 /*
1551 * If this is a response to a resent RDMA read, we
1552 * have to be careful to copy the data to the right
1553 * location.
1554 */
1555 wqe = get_swqe_ptr(qp, qp->s_acked);
1556 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1557 wqe, psn, pmtu);
1558 goto read_last;
1559
1560 case OP(RDMA_READ_RESPONSE_LAST):
1561 /* ACKs READ req. */
1562 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1563 goto ack_seq_err;
1564 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1565 goto ack_op_err;
1566 /* Get the number of bytes the message was padded by. */
1567 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1568 /*
1569 * Check that the data size is >= 1 && <= pmtu.
1570 * Remember to account for ICRC (4).
1571 */
1572 if (unlikely(tlen <= (hdrsize + pad + 4)))
1573 goto ack_len_err;
1574read_last:
1575 tlen -= hdrsize + pad + 4;
1576 if (unlikely(tlen != qp->s_rdma_read_len))
1577 goto ack_len_err;
1578 aeth = be32_to_cpu(ohdr->u.aeth);
1579 hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
1580 WARN_ON(qp->s_rdma_read_sge.num_sge);
1581 (void) do_rc_ack(qp, aeth, psn,
1582 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1583 goto ack_done;
1584 }
1585
1586ack_op_err:
1587 status = IB_WC_LOC_QP_OP_ERR;
1588 goto ack_err;
1589
1590ack_seq_err:
1591 rdma_seq_err(qp, ibp, psn, rcd);
1592 goto ack_done;
1593
1594ack_len_err:
1595 status = IB_WC_LOC_LEN_ERR;
1596ack_err:
1597 if (qp->s_last == qp->s_acked) {
1598 hfi1_send_complete(qp, wqe, status);
1599 hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1600 }
1601ack_done:
1602 spin_unlock_irqrestore(&qp->s_lock, flags);
1603bail:
1604 return;
1605}
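The length checks in rc_rcv_resp() reduce to two rules: a READ response MIDDLE packet must carry exactly one pmtu of payload, and a LAST packet's payload (total length minus header, pad and 4-byte ICRC) must equal the bytes still outstanding for the read (the ONLY case differs just in permitting a zero-byte payload). Restated as stand-alone predicates, for illustration only:

#include <stdbool.h>
#include <stdint.h>

static inline uint32_t model_pad_bytes(uint32_t bth0)
{
	return (bth0 >> 20) & 3;	/* PadCnt field of BTH word 0 */
}

static inline bool model_middle_len_ok(uint32_t tlen, uint32_t hdrsize,
				       uint32_t pmtu)
{
	return tlen == hdrsize + pmtu + 4;	/* + 4 for the ICRC */
}

static inline bool model_last_len_ok(uint32_t tlen, uint32_t hdrsize,
				     uint32_t pad, uint32_t rdma_read_len)
{
	return tlen > hdrsize + pad + 4 &&
	       tlen - (hdrsize + pad + 4) == rdma_read_len;
}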
1606
1607/**
1608 * rc_rcv_error - process an incoming duplicate or error RC packet
1609 * @ohdr: the other headers for this packet
1610 * @data: the packet data
1611 * @qp: the QP for this packet
1612 * @opcode: the opcode for this packet
1613 * @psn: the packet sequence number for this packet
1614 * @diff: the difference between the PSN and the expected PSN
1615 *
1616 * This is called from hfi1_rc_rcv() to process an unexpected
1617 * incoming RC packet for the given QP.
1618 * Called at interrupt level.
1619 * Return 1 if no more processing is needed; otherwise return 0 to
1620 * schedule a response to be sent.
1621 */
1622static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
1623 struct hfi1_qp *qp, u32 opcode, u32 psn, int diff,
1624 struct hfi1_ctxtdata *rcd)
1625{
1626 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1627 struct hfi1_ack_entry *e;
1628 unsigned long flags;
1629 u8 i, prev;
1630 int old_req;
1631
1632 if (diff > 0) {
1633 /*
1634 * Packet sequence error.
1635 * A NAK will ACK earlier sends and RDMA writes.
1636 * Don't queue the NAK if we already sent one.
1637 */
1638 if (!qp->r_nak_state) {
1639 ibp->n_rc_seqnak++;
1640 qp->r_nak_state = IB_NAK_PSN_ERROR;
1641 /* Use the expected PSN. */
1642 qp->r_ack_psn = qp->r_psn;
1643 /*
1644 * Wait to send the sequence NAK until all packets
1645 * in the receive queue have been processed.
1646 * Otherwise, we end up propagating congestion.
1647 */
1648 if (list_empty(&qp->rspwait)) {
1649 qp->r_flags |= HFI1_R_RSP_NAK;
1650 atomic_inc(&qp->refcount);
1651 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1652 }
1653 }
1654 goto done;
1655 }
1656
1657 /*
1658 * Handle a duplicate request. Don't re-execute SEND, RDMA
1659 * write or atomic op. Don't NAK errors, just silently drop
1660 * the duplicate request. Note that r_sge, r_len, and
1661 * r_rcv_len may be in use so don't modify them.
1662 *
1663 * We are supposed to ACK the earliest duplicate PSN but we
1664 * can coalesce an outstanding duplicate ACK. We have to
1665 * send the earliest so that RDMA reads can be restarted at
1666 * the requester's expected PSN.
1667 *
1668 * First, find where this duplicate PSN falls within the
1669 * ACKs previously sent.
1670 * old_req is true if there is an older response that is scheduled
1671 * to be sent before sending this one.
1672 */
1673 e = NULL;
1674 old_req = 1;
1675 ibp->n_rc_dupreq++;
1676
1677 spin_lock_irqsave(&qp->s_lock, flags);
1678
1679 for (i = qp->r_head_ack_queue; ; i = prev) {
1680 if (i == qp->s_tail_ack_queue)
1681 old_req = 0;
1682 if (i)
1683 prev = i - 1;
1684 else
1685 prev = HFI1_MAX_RDMA_ATOMIC;
1686 if (prev == qp->r_head_ack_queue) {
1687 e = NULL;
1688 break;
1689 }
1690 e = &qp->s_ack_queue[prev];
1691 if (!e->opcode) {
1692 e = NULL;
1693 break;
1694 }
1695 if (cmp_psn(psn, e->psn) >= 0) {
1696 if (prev == qp->s_tail_ack_queue &&
1697 cmp_psn(psn, e->lpsn) <= 0)
1698 old_req = 0;
1699 break;
1700 }
1701 }
1702 switch (opcode) {
1703 case OP(RDMA_READ_REQUEST): {
1704 struct ib_reth *reth;
1705 u32 offset;
1706 u32 len;
1707
1708 /*
1709 * If we didn't find the RDMA read request in the ack queue,
1710 * we can ignore this request.
1711 */
1712 if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1713 goto unlock_done;
1714 /* RETH comes after BTH */
1715 reth = &ohdr->u.rc.reth;
1716 /*
1717 * Address range must be a subset of the original
1718 * request and start on pmtu boundaries.
1719 * We reuse the old ack_queue slot since the requester
1720 * should not back up and request an earlier PSN for the
1721 * same request.
1722 */
1723 offset = delta_psn(psn, e->psn) * qp->pmtu;
1724 len = be32_to_cpu(reth->length);
1725 if (unlikely(offset + len != e->rdma_sge.sge_length))
1726 goto unlock_done;
1727 if (e->rdma_sge.mr) {
1728 hfi1_put_mr(e->rdma_sge.mr);
1729 e->rdma_sge.mr = NULL;
1730 }
1731 if (len != 0) {
1732 u32 rkey = be32_to_cpu(reth->rkey);
1733 u64 vaddr = be64_to_cpu(reth->vaddr);
1734 int ok;
1735
1736 ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1737 IB_ACCESS_REMOTE_READ);
1738 if (unlikely(!ok))
1739 goto unlock_done;
1740 } else {
1741 e->rdma_sge.vaddr = NULL;
1742 e->rdma_sge.length = 0;
1743 e->rdma_sge.sge_length = 0;
1744 }
1745 e->psn = psn;
1746 if (old_req)
1747 goto unlock_done;
1748 qp->s_tail_ack_queue = prev;
1749 break;
1750 }
1751
1752 case OP(COMPARE_SWAP):
1753 case OP(FETCH_ADD): {
1754 /*
1755 * If we didn't find the atomic request in the ack queue
1756 * or the send tasklet is already backed up to send an
1757 * earlier entry, we can ignore this request.
1758 */
1759 if (!e || e->opcode != (u8) opcode || old_req)
1760 goto unlock_done;
1761 qp->s_tail_ack_queue = prev;
1762 break;
1763 }
1764
1765 default:
1766 /*
1767 * Ignore this operation if it doesn't request an ACK,
1768 * or if an earlier RDMA read or atomic is going to be resent.
1769 */
1770 if (!(psn & IB_BTH_REQ_ACK) || old_req)
1771 goto unlock_done;
1772 /*
1773 * Resend the most recent ACK if this request is
1774 * after all the previous RDMA reads and atomics.
1775 */
1776 if (i == qp->r_head_ack_queue) {
1777 spin_unlock_irqrestore(&qp->s_lock, flags);
1778 qp->r_nak_state = 0;
1779 qp->r_ack_psn = qp->r_psn - 1;
1780 goto send_ack;
1781 }
1782
1783 /*
1784 * Resend the RDMA read or atomic op which
1785 * ACKs this duplicate request.
1786 */
1787 qp->s_tail_ack_queue = i;
1788 break;
1789 }
1790 qp->s_ack_state = OP(ACKNOWLEDGE);
1791 qp->s_flags |= HFI1_S_RESP_PENDING;
1792 qp->r_nak_state = 0;
1793 hfi1_schedule_send(qp);
1794
1795unlock_done:
1796 spin_unlock_irqrestore(&qp->s_lock, flags);
1797done:
1798 return 1;
1799
1800send_ack:
1801 return 0;
1802}
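/*
 * Illustrative sketch, not part of the patch: the cmp_psn()/delta_psn()
 * helpers used above are defined elsewhere in the driver and operate on
 * 24-bit packet sequence numbers, so differences must be taken modulo 2^24
 * and sign-extended.  A minimal, self-contained equivalent (the helper name
 * is hypothetical) looks like this:
 */
static inline int example_psn_delta24(u32 a, u32 b)
{
	/* difference modulo 2^24, then sign-extend from bit 23 */
	u32 d = (a - b) & 0xffffff;

	return (d & 0x800000) ? (int)d - 0x1000000 : (int)d;
}
/* e.g. example_psn_delta24(0x000002, 0xfffffe) == 4 across the PSN wrap */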
1803
1804void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err)
1805{
1806 unsigned long flags;
1807 int lastwqe;
1808
1809 spin_lock_irqsave(&qp->s_lock, flags);
1810 lastwqe = hfi1_error_qp(qp, err);
1811 spin_unlock_irqrestore(&qp->s_lock, flags);
1812
1813 if (lastwqe) {
1814 struct ib_event ev;
1815
1816 ev.device = qp->ibqp.device;
1817 ev.element.qp = &qp->ibqp;
1818 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1819 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1820 }
1821}
1822
1823static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n)
1824{
1825 unsigned next;
1826
1827 next = n + 1;
1828 if (next > HFI1_MAX_RDMA_ATOMIC)
1829 next = 0;
1830 qp->s_tail_ack_queue = next;
1831 qp->s_ack_state = OP(ACKNOWLEDGE);
1832}
1833
1834static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
1835 u32 lqpn, u32 rqpn, u8 svc_type)
1836{
1837 struct opa_hfi1_cong_log_event_internal *cc_event;
1838
1839 if (sl >= OPA_MAX_SLS)
1840 return;
1841
1842 spin_lock(&ppd->cc_log_lock);
1843
1844 ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8);
1845 ppd->threshold_event_counter++;
1846
1847 cc_event = &ppd->cc_events[ppd->cc_log_idx++];
1848 if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
1849 ppd->cc_log_idx = 0;
1850 cc_event->lqpn = lqpn & HFI1_QPN_MASK;
1851 cc_event->rqpn = rqpn & HFI1_QPN_MASK;
1852 cc_event->sl = sl;
1853 cc_event->svc_type = svc_type;
1854 cc_event->rlid = rlid;
1855 /* keep timestamp in units of 1.024 usec */
1856 cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
1857
1858 spin_unlock(&ppd->cc_log_lock);
1859}
1860
1861void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
1862 u32 rqpn, u8 svc_type)
1863{
1864 struct cca_timer *cca_timer;
1865 u16 ccti, ccti_incr, ccti_timer, ccti_limit;
1866 u8 trigger_threshold;
1867 struct cc_state *cc_state;
1868
1869 if (sl >= OPA_MAX_SLS)
1870 return;
1871
1872 cca_timer = &ppd->cca_timer[sl];
1873
1874 cc_state = get_cc_state(ppd);
1875
1876 if (cc_state == NULL)
1877 return;
1878
1879 /*
1880 * 1) increase CCTI (for this SL)
1881 * 2) select IPG (i.e., call set_link_ipg())
1882 * 3) start timer
1883 */
1884 ccti_limit = cc_state->cct.ccti_limit;
1885 ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
1886 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
1887 trigger_threshold =
1888 cc_state->cong_setting.entries[sl].trigger_threshold;
1889
1890 spin_lock(&ppd->cca_timer_lock);
1891
1892 if (cca_timer->ccti < ccti_limit) {
1893 if (cca_timer->ccti + ccti_incr <= ccti_limit)
1894 cca_timer->ccti += ccti_incr;
1895 else
1896 cca_timer->ccti = ccti_limit;
1897 set_link_ipg(ppd);
1898 }
1899
1900 spin_unlock(&ppd->cca_timer_lock);
1901
1902 ccti = cca_timer->ccti;
1903
1904 if (!hrtimer_active(&cca_timer->hrtimer)) {
1905 /* ccti_timer is in units of 1.024 usec */
1906 unsigned long nsec = 1024 * ccti_timer;
1907
1908 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
1909 HRTIMER_MODE_REL);
1910 }
1911
1912 if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
1913 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
1914}
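/*
 * Worked example (illustrative): ccti_timer is carried in units of
 * 1.024 usec, so a fabric-supplied value of 977 arms the per-SL hrtimer
 * for roughly 977 * 1024 ns ~= 1.0 ms.  The timer handler, which lives
 * elsewhere in the driver, is expected to back the CCTI off again when
 * it fires.
 */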
1915
1916/**
1917 * hfi1_rc_rcv - process an incoming RC packet
1918 * @packet: the received packet state, which carries:
1919 *          the receive context pointer (rcd),
1920 *          the packet header and the receive flags,
1921 *          the packet data,
1922 *          the packet length, and
1923 *          the QP for this packet
1924 *
1925 * This is called from the receive handler to process an incoming RC packet
1926 * for the given QP.
1927 * Called at interrupt level.
1928 */
1929void hfi1_rc_rcv(struct hfi1_packet *packet)
1930{
1931 struct hfi1_ctxtdata *rcd = packet->rcd;
1932 struct hfi1_ib_header *hdr = packet->hdr;
1933 u32 rcv_flags = packet->rcv_flags;
1934 void *data = packet->ebuf;
1935 u32 tlen = packet->tlen;
1936 struct hfi1_qp *qp = packet->qp;
1937 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1938 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1939 struct hfi1_other_headers *ohdr = packet->ohdr;
1940 u32 bth0, opcode;
1941 u32 hdrsize = packet->hlen;
1942 u32 psn;
1943 u32 pad;
1944 struct ib_wc wc;
1945 u32 pmtu = qp->pmtu;
1946 int diff;
1947 struct ib_reth *reth;
1948 unsigned long flags;
1949 u32 bth1;
1950 int ret, is_fecn = 0;
1951
1952 bth0 = be32_to_cpu(ohdr->bth[0]);
1953 if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
1954 return;
1955
1956 bth1 = be32_to_cpu(ohdr->bth[1]);
1957 if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
1958 if (bth1 & HFI1_BECN_SMASK) {
1959 u16 rlid = qp->remote_ah_attr.dlid;
1960 u32 lqpn, rqpn;
1961
1962 lqpn = qp->ibqp.qp_num;
1963 rqpn = qp->remote_qpn;
1964 process_becn(
1965 ppd,
1966 qp->remote_ah_attr.sl,
1967 rlid, lqpn, rqpn,
1968 IB_CC_SVCTYPE_RC);
1969 }
1970 is_fecn = bth1 & HFI1_FECN_SMASK;
1971 }
1972
1973 psn = be32_to_cpu(ohdr->bth[2]);
1974 opcode = bth0 >> 24;
1975
1976 /*
1977 * Process responses (ACKs) before anything else. Note that the
1978 * packet sequence number will be for something in the send work
1979 * queue rather than the expected receive packet sequence number.
1980 * In other words, this QP is the requester.
1981 */
1982 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1983 opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1984 rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1985 hdrsize, pmtu, rcd);
1986 if (is_fecn)
1987 goto send_ack;
1988 return;
1989 }
1990
1991 /* Compute 24 bits worth of difference. */
1992 diff = delta_psn(psn, qp->r_psn);
1993 if (unlikely(diff)) {
1994 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1995 return;
1996 goto send_ack;
1997 }
1998
1999 /* Check for opcode sequence errors. */
2000 switch (qp->r_state) {
2001 case OP(SEND_FIRST):
2002 case OP(SEND_MIDDLE):
2003 if (opcode == OP(SEND_MIDDLE) ||
2004 opcode == OP(SEND_LAST) ||
2005 opcode == OP(SEND_LAST_WITH_IMMEDIATE))
2006 break;
2007 goto nack_inv;
2008
2009 case OP(RDMA_WRITE_FIRST):
2010 case OP(RDMA_WRITE_MIDDLE):
2011 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2012 opcode == OP(RDMA_WRITE_LAST) ||
2013 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2014 break;
2015 goto nack_inv;
2016
2017 default:
2018 if (opcode == OP(SEND_MIDDLE) ||
2019 opcode == OP(SEND_LAST) ||
2020 opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2021 opcode == OP(RDMA_WRITE_MIDDLE) ||
2022 opcode == OP(RDMA_WRITE_LAST) ||
2023 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2024 goto nack_inv;
2025 /*
2026 * Note that it is up to the requester to not send a new
2027 * RDMA read or atomic operation before receiving an ACK
2028 * for the previous operation.
2029 */
2030 break;
2031 }
2032
2033 if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
2034 qp_comm_est(qp);
2035
2036 /* OK, process the packet. */
2037 switch (opcode) {
2038 case OP(SEND_FIRST):
2039 ret = hfi1_get_rwqe(qp, 0);
2040 if (ret < 0)
2041 goto nack_op_err;
2042 if (!ret)
2043 goto rnr_nak;
2044 qp->r_rcv_len = 0;
2045 /* FALLTHROUGH */
2046 case OP(SEND_MIDDLE):
2047 case OP(RDMA_WRITE_MIDDLE):
2048send_middle:
2049 /* Check for invalid length PMTU or posted rwqe len. */
2050 if (unlikely(tlen != (hdrsize + pmtu + 4)))
2051 goto nack_inv;
2052 qp->r_rcv_len += pmtu;
2053 if (unlikely(qp->r_rcv_len > qp->r_len))
2054 goto nack_inv;
2055 hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
2056 break;
2057
2058 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2059 /* consume RWQE */
2060 ret = hfi1_get_rwqe(qp, 1);
2061 if (ret < 0)
2062 goto nack_op_err;
2063 if (!ret)
2064 goto rnr_nak;
2065 goto send_last_imm;
2066
2067 case OP(SEND_ONLY):
2068 case OP(SEND_ONLY_WITH_IMMEDIATE):
2069 ret = hfi1_get_rwqe(qp, 0);
2070 if (ret < 0)
2071 goto nack_op_err;
2072 if (!ret)
2073 goto rnr_nak;
2074 qp->r_rcv_len = 0;
2075 if (opcode == OP(SEND_ONLY))
2076 goto no_immediate_data;
2077 /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2078 case OP(SEND_LAST_WITH_IMMEDIATE):
2079send_last_imm:
2080 wc.ex.imm_data = ohdr->u.imm_data;
2081 wc.wc_flags = IB_WC_WITH_IMM;
2082 goto send_last;
2083 case OP(SEND_LAST):
2084 case OP(RDMA_WRITE_LAST):
2085no_immediate_data:
2086 wc.wc_flags = 0;
2087 wc.ex.imm_data = 0;
2088send_last:
2089 /* Get the number of bytes the message was padded by. */
2090 pad = (bth0 >> 20) & 3;
2091 /* Check for invalid length. */
2092 /* LAST len should be >= 1 */
2093 if (unlikely(tlen < (hdrsize + pad + 4)))
2094 goto nack_inv;
2095 /* Don't count the CRC. */
2096 tlen -= (hdrsize + pad + 4);
2097 wc.byte_len = tlen + qp->r_rcv_len;
2098 if (unlikely(wc.byte_len > qp->r_len))
2099 goto nack_inv;
2100 hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
2101 hfi1_put_ss(&qp->r_sge);
2102 qp->r_msn++;
2103 if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
2104 break;
2105 wc.wr_id = qp->r_wr_id;
2106 wc.status = IB_WC_SUCCESS;
2107 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2108 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2109 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2110 else
2111 wc.opcode = IB_WC_RECV;
2112 wc.qp = &qp->ibqp;
2113 wc.src_qp = qp->remote_qpn;
2114 wc.slid = qp->remote_ah_attr.dlid;
2115 /*
2116 * It seems that IB mandates the presence of an SL in a
2117 * work completion only for the UD transport (see section
2118 * 11.4.2 of IBTA Vol. 1).
2119 *
2120 * However, the way the SL is chosen below is consistent
2121 * with the way that IB/qib works and tries to avoid
2122 * introducing incompatibilities.
2123 *
2124 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2125 */
2126 wc.sl = qp->remote_ah_attr.sl;
2127 /* zero fields that are N/A */
2128 wc.vendor_err = 0;
2129 wc.pkey_index = 0;
2130 wc.dlid_path_bits = 0;
2131 wc.port_num = 0;
2132 /* Signal completion event if the solicited bit is set. */
2133 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
2134 (bth0 & IB_BTH_SOLICITED) != 0);
2135 break;
2136
2137 case OP(RDMA_WRITE_FIRST):
2138 case OP(RDMA_WRITE_ONLY):
2139 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2140 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2141 goto nack_inv;
2142 /* consume RWQE */
2143 reth = &ohdr->u.rc.reth;
2144 qp->r_len = be32_to_cpu(reth->length);
2145 qp->r_rcv_len = 0;
2146 qp->r_sge.sg_list = NULL;
2147 if (qp->r_len != 0) {
2148 u32 rkey = be32_to_cpu(reth->rkey);
2149 u64 vaddr = be64_to_cpu(reth->vaddr);
2150 int ok;
2151
2152 /* Check rkey & NAK */
2153 ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2154 rkey, IB_ACCESS_REMOTE_WRITE);
2155 if (unlikely(!ok))
2156 goto nack_acc;
2157 qp->r_sge.num_sge = 1;
2158 } else {
2159 qp->r_sge.num_sge = 0;
2160 qp->r_sge.sge.mr = NULL;
2161 qp->r_sge.sge.vaddr = NULL;
2162 qp->r_sge.sge.length = 0;
2163 qp->r_sge.sge.sge_length = 0;
2164 }
2165 if (opcode == OP(RDMA_WRITE_FIRST))
2166 goto send_middle;
2167 else if (opcode == OP(RDMA_WRITE_ONLY))
2168 goto no_immediate_data;
2169 ret = hfi1_get_rwqe(qp, 1);
2170 if (ret < 0)
2171 goto nack_op_err;
2172 if (!ret)
2173 goto rnr_nak;
2174 wc.ex.imm_data = ohdr->u.rc.imm_data;
2175 wc.wc_flags = IB_WC_WITH_IMM;
2176 goto send_last;
2177
2178 case OP(RDMA_READ_REQUEST): {
2179 struct hfi1_ack_entry *e;
2180 u32 len;
2181 u8 next;
2182
2183 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2184 goto nack_inv;
2185 next = qp->r_head_ack_queue + 1;
2186 /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2187 if (next > HFI1_MAX_RDMA_ATOMIC)
2188 next = 0;
2189 spin_lock_irqsave(&qp->s_lock, flags);
2190 if (unlikely(next == qp->s_tail_ack_queue)) {
2191 if (!qp->s_ack_queue[next].sent)
2192 goto nack_inv_unlck;
2193 update_ack_queue(qp, next);
2194 }
2195 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2196 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2197 hfi1_put_mr(e->rdma_sge.mr);
2198 e->rdma_sge.mr = NULL;
2199 }
2200 reth = &ohdr->u.rc.reth;
2201 len = be32_to_cpu(reth->length);
2202 if (len) {
2203 u32 rkey = be32_to_cpu(reth->rkey);
2204 u64 vaddr = be64_to_cpu(reth->vaddr);
2205 int ok;
2206
2207 /* Check rkey & NAK */
2208 ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2209 rkey, IB_ACCESS_REMOTE_READ);
2210 if (unlikely(!ok))
2211 goto nack_acc_unlck;
2212 /*
2213 * Update the next expected PSN. We add 1 later
2214 * below, so only add the remainder here.
2215 */
2216 if (len > pmtu)
2217 qp->r_psn += (len - 1) / pmtu;
2218 } else {
2219 e->rdma_sge.mr = NULL;
2220 e->rdma_sge.vaddr = NULL;
2221 e->rdma_sge.length = 0;
2222 e->rdma_sge.sge_length = 0;
2223 }
2224 e->opcode = opcode;
2225 e->sent = 0;
2226 e->psn = psn;
2227 e->lpsn = qp->r_psn;
2228 /*
2229 * We need to increment the MSN here instead of when we
2230 * finish sending the result since a duplicate request would
2231 * increment it more than once.
2232 */
2233 qp->r_msn++;
2234 qp->r_psn++;
2235 qp->r_state = opcode;
2236 qp->r_nak_state = 0;
2237 qp->r_head_ack_queue = next;
2238
2239 /* Schedule the send tasklet. */
2240 qp->s_flags |= HFI1_S_RESP_PENDING;
2241 hfi1_schedule_send(qp);
2242
2243 spin_unlock_irqrestore(&qp->s_lock, flags);
2244 if (is_fecn)
2245 goto send_ack;
2246 return;
2247 }
2248
2249 case OP(COMPARE_SWAP):
2250 case OP(FETCH_ADD): {
2251 struct ib_atomic_eth *ateth;
2252 struct hfi1_ack_entry *e;
2253 u64 vaddr;
2254 atomic64_t *maddr;
2255 u64 sdata;
2256 u32 rkey;
2257 u8 next;
2258
2259 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2260 goto nack_inv;
2261 next = qp->r_head_ack_queue + 1;
2262 if (next > HFI1_MAX_RDMA_ATOMIC)
2263 next = 0;
2264 spin_lock_irqsave(&qp->s_lock, flags);
2265 if (unlikely(next == qp->s_tail_ack_queue)) {
2266 if (!qp->s_ack_queue[next].sent)
2267 goto nack_inv_unlck;
2268 update_ack_queue(qp, next);
2269 }
2270 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2271 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2272 hfi1_put_mr(e->rdma_sge.mr);
2273 e->rdma_sge.mr = NULL;
2274 }
2275 ateth = &ohdr->u.atomic_eth;
2276 vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
2277 be32_to_cpu(ateth->vaddr[1]);
2278 if (unlikely(vaddr & (sizeof(u64) - 1)))
2279 goto nack_inv_unlck;
2280 rkey = be32_to_cpu(ateth->rkey);
2281 /* Check rkey & NAK */
2282 if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2283 vaddr, rkey,
2284 IB_ACCESS_REMOTE_ATOMIC)))
2285 goto nack_acc_unlck;
2286 /* Perform atomic OP and save result. */
2287 maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2288 sdata = be64_to_cpu(ateth->swap_data);
2289 e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2290 (u64) atomic64_add_return(sdata, maddr) - sdata :
2291 (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2292 be64_to_cpu(ateth->compare_data),
2293 sdata);
2294 hfi1_put_mr(qp->r_sge.sge.mr);
2295 qp->r_sge.num_sge = 0;
2296 e->opcode = opcode;
2297 e->sent = 0;
2298 e->psn = psn;
2299 e->lpsn = psn;
2300 qp->r_msn++;
2301 qp->r_psn++;
2302 qp->r_state = opcode;
2303 qp->r_nak_state = 0;
2304 qp->r_head_ack_queue = next;
2305
2306 /* Schedule the send tasklet. */
2307 qp->s_flags |= HFI1_S_RESP_PENDING;
2308 hfi1_schedule_send(qp);
2309
2310 spin_unlock_irqrestore(&qp->s_lock, flags);
2311 if (is_fecn)
2312 goto send_ack;
2313 return;
2314 }
2315
2316 default:
2317 /* NAK unknown opcodes. */
2318 goto nack_inv;
2319 }
2320 qp->r_psn++;
2321 qp->r_state = opcode;
2322 qp->r_ack_psn = psn;
2323 qp->r_nak_state = 0;
2324 /* Send an ACK if requested or required. */
2325 if (psn & (1 << 31))
2326 goto send_ack;
2327 return;
2328
2329rnr_nak:
2330 qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2331 qp->r_ack_psn = qp->r_psn;
2332 /* Queue RNR NAK for later */
2333 if (list_empty(&qp->rspwait)) {
2334 qp->r_flags |= HFI1_R_RSP_NAK;
2335 atomic_inc(&qp->refcount);
2336 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2337 }
2338 return;
2339
2340nack_op_err:
2341 hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2342 qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2343 qp->r_ack_psn = qp->r_psn;
2344 /* Queue NAK for later */
2345 if (list_empty(&qp->rspwait)) {
2346 qp->r_flags |= HFI1_R_RSP_NAK;
2347 atomic_inc(&qp->refcount);
2348 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2349 }
2350 return;
2351
2352nack_inv_unlck:
2353 spin_unlock_irqrestore(&qp->s_lock, flags);
2354nack_inv:
2355 hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2356 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2357 qp->r_ack_psn = qp->r_psn;
2358 /* Queue NAK for later */
2359 if (list_empty(&qp->rspwait)) {
2360 qp->r_flags |= HFI1_R_RSP_NAK;
2361 atomic_inc(&qp->refcount);
2362 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2363 }
2364 return;
2365
2366nack_acc_unlck:
2367 spin_unlock_irqrestore(&qp->s_lock, flags);
2368nack_acc:
2369 hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
2370 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2371 qp->r_ack_psn = qp->r_psn;
2372send_ack:
2373 hfi1_send_rc_ack(rcd, qp, is_fecn);
2374}
2375
2376void hfi1_rc_hdrerr(
2377 struct hfi1_ctxtdata *rcd,
2378 struct hfi1_ib_header *hdr,
2379 u32 rcv_flags,
2380 struct hfi1_qp *qp)
2381{
2382 int has_grh = rcv_flags & HFI1_HAS_GRH;
2383 struct hfi1_other_headers *ohdr;
2384 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2385 int diff;
2386 u8 opcode;
2387 u32 psn;
2388
2389 /* Check for GRH */
2390 ohdr = &hdr->u.oth;
2391 if (has_grh)
2392 ohdr = &hdr->u.l.oth;
2393
2394 opcode = be32_to_cpu(ohdr->bth[0]);
2395 if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
2396 return;
2397
2398 psn = be32_to_cpu(ohdr->bth[2]);
2399 opcode >>= 24;
2400
2401 /* Only deal with RDMA Writes for now */
2402 if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
2403 diff = delta_psn(psn, qp->r_psn);
2404 if (!qp->r_nak_state && diff >= 0) {
2405 ibp->n_rc_seqnak++;
2406 qp->r_nak_state = IB_NAK_PSN_ERROR;
2407 /* Use the expected PSN. */
2408 qp->r_ack_psn = qp->r_psn;
2409 /*
2410 * Wait to send the sequence
2411 * NAK until all packets
2412 * in the receive queue have
2413 * been processed.
2414 * Otherwise, we end up
2415 * propagating congestion.
2416 */
2417 if (list_empty(&qp->rspwait)) {
2418 qp->r_flags |= HFI1_R_RSP_NAK;
2419 atomic_inc(&qp->refcount);
2420 list_add_tail(
2421 &qp->rspwait,
2422 &rcd->qp_wait_list);
2423 }
2424 } /* Out of sequence NAK */
2425 } /* QP Request NAKs */
2426}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
new file mode 100644
index 000000000000..a4115288db66
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -0,0 +1,948 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/spinlock.h>
52
53#include "hfi.h"
54#include "mad.h"
55#include "qp.h"
56#include "sdma.h"
57
58/*
59 * Convert the AETH RNR timeout code into the number of microseconds.
60 */
61const u32 ib_hfi1_rnr_table[32] = {
62 655360, /* 00: 655.36 */
63 10, /* 01: .01 */
64 20, /* 02: .02 */
65 30, /* 03: .03 */
66 40, /* 04: .04 */
67 60, /* 05: .06 */
68 80, /* 06: .08 */
69 120, /* 07: .12 */
70 160, /* 08: .16 */
71 240, /* 09: .24 */
72 320, /* 0A: .32 */
73 480, /* 0B: .48 */
74 640, /* 0C: .64 */
75 960, /* 0D: .96 */
76 1280, /* 0E: 1.28 */
77 1920, /* 0F: 1.92 */
78 2560, /* 10: 2.56 */
79 3840, /* 11: 3.84 */
80 5120, /* 12: 5.12 */
81 7680, /* 13: 7.68 */
82 10240, /* 14: 10.24 */
83 15360, /* 15: 15.36 */
84 20480, /* 16: 20.48 */
85 30720, /* 17: 30.72 */
86 40960, /* 18: 40.96 */
87 61440, /* 19: 61.44 */
88 81920, /* 1A: 81.92 */
89 122880, /* 1B: 122.88 */
90 163840, /* 1C: 163.84 */
91 245760, /* 1D: 245.76 */
92 327680, /* 1E: 327.68 */
93 491520 /* 1F: 491.52 */
94};
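/*
 * Illustrative sketch, not part of the patch: how a 5-bit RNR NAK timer
 * code is turned into a retry delay using the table above.  The helper
 * name is hypothetical; ruc_loopback() below does the equivalent inline
 * when arming the RNR retry timer.
 */
static inline unsigned long example_rnr_delay_jiffies(u8 rnr_code)
{
	/* only the low 5 bits of the AETH code are meaningful (32 entries) */
	u32 usecs = ib_hfi1_rnr_table[rnr_code & 0x1f];

	/* e.g. code 0x07 -> 120 usec, which rounds up to 1 jiffy at HZ=250 */
	return usecs_to_jiffies(usecs);
}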
95
96/*
97 * Validate a RWQE and fill in the SGE state.
98 * Return 1 if OK.
99 */
100static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe)
101{
102 int i, j, ret;
103 struct ib_wc wc;
104 struct hfi1_lkey_table *rkt;
105 struct hfi1_pd *pd;
106 struct hfi1_sge_state *ss;
107
108 rkt = &to_idev(qp->ibqp.device)->lk_table;
109 pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
110 ss = &qp->r_sge;
111 ss->sg_list = qp->r_sg_list;
112 qp->r_len = 0;
113 for (i = j = 0; i < wqe->num_sge; i++) {
114 if (wqe->sg_list[i].length == 0)
115 continue;
116 /* Check LKEY */
117 if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
118 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
119 goto bad_lkey;
120 qp->r_len += wqe->sg_list[i].length;
121 j++;
122 }
123 ss->num_sge = j;
124 ss->total_len = qp->r_len;
125 ret = 1;
126 goto bail;
127
128bad_lkey:
129 while (j) {
130 struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
131
132 hfi1_put_mr(sge->mr);
133 }
134 ss->num_sge = 0;
135 memset(&wc, 0, sizeof(wc));
136 wc.wr_id = wqe->wr_id;
137 wc.status = IB_WC_LOC_PROT_ERR;
138 wc.opcode = IB_WC_RECV;
139 wc.qp = &qp->ibqp;
140 /* Signal solicited completion event. */
141 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
142 ret = 0;
143bail:
144 return ret;
145}
146
147/**
148 * hfi1_get_rwqe - copy the next RWQE into the QP's RWQE
149 * @qp: the QP
150 * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
151 *
152 * Return -1 if there is a local error, 0 if no RWQE is available,
153 * otherwise return 1.
154 *
155 * Can be called from interrupt level.
156 */
157int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only)
158{
159 unsigned long flags;
160 struct hfi1_rq *rq;
161 struct hfi1_rwq *wq;
162 struct hfi1_srq *srq;
163 struct hfi1_rwqe *wqe;
164 void (*handler)(struct ib_event *, void *);
165 u32 tail;
166 int ret;
167
168 if (qp->ibqp.srq) {
169 srq = to_isrq(qp->ibqp.srq);
170 handler = srq->ibsrq.event_handler;
171 rq = &srq->rq;
172 } else {
173 srq = NULL;
174 handler = NULL;
175 rq = &qp->r_rq;
176 }
177
178 spin_lock_irqsave(&rq->lock, flags);
179 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
180 ret = 0;
181 goto unlock;
182 }
183
184 wq = rq->wq;
185 tail = wq->tail;
186 /* Validate tail before using it since it is user writable. */
187 if (tail >= rq->size)
188 tail = 0;
189 if (unlikely(tail == wq->head)) {
190 ret = 0;
191 goto unlock;
192 }
193 /* Make sure entry is read after head index is read. */
194 smp_rmb();
195 wqe = get_rwqe_ptr(rq, tail);
196 /*
197 * Even though we update the tail index in memory, the verbs
198 * consumer is not supposed to post more entries until a
199 * completion is generated.
200 */
201 if (++tail >= rq->size)
202 tail = 0;
203 wq->tail = tail;
204 if (!wr_id_only && !init_sge(qp, wqe)) {
205 ret = -1;
206 goto unlock;
207 }
208 qp->r_wr_id = wqe->wr_id;
209
210 ret = 1;
211 set_bit(HFI1_R_WRID_VALID, &qp->r_aflags);
212 if (handler) {
213 u32 n;
214
215 /*
216 * Validate head pointer value and compute
217 * the number of remaining WQEs.
218 */
219 n = wq->head;
220 if (n >= rq->size)
221 n = 0;
222 if (n < tail)
223 n += rq->size - tail;
224 else
225 n -= tail;
226 if (n < srq->limit) {
227 struct ib_event ev;
228
229 srq->limit = 0;
230 spin_unlock_irqrestore(&rq->lock, flags);
231 ev.device = qp->ibqp.device;
232 ev.element.srq = qp->ibqp.srq;
233 ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
234 handler(&ev, srq->ibsrq.srq_context);
235 goto bail;
236 }
237 }
238unlock:
239 spin_unlock_irqrestore(&rq->lock, flags);
240bail:
241 return ret;
242}
243
244/*
245 * Switch to alternate path.
246 * The QP s_lock should be held and interrupts disabled.
247 */
248void hfi1_migrate_qp(struct hfi1_qp *qp)
249{
250 struct ib_event ev;
251
252 qp->s_mig_state = IB_MIG_MIGRATED;
253 qp->remote_ah_attr = qp->alt_ah_attr;
254 qp->port_num = qp->alt_ah_attr.port_num;
255 qp->s_pkey_index = qp->s_alt_pkey_index;
256 qp->s_flags |= HFI1_S_AHG_CLEAR;
257
258 ev.device = qp->ibqp.device;
259 ev.element.qp = &qp->ibqp;
260 ev.event = IB_EVENT_PATH_MIG;
261 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
262}
263
264static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
265{
266 if (!index) {
267 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
268
269 return cpu_to_be64(ppd->guid);
270 }
271 return ibp->guids[index - 1];
272}
273
274static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
275{
276 return (gid->global.interface_id == id &&
277 (gid->global.subnet_prefix == gid_prefix ||
278 gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
279}
280
281/*
282 *
283 * This should be called with the QP r_lock held.
284 *
285 * The s_lock will be acquired around the hfi1_migrate_qp() call.
286 */
287int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
288 int has_grh, struct hfi1_qp *qp, u32 bth0)
289{
290 __be64 guid;
291 unsigned long flags;
292 u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
293
294 if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
295 if (!has_grh) {
296 if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
297 goto err;
298 } else {
299 if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
300 goto err;
301 guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
302 if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
303 goto err;
304 if (!gid_ok(&hdr->u.l.grh.sgid,
305 qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
306 qp->alt_ah_attr.grh.dgid.global.interface_id))
307 goto err;
308 }
309 if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
310 sc5, be16_to_cpu(hdr->lrh[3])))) {
311 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
312 (u16)bth0,
313 (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
314 0, qp->ibqp.qp_num,
315 hdr->lrh[3], hdr->lrh[1]);
316 goto err;
317 }
318 /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
319 if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
320 ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
321 goto err;
322 spin_lock_irqsave(&qp->s_lock, flags);
323 hfi1_migrate_qp(qp);
324 spin_unlock_irqrestore(&qp->s_lock, flags);
325 } else {
326 if (!has_grh) {
327 if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
328 goto err;
329 } else {
330 if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
331 goto err;
332 guid = get_sguid(ibp,
333 qp->remote_ah_attr.grh.sgid_index);
334 if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
335 goto err;
336 if (!gid_ok(&hdr->u.l.grh.sgid,
337 qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
338 qp->remote_ah_attr.grh.dgid.global.interface_id))
339 goto err;
340 }
341 if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
342 sc5, be16_to_cpu(hdr->lrh[3])))) {
343 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
344 (u16)bth0,
345 (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
346 0, qp->ibqp.qp_num,
347 hdr->lrh[3], hdr->lrh[1]);
348 goto err;
349 }
350 /* Validate the SLID. See Ch. 9.6.1.5 */
351 if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
352 ppd_from_ibp(ibp)->port != qp->port_num)
353 goto err;
354 if (qp->s_mig_state == IB_MIG_REARM &&
355 !(bth0 & IB_BTH_MIG_REQ))
356 qp->s_mig_state = IB_MIG_ARMED;
357 }
358
359 return 0;
360
361err:
362 return 1;
363}
364
365/**
366 * ruc_loopback - handle UC and RC loopback requests
367 * @sqp: the sending QP
368 *
369 * This is called from hfi1_do_send() to
370 * forward a WQE addressed to the same HFI.
371 * Note that although we are single threaded due to the tasklet, we still
372 * have to protect against post_send(). We don't have to worry about
373 * receive interrupts since this is a connected protocol and all packets
374 * will pass through here.
375 */
376static void ruc_loopback(struct hfi1_qp *sqp)
377{
378 struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
379 struct hfi1_qp *qp;
380 struct hfi1_swqe *wqe;
381 struct hfi1_sge *sge;
382 unsigned long flags;
383 struct ib_wc wc;
384 u64 sdata;
385 atomic64_t *maddr;
386 enum ib_wc_status send_status;
387 int release;
388 int ret;
389
390 rcu_read_lock();
391
392 /*
393 * Note that we check the responder QP state after
394 * checking the requester's state.
395 */
396 qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn);
397
398 spin_lock_irqsave(&sqp->s_lock, flags);
399
400 /* Return if we are already busy processing a work request. */
401 if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) ||
402 !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
403 goto unlock;
404
405 sqp->s_flags |= HFI1_S_BUSY;
406
407again:
408 if (sqp->s_last == sqp->s_head)
409 goto clr_busy;
410 wqe = get_swqe_ptr(sqp, sqp->s_last);
411
412 /* Return if it is not OK to start a new work request. */
413 if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
414 if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND))
415 goto clr_busy;
416 /* We are in the error state, flush the work request. */
417 send_status = IB_WC_WR_FLUSH_ERR;
418 goto flush_send;
419 }
420
421 /*
422 * We can rely on the entry not changing without the s_lock
423 * being held until we update s_last.
424 * We increment s_cur to indicate s_last is in progress.
425 */
426 if (sqp->s_last == sqp->s_cur) {
427 if (++sqp->s_cur >= sqp->s_size)
428 sqp->s_cur = 0;
429 }
430 spin_unlock_irqrestore(&sqp->s_lock, flags);
431
432 if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) ||
433 qp->ibqp.qp_type != sqp->ibqp.qp_type) {
434 ibp->n_pkt_drops++;
435 /*
436 * For RC, the requester would timeout and retry so
437 * shortcut the timeouts and just signal too many retries.
438 */
439 if (sqp->ibqp.qp_type == IB_QPT_RC)
440 send_status = IB_WC_RETRY_EXC_ERR;
441 else
442 send_status = IB_WC_SUCCESS;
443 goto serr;
444 }
445
446 memset(&wc, 0, sizeof(wc));
447 send_status = IB_WC_SUCCESS;
448
449 release = 1;
450 sqp->s_sge.sge = wqe->sg_list[0];
451 sqp->s_sge.sg_list = wqe->sg_list + 1;
452 sqp->s_sge.num_sge = wqe->wr.num_sge;
453 sqp->s_len = wqe->length;
454 switch (wqe->wr.opcode) {
455 case IB_WR_SEND_WITH_IMM:
456 wc.wc_flags = IB_WC_WITH_IMM;
457 wc.ex.imm_data = wqe->wr.ex.imm_data;
458 /* FALLTHROUGH */
459 case IB_WR_SEND:
460 ret = hfi1_get_rwqe(qp, 0);
461 if (ret < 0)
462 goto op_err;
463 if (!ret)
464 goto rnr_nak;
465 break;
466
467 case IB_WR_RDMA_WRITE_WITH_IMM:
468 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
469 goto inv_err;
470 wc.wc_flags = IB_WC_WITH_IMM;
471 wc.ex.imm_data = wqe->wr.ex.imm_data;
472 ret = hfi1_get_rwqe(qp, 1);
473 if (ret < 0)
474 goto op_err;
475 if (!ret)
476 goto rnr_nak;
477 /* FALLTHROUGH */
478 case IB_WR_RDMA_WRITE:
479 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
480 goto inv_err;
481 if (wqe->length == 0)
482 break;
483 if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
484 wqe->wr.wr.rdma.remote_addr,
485 wqe->wr.wr.rdma.rkey,
486 IB_ACCESS_REMOTE_WRITE)))
487 goto acc_err;
488 qp->r_sge.sg_list = NULL;
489 qp->r_sge.num_sge = 1;
490 qp->r_sge.total_len = wqe->length;
491 break;
492
493 case IB_WR_RDMA_READ:
494 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
495 goto inv_err;
496 if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
497 wqe->wr.wr.rdma.remote_addr,
498 wqe->wr.wr.rdma.rkey,
499 IB_ACCESS_REMOTE_READ)))
500 goto acc_err;
501 release = 0;
502 sqp->s_sge.sg_list = NULL;
503 sqp->s_sge.num_sge = 1;
504 qp->r_sge.sge = wqe->sg_list[0];
505 qp->r_sge.sg_list = wqe->sg_list + 1;
506 qp->r_sge.num_sge = wqe->wr.num_sge;
507 qp->r_sge.total_len = wqe->length;
508 break;
509
510 case IB_WR_ATOMIC_CMP_AND_SWP:
511 case IB_WR_ATOMIC_FETCH_AND_ADD:
512 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
513 goto inv_err;
514 if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
515 wqe->wr.wr.atomic.remote_addr,
516 wqe->wr.wr.atomic.rkey,
517 IB_ACCESS_REMOTE_ATOMIC)))
518 goto acc_err;
519 /* Perform atomic OP and save result. */
520 maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
521 sdata = wqe->wr.wr.atomic.compare_add;
522 *(u64 *) sqp->s_sge.sge.vaddr =
523 (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
524 (u64) atomic64_add_return(sdata, maddr) - sdata :
525 (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
526 sdata, wqe->wr.wr.atomic.swap);
527 hfi1_put_mr(qp->r_sge.sge.mr);
528 qp->r_sge.num_sge = 0;
529 goto send_comp;
530
531 default:
532 send_status = IB_WC_LOC_QP_OP_ERR;
533 goto serr;
534 }
535
536 sge = &sqp->s_sge.sge;
537 while (sqp->s_len) {
538 u32 len = sqp->s_len;
539
540 if (len > sge->length)
541 len = sge->length;
542 if (len > sge->sge_length)
543 len = sge->sge_length;
544 WARN_ON_ONCE(len == 0);
545 hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release);
546 sge->vaddr += len;
547 sge->length -= len;
548 sge->sge_length -= len;
549 if (sge->sge_length == 0) {
550 if (!release)
551 hfi1_put_mr(sge->mr);
552 if (--sqp->s_sge.num_sge)
553 *sge = *sqp->s_sge.sg_list++;
554 } else if (sge->length == 0 && sge->mr->lkey) {
555 if (++sge->n >= HFI1_SEGSZ) {
556 if (++sge->m >= sge->mr->mapsz)
557 break;
558 sge->n = 0;
559 }
560 sge->vaddr =
561 sge->mr->map[sge->m]->segs[sge->n].vaddr;
562 sge->length =
563 sge->mr->map[sge->m]->segs[sge->n].length;
564 }
565 sqp->s_len -= len;
566 }
567 if (release)
568 hfi1_put_ss(&qp->r_sge);
569
570 if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
571 goto send_comp;
572
573 if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
574 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
575 else
576 wc.opcode = IB_WC_RECV;
577 wc.wr_id = qp->r_wr_id;
578 wc.status = IB_WC_SUCCESS;
579 wc.byte_len = wqe->length;
580 wc.qp = &qp->ibqp;
581 wc.src_qp = qp->remote_qpn;
582 wc.slid = qp->remote_ah_attr.dlid;
583 wc.sl = qp->remote_ah_attr.sl;
584 wc.port_num = 1;
585 /* Signal completion event if the solicited bit is set. */
586 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
587 wqe->wr.send_flags & IB_SEND_SOLICITED);
588
589send_comp:
590 spin_lock_irqsave(&sqp->s_lock, flags);
591 ibp->n_loop_pkts++;
592flush_send:
593 sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
594 hfi1_send_complete(sqp, wqe, send_status);
595 goto again;
596
597rnr_nak:
598 /* Handle RNR NAK */
599 if (qp->ibqp.qp_type == IB_QPT_UC)
600 goto send_comp;
601 ibp->n_rnr_naks++;
602 /*
603 * Note: we don't need the s_lock held since the BUSY flag
604 * makes this single threaded.
605 */
606 if (sqp->s_rnr_retry == 0) {
607 send_status = IB_WC_RNR_RETRY_EXC_ERR;
608 goto serr;
609 }
610 if (sqp->s_rnr_retry_cnt < 7)
611 sqp->s_rnr_retry--;
612 spin_lock_irqsave(&sqp->s_lock, flags);
613 if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK))
614 goto clr_busy;
615 sqp->s_flags |= HFI1_S_WAIT_RNR;
616 sqp->s_timer.function = hfi1_rc_rnr_retry;
617 sqp->s_timer.expires = jiffies +
618 usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]);
619 add_timer(&sqp->s_timer);
620 goto clr_busy;
621
622op_err:
623 send_status = IB_WC_REM_OP_ERR;
624 wc.status = IB_WC_LOC_QP_OP_ERR;
625 goto err;
626
627inv_err:
628 send_status = IB_WC_REM_INV_REQ_ERR;
629 wc.status = IB_WC_LOC_QP_OP_ERR;
630 goto err;
631
632acc_err:
633 send_status = IB_WC_REM_ACCESS_ERR;
634 wc.status = IB_WC_LOC_PROT_ERR;
635err:
636 /* responder goes to error state */
637 hfi1_rc_error(qp, wc.status);
638
639serr:
640 spin_lock_irqsave(&sqp->s_lock, flags);
641 hfi1_send_complete(sqp, wqe, send_status);
642 if (sqp->ibqp.qp_type == IB_QPT_RC) {
643 int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
644
645 sqp->s_flags &= ~HFI1_S_BUSY;
646 spin_unlock_irqrestore(&sqp->s_lock, flags);
647 if (lastwqe) {
648 struct ib_event ev;
649
650 ev.device = sqp->ibqp.device;
651 ev.element.qp = &sqp->ibqp;
652 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
653 sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
654 }
655 goto done;
656 }
657clr_busy:
658 sqp->s_flags &= ~HFI1_S_BUSY;
659unlock:
660 spin_unlock_irqrestore(&sqp->s_lock, flags);
661done:
662 rcu_read_unlock();
663}
664
665/**
666 * hfi1_make_grh - construct a GRH header
667 * @ibp: a pointer to the IB port
668 * @hdr: a pointer to the GRH header being constructed
669 * @grh: the global route address to send to
670 * @hwords: the number of 32 bit words of header being sent
671 * @nwords: the number of 32 bit words of data being sent
672 *
673 * Return the size of the header in 32 bit words.
674 */
675u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
676 struct ib_global_route *grh, u32 hwords, u32 nwords)
677{
678 hdr->version_tclass_flow =
679 cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
680 (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
681 (grh->flow_label << IB_GRH_FLOW_SHIFT));
682 hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
683 /* next_hdr is defined by C8-7 in ch. 8.4.1 */
684 hdr->next_hdr = IB_GRH_NEXT_HDR;
685 hdr->hop_limit = grh->hop_limit;
686 /* The SGID is 32-bit aligned. */
687 hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
688 hdr->sgid.global.interface_id =
689 grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
690 ibp->guids[grh->sgid_index - 1] :
691 cpu_to_be64(ppd_from_ibp(ibp)->guid);
692 hdr->dgid = grh->dgid;
693
694 /* GRH header size in 32-bit words. */
695 return sizeof(struct ib_grh) / sizeof(u32);
696}
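/*
 * Worked example (illustrative): the GRH PayLen field covers everything
 * that follows the GRH, i.e. BTH plus any extension headers, the payload,
 * and the ICRC (SIZE_OF_CRC is assumed here to be that one 32-bit word).
 * If, say, hwords = 7 (2 LRH dwords plus 5 dwords of BTH and extension
 * headers) and the padded payload is 256 bytes (nwords = 64), then
 *
 *	paylen = (7 - 2 + 64 + 1) << 2 = 280 bytes
 *
 * which is exactly the post-GRH portion of the packet.
 */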
697
698/*
699 * clear_ahg - clear the AHG state from the QP
700 */
701void clear_ahg(struct hfi1_qp *qp)
702{
703 qp->s_hdr->ahgcount = 0;
704 qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR);
705 if (qp->s_sde)
706 sdma_ahg_free(qp->s_sde, qp->s_ahgidx);
707 qp->s_ahgidx = -1;
708 qp->s_sde = NULL;
709}
710
711#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
712
713/**
714 * build_ahg - create ahg in s_hdr
715 * @qp: a pointer to QP
716 * @npsn: the next PSN for the request/response
717 *
718 * This routine handles the AHG by allocating an AHG entry and arranging
719 * for the header of the first middle packet to be copied.
720 *
721 * Subsequent middle packets reuse the copied entry, editing only the
722 * PSN with 1 or 2 AHG edits.
723 */
724static inline void build_ahg(struct hfi1_qp *qp, u32 npsn)
725{
726 if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR))
727 clear_ahg(qp);
728 if (!(qp->s_flags & HFI1_S_AHG_VALID)) {
729 /* first middle that needs copy */
730 if (qp->s_ahgidx < 0) {
731 if (!qp->s_sde)
732 qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
733 qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde);
734 }
735 if (qp->s_ahgidx >= 0) {
736 qp->s_ahgpsn = npsn;
737 qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
738 /* save these to guard against a change in another thread */
739 qp->s_hdr->sde = qp->s_sde;
740 qp->s_hdr->ahgidx = qp->s_ahgidx;
741 qp->s_flags |= HFI1_S_AHG_VALID;
742 }
743 } else {
744 /* subsequent middle after valid */
745 if (qp->s_ahgidx >= 0) {
746 qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
747 qp->s_hdr->ahgidx = qp->s_ahgidx;
748 qp->s_hdr->ahgcount++;
749 qp->s_hdr->ahgdesc[0] =
750 sdma_build_ahg_descriptor(
751 (__force u16)cpu_to_be16((u16)npsn),
752 BTH2_OFFSET,
753 16,
754 16);
755 if ((npsn & 0xffff0000) !=
756 (qp->s_ahgpsn & 0xffff0000)) {
757 qp->s_hdr->ahgcount++;
758 qp->s_hdr->ahgdesc[1] =
759 sdma_build_ahg_descriptor(
760 (__force u16)cpu_to_be16(
761 (u16)(npsn >> 16)),
762 BTH2_OFFSET,
763 0,
764 16);
765 }
766 }
767 }
768}
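/*
 * Worked example (illustrative): each sdma_build_ahg_descriptor() call
 * above describes one 16-bit edit that the hardware applies to the copied
 * header at BTH2_OFFSET (the big-endian BTH[2] dword holding the PSN).
 * While npsn stays within the same 64K block as s_ahgpsn, only the first
 * edit (the low half of the PSN) is queued and ahgcount ends up at 1.
 * Crossing the block boundary, e.g. going from s_ahgpsn 0x0012ffff to
 * npsn 0x00130000, also changes the upper half, so the second edit is
 * queued as well and ahgcount becomes 2 for that packet.
 */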
769
770void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
771 u32 bth0, u32 bth2, int middle)
772{
773 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
774 u16 lrh0;
775 u32 nwords;
776 u32 extra_bytes;
777 u8 sc5;
778 u32 bth1;
779
780 /* Construct the header. */
781 extra_bytes = -qp->s_cur_size & 3;
782 nwords = (qp->s_cur_size + extra_bytes) >> 2;
783 lrh0 = HFI1_LRH_BTH;
784 if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
785 qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
786 &qp->remote_ah_attr.grh,
787 qp->s_hdrwords, nwords);
788 lrh0 = HFI1_LRH_GRH;
789 middle = 0;
790 }
791 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
792 lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
793 qp->s_sc = sc5;
794 /*
795 * reset s_hdr/AHG fields
796 *
797 * This ensures that the ahgentry/ahgcount
798 * are at a non-AHG default so that
799 * build_verbs_tx_desc() does not use
800 * a stale ahgidx.
801 *
802 * build_ahg() will modify these fields as
803 * appropriate to use the AHG feature.
804 */
805 qp->s_hdr->tx_flags = 0;
806 qp->s_hdr->ahgcount = 0;
807 qp->s_hdr->ahgidx = 0;
808 qp->s_hdr->sde = NULL;
809 if (qp->s_mig_state == IB_MIG_MIGRATED)
810 bth0 |= IB_BTH_MIG_REQ;
811 else
812 middle = 0;
813 if (middle)
814 build_ahg(qp, bth2);
815 else
816 qp->s_flags &= ~HFI1_S_AHG_VALID;
817 qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
818 qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
819 qp->s_hdr->ibh.lrh[2] =
820 cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
821 qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
822 qp->remote_ah_attr.src_path_bits);
823 bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
824 bth0 |= extra_bytes << 20;
825 ohdr->bth[0] = cpu_to_be32(bth0);
826 bth1 = qp->remote_qpn;
827 if (qp->s_flags & HFI1_S_ECN) {
828 qp->s_flags &= ~HFI1_S_ECN;
829 /* we recently received a FECN, so return a BECN */
830 bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
831 }
832 ohdr->bth[1] = cpu_to_be32(bth1);
833 ohdr->bth[2] = cpu_to_be32(bth2);
834}
835
836/**
837 * hfi1_do_send - perform a send on a QP
838 * @work: contains a pointer to the QP
839 *
840 * Process entries in the send work queue until credit or queue is
841 * exhausted. Only allow one CPU to send a packet per QP (tasklet).
842 * Otherwise, two threads could send packets out of order.
843 */
844void hfi1_do_send(struct work_struct *work)
845{
846 struct iowait *wait = container_of(work, struct iowait, iowork);
847 struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
848 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
849 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
850 int (*make_req)(struct hfi1_qp *qp);
851 unsigned long flags;
852
853 if ((qp->ibqp.qp_type == IB_QPT_RC ||
854 qp->ibqp.qp_type == IB_QPT_UC) &&
855 !loopback &&
856 (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
857 ruc_loopback(qp);
858 return;
859 }
860
861 if (qp->ibqp.qp_type == IB_QPT_RC)
862 make_req = hfi1_make_rc_req;
863 else if (qp->ibqp.qp_type == IB_QPT_UC)
864 make_req = hfi1_make_uc_req;
865 else
866 make_req = hfi1_make_ud_req;
867
868 spin_lock_irqsave(&qp->s_lock, flags);
869
870 /* Return if we are already busy processing a work request. */
871 if (!hfi1_send_ok(qp)) {
872 spin_unlock_irqrestore(&qp->s_lock, flags);
873 return;
874 }
875
876 qp->s_flags |= HFI1_S_BUSY;
877
878 spin_unlock_irqrestore(&qp->s_lock, flags);
879
880 do {
881 /* Check for a constructed packet to be sent. */
882 if (qp->s_hdrwords != 0) {
883 /*
884 * If the packet cannot be sent now, return and
885 * the send tasklet will be woken up later.
886 */
887 if (hfi1_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
888 qp->s_cur_sge, qp->s_cur_size))
889 break;
890 /* Record that s_hdr is empty. */
891 qp->s_hdrwords = 0;
892 }
893 } while (make_req(qp));
894}
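/*
 * Worked example (illustrative): the loopback test above masks off the
 * low LMC bits of the destination LID before comparing it with the port
 * LID.  With ppd->lmc = 2 the port answers to four LIDs; if ppd->lid is
 * 0x0010, then any dlid in 0x0010..0x0013 satisfies
 *
 *	(dlid & ~((1 << 2) - 1)) == 0x0010
 *
 * and the RC/UC request is handed to ruc_loopback() rather than being
 * built for the wire (unless the "loopback" module parameter is set).
 */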
895
896/*
897 * This should be called with s_lock held.
898 */
899void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
900 enum ib_wc_status status)
901{
902 u32 old_last, last;
903 unsigned i;
904
905 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
906 return;
907
908 for (i = 0; i < wqe->wr.num_sge; i++) {
909 struct hfi1_sge *sge = &wqe->sg_list[i];
910
911 hfi1_put_mr(sge->mr);
912 }
913 if (qp->ibqp.qp_type == IB_QPT_UD ||
914 qp->ibqp.qp_type == IB_QPT_SMI ||
915 qp->ibqp.qp_type == IB_QPT_GSI)
916 atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
917
918 /* See ch. 11.2.4.1 and 10.7.3.1 */
919 if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
920 (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
921 status != IB_WC_SUCCESS) {
922 struct ib_wc wc;
923
924 memset(&wc, 0, sizeof(wc));
925 wc.wr_id = wqe->wr.wr_id;
926 wc.status = status;
927 wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
928 wc.qp = &qp->ibqp;
929 if (status == IB_WC_SUCCESS)
930 wc.byte_len = wqe->length;
931 hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
932 status != IB_WC_SUCCESS);
933 }
934
935 last = qp->s_last;
936 old_last = last;
937 if (++last >= qp->s_size)
938 last = 0;
939 qp->s_last = last;
940 if (qp->s_acked == old_last)
941 qp->s_acked = last;
942 if (qp->s_cur == old_last)
943 qp->s_cur = last;
944 if (qp->s_tail == old_last)
945 qp->s_tail = last;
946 if (qp->state == IB_QPS_SQD && last == qp->s_cur)
947 qp->s_draining = 0;
948}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
new file mode 100644
index 000000000000..a8c903caecce
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.c
@@ -0,0 +1,2962 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/spinlock.h>
52#include <linux/seqlock.h>
53#include <linux/netdevice.h>
54#include <linux/moduleparam.h>
55#include <linux/bitops.h>
56#include <linux/timer.h>
57#include <linux/vmalloc.h>
58
59#include "hfi.h"
60#include "common.h"
61#include "qp.h"
62#include "sdma.h"
63#include "iowait.h"
64#include "trace.h"
65
66/* must be a power of 2 >= 64 <= 32768 */
67#define SDMA_DESCQ_CNT 1024
68#define INVALID_TAIL 0xffff
69
70static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
71module_param(sdma_descq_cnt, uint, S_IRUGO);
72MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
73
74static uint sdma_idle_cnt = 250;
75module_param(sdma_idle_cnt, uint, S_IRUGO);
76MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns, default 250)");
77
78uint mod_num_sdma;
79module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
80MODULE_PARM_DESC(num_sdma, "Set max number of SDMA engines to use");
81
82#define SDMA_WAIT_BATCH_SIZE 20
83/* max wait time for a SDMA engine to indicate it has halted */
84#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
85/* all SDMA engine errors that cause a halt */
86
87#define SD(name) SEND_DMA_##name
88#define ALL_SDMA_ENG_HALT_ERRS \
89 (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
90 | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
91 | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
92 | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
93 | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
94 | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
95 | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
96 | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
97 | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
98 | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
99 | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
100 | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
101 | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
102 | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
103 | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
104 | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
105 | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
106 | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
107
108/* sdma_sendctrl operations */
109#define SDMA_SENDCTRL_OP_ENABLE (1U << 0)
110#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
111#define SDMA_SENDCTRL_OP_HALT (1U << 2)
112#define SDMA_SENDCTRL_OP_CLEANUP (1U << 3)
113
114/* handle long defines */
115#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
116SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
117#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
118SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
119
120static const char * const sdma_state_names[] = {
121 [sdma_state_s00_hw_down] = "s00_HwDown",
122 [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait",
123 [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
124 [sdma_state_s20_idle] = "s20_Idle",
125 [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
126 [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
127 [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait",
128 [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait",
129 [sdma_state_s80_hw_freeze] = "s80_HwFreeze",
130 [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean",
131 [sdma_state_s99_running] = "s99_Running",
132};
133
134static const char * const sdma_event_names[] = {
135 [sdma_event_e00_go_hw_down] = "e00_GoHwDown",
136 [sdma_event_e10_go_hw_start] = "e10_GoHwStart",
137 [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
138 [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
139 [sdma_event_e30_go_running] = "e30_GoRunning",
140 [sdma_event_e40_sw_cleaned] = "e40_SwCleaned",
141 [sdma_event_e50_hw_cleaned] = "e50_HwCleaned",
142 [sdma_event_e60_hw_halted] = "e60_HwHalted",
143 [sdma_event_e70_go_idle] = "e70_GoIdle",
144 [sdma_event_e80_hw_freeze] = "e80_HwFreeze",
145 [sdma_event_e81_hw_frozen] = "e81_HwFrozen",
146 [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze",
147 [sdma_event_e85_link_down] = "e85_LinkDown",
148 [sdma_event_e90_sw_halted] = "e90_SwHalted",
149};
150
151static const struct sdma_set_state_action sdma_action_table[] = {
152 [sdma_state_s00_hw_down] = {
153 .go_s99_running_tofalse = 1,
154 .op_enable = 0,
155 .op_intenable = 0,
156 .op_halt = 0,
157 .op_cleanup = 0,
158 },
159 [sdma_state_s10_hw_start_up_halt_wait] = {
160 .op_enable = 0,
161 .op_intenable = 0,
162 .op_halt = 1,
163 .op_cleanup = 0,
164 },
165 [sdma_state_s15_hw_start_up_clean_wait] = {
166 .op_enable = 0,
167 .op_intenable = 1,
168 .op_halt = 0,
169 .op_cleanup = 1,
170 },
171 [sdma_state_s20_idle] = {
172 .op_enable = 0,
173 .op_intenable = 1,
174 .op_halt = 0,
175 .op_cleanup = 0,
176 },
177 [sdma_state_s30_sw_clean_up_wait] = {
178 .op_enable = 0,
179 .op_intenable = 0,
180 .op_halt = 0,
181 .op_cleanup = 0,
182 },
183 [sdma_state_s40_hw_clean_up_wait] = {
184 .op_enable = 0,
185 .op_intenable = 0,
186 .op_halt = 0,
187 .op_cleanup = 1,
188 },
189 [sdma_state_s50_hw_halt_wait] = {
190 .op_enable = 0,
191 .op_intenable = 0,
192 .op_halt = 0,
193 .op_cleanup = 0,
194 },
195 [sdma_state_s60_idle_halt_wait] = {
196 .go_s99_running_tofalse = 1,
197 .op_enable = 0,
198 .op_intenable = 0,
199 .op_halt = 1,
200 .op_cleanup = 0,
201 },
202 [sdma_state_s80_hw_freeze] = {
203 .op_enable = 0,
204 .op_intenable = 0,
205 .op_halt = 0,
206 .op_cleanup = 0,
207 },
208 [sdma_state_s82_freeze_sw_clean] = {
209 .op_enable = 0,
210 .op_intenable = 0,
211 .op_halt = 0,
212 .op_cleanup = 0,
213 },
214 [sdma_state_s99_running] = {
215 .op_enable = 1,
216 .op_intenable = 1,
217 .op_halt = 0,
218 .op_cleanup = 0,
219 .go_s99_running_totrue = 1,
220 },
221};
222
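/*
 * Editor's note (illustrative, not part of the driver): each row above is
 * consumed by sdma_set_state() further down.  For example, entering
 * sdma_state_s99_running turns its .op_enable/.op_intenable bits into
 * SDMA_SENDCTRL_OP_ENABLE | SDMA_SENDCTRL_OP_INTENABLE (0x3) and latches
 * go_s99_running to 1, while entering sdma_state_s00_hw_down programs no
 * op bits and clears go_s99_running.
 */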
223#define SDMA_TAIL_UPDATE_THRESH 0x1F
224
225/* declare all statics here rather than keep sorting */
226static void sdma_complete(struct kref *);
227static void sdma_finalput(struct sdma_state *);
228static void sdma_get(struct sdma_state *);
229static void sdma_hw_clean_up_task(unsigned long);
230static void sdma_put(struct sdma_state *);
231static void sdma_set_state(struct sdma_engine *, enum sdma_states);
232static void sdma_start_hw_clean_up(struct sdma_engine *);
233static void sdma_start_sw_clean_up(struct sdma_engine *);
234static void sdma_sw_clean_up_task(unsigned long);
235static void sdma_sendctrl(struct sdma_engine *, unsigned);
236static void init_sdma_regs(struct sdma_engine *, u32, uint);
237static void sdma_process_event(
238 struct sdma_engine *sde,
239 enum sdma_events event);
240static void __sdma_process_event(
241 struct sdma_engine *sde,
242 enum sdma_events event);
243static void dump_sdma_state(struct sdma_engine *sde);
244static void sdma_make_progress(struct sdma_engine *sde, u64 status);
245static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
246static void sdma_flush_descq(struct sdma_engine *sde);
247
248/**
249 * sdma_state_name() - return state string from enum
250 * @state: state
251 */
252static const char *sdma_state_name(enum sdma_states state)
253{
254 return sdma_state_names[state];
255}
256
257static void sdma_get(struct sdma_state *ss)
258{
259 kref_get(&ss->kref);
260}
261
262static void sdma_complete(struct kref *kref)
263{
264 struct sdma_state *ss =
265 container_of(kref, struct sdma_state, kref);
266
267 complete(&ss->comp);
268}
269
270static void sdma_put(struct sdma_state *ss)
271{
272 kref_put(&ss->kref, sdma_complete);
273}
274
275static void sdma_finalput(struct sdma_state *ss)
276{
277 sdma_put(ss);
278 wait_for_completion(&ss->comp);
279}
280
281static inline void write_sde_csr(
282 struct sdma_engine *sde,
283 u32 offset0,
284 u64 value)
285{
286 write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
287}
288
289static inline u64 read_sde_csr(
290 struct sdma_engine *sde,
291 u32 offset0)
292{
293 return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
294}
295
296/*
297 * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
298 * sdma engine 'sde' to drop to 0.
299 */
300static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
301 int pause)
302{
303 u64 off = 8 * sde->this_idx;
304 struct hfi1_devdata *dd = sde->dd;
305 int lcnt = 0;
306
307 while (1) {
308 u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
309
310 reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
311 reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
312 if (reg == 0)
313 break;
314 if (lcnt++ > 100) {
315 dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n",
316 __func__, sde->this_idx, (u32)reg);
317 break;
318 }
319 udelay(1);
320 }
321}
322
323/*
324 * sdma_wait() - wait for packet egress to complete for all SDMA engines,
325 * and pause for credit return.
326 */
327void sdma_wait(struct hfi1_devdata *dd)
328{
329 int i;
330
331 for (i = 0; i < dd->num_sdma; i++) {
332 struct sdma_engine *sde = &dd->per_sdma[i];
333
334 sdma_wait_for_packet_egress(sde, 0);
335 }
336}
337
338static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
339{
340 u64 reg;
341
342 if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
343 return;
344 reg = cnt;
345 reg &= SD(DESC_CNT_CNT_MASK);
346 reg <<= SD(DESC_CNT_CNT_SHIFT);
347 write_sde_csr(sde, SD(DESC_CNT), reg);
348}
349
350/*
351 * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
352 *
353 * Depending on timing there can be txreqs in two places:
354 * - in the descq ring
355 * - in the flush list
356 *
357 * To avoid ordering issues the descq ring needs to be flushed
358 * first followed by the flush list.
359 *
360 * This routine is called from two places
361 * - From a work queue item
362 * - Directly from the state machine just before setting the
363 * state to running
364 *
365 * Must be called with head_lock held
366 *
367 */
368static void sdma_flush(struct sdma_engine *sde)
369{
370 struct sdma_txreq *txp, *txp_next;
371 LIST_HEAD(flushlist);
372
373 /* flush from head to tail */
374 sdma_flush_descq(sde);
375 spin_lock(&sde->flushlist_lock);
376 /* copy flush list */
377 list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
378 list_del_init(&txp->list);
379 list_add_tail(&txp->list, &flushlist);
380 }
381 spin_unlock(&sde->flushlist_lock);
382 /* flush from flush list */
383 list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
384 int drained = 0;
385 /* protect against complete modifying */
386 struct iowait *wait = txp->wait;
387
388 list_del_init(&txp->list);
389#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
390 trace_hfi1_sdma_out_sn(sde, txp->sn);
391 if (WARN_ON_ONCE(sde->head_sn != txp->sn))
392 dd_dev_err(sde->dd, "expected %llu got %llu\n",
393 sde->head_sn, txp->sn);
394 sde->head_sn++;
395#endif
396 sdma_txclean(sde->dd, txp);
397 if (wait)
398 drained = atomic_dec_and_test(&wait->sdma_busy);
399 if (txp->complete)
400 (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
401 if (wait && drained)
402 iowait_drain_wakeup(wait);
403 }
404}
405
406/*
407 * Fields a work request for flushing the descq ring
408 * and the flush list
409 *
410 * If the engine has been brought to running during
411 * the scheduling delay, the flush is ignored, assuming
412 * that the process of bringing the engine to running
413 * would have done this flush prior to going to running.
414 *
415 */
416static void sdma_field_flush(struct work_struct *work)
417{
418 unsigned long flags;
419 struct sdma_engine *sde =
420 container_of(work, struct sdma_engine, flush_worker);
421
422 write_seqlock_irqsave(&sde->head_lock, flags);
423 if (!__sdma_running(sde))
424 sdma_flush(sde);
425 write_sequnlock_irqrestore(&sde->head_lock, flags);
426}
427
428static void sdma_err_halt_wait(struct work_struct *work)
429{
430 struct sdma_engine *sde = container_of(work, struct sdma_engine,
431 err_halt_worker);
432 u64 statuscsr;
433 unsigned long timeout;
434
435 timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
436 while (1) {
437 statuscsr = read_sde_csr(sde, SD(STATUS));
438 statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
439 if (statuscsr)
440 break;
441 if (time_after(jiffies, timeout)) {
442 dd_dev_err(sde->dd,
443 "SDMA engine %d - timeout waiting for engine to halt\n",
444 sde->this_idx);
445 /*
446 * Continue anyway. This could happen if there was
447 * an uncorrectable error in the wrong spot.
448 */
449 break;
450 }
451 usleep_range(80, 120);
452 }
453
454 sdma_process_event(sde, sdma_event_e15_hw_halt_done);
455}
456
457static void sdma_start_err_halt_wait(struct sdma_engine *sde)
458{
459 schedule_work(&sde->err_halt_worker);
460}
461
462
463static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
464{
465 if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
466
467 unsigned index;
468 struct hfi1_devdata *dd = sde->dd;
469
470 for (index = 0; index < dd->num_sdma; index++) {
471 struct sdma_engine *curr_sdma = &dd->per_sdma[index];
472
473 if (curr_sdma != sde)
474 curr_sdma->progress_check_head =
475 curr_sdma->descq_head;
476 }
477 dd_dev_err(sde->dd,
478 "SDMA engine %d - check scheduled\n",
479 sde->this_idx);
480 mod_timer(&sde->err_progress_check_timer, jiffies + 10);
481 }
482}
483
484static void sdma_err_progress_check(unsigned long data)
485{
486 unsigned index;
487 struct sdma_engine *sde = (struct sdma_engine *)data;
488
489 dd_dev_err(sde->dd, "SDE progress check event\n");
490 for (index = 0; index < sde->dd->num_sdma; index++) {
491 struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
492 unsigned long flags;
493
494 /* check progress on each engine except the current one */
495 if (curr_sde == sde)
496 continue;
497 /*
498 * We must lock interrupts when acquiring sde->lock,
499 * to avoid a deadlock if interrupt triggers and spins on
500 * the same lock on same CPU
501 */
502 spin_lock_irqsave(&curr_sde->tail_lock, flags);
503 write_seqlock(&curr_sde->head_lock);
504
505 /* skip non-running queues */
506 if (curr_sde->state.current_state != sdma_state_s99_running) {
507 write_sequnlock(&curr_sde->head_lock);
508 spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
509 continue;
510 }
511
512 if ((curr_sde->descq_head != curr_sde->descq_tail) &&
513 (curr_sde->descq_head ==
514 curr_sde->progress_check_head))
515 __sdma_process_event(curr_sde,
516 sdma_event_e90_sw_halted);
517 write_sequnlock(&curr_sde->head_lock);
518 spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
519 }
520 schedule_work(&sde->err_halt_worker);
521}
522
523static void sdma_hw_clean_up_task(unsigned long opaque)
524{
525 struct sdma_engine *sde = (struct sdma_engine *) opaque;
526 u64 statuscsr;
527
528 while (1) {
529#ifdef CONFIG_SDMA_VERBOSITY
530 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
531 sde->this_idx, slashstrip(__FILE__), __LINE__,
532 __func__);
533#endif
534 statuscsr = read_sde_csr(sde, SD(STATUS));
535 statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
536 if (statuscsr)
537 break;
538 udelay(10);
539 }
540
541 sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
542}
543
544static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
545{
546 smp_read_barrier_depends(); /* see sdma_update_tail() */
547 return sde->tx_ring[sde->tx_head & sde->sdma_mask];
548}
549
550/*
551 * flush ring for recovery
552 */
553static void sdma_flush_descq(struct sdma_engine *sde)
554{
555 u16 head, tail;
556 int progress = 0;
557 struct sdma_txreq *txp = get_txhead(sde);
558
559 /* The reason for some of the complexity of this code is that
560 * not all descriptors have corresponding txps. So, we have to
561 * be able to skip over descs until we wander into the range of
562 * the next txp on the list.
563 */
564 head = sde->descq_head & sde->sdma_mask;
565 tail = sde->descq_tail & sde->sdma_mask;
566 while (head != tail) {
567 /* advance head, wrap if needed */
568 head = ++sde->descq_head & sde->sdma_mask;
569 /* if now past this txp's descs, do the callback */
570 if (txp && txp->next_descq_idx == head) {
571 int drained = 0;
572 /* protect against complete modifying */
573 struct iowait *wait = txp->wait;
574
575 /* remove from list */
576 sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
577 if (wait)
578 drained = atomic_dec_and_test(&wait->sdma_busy);
579#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
580 trace_hfi1_sdma_out_sn(sde, txp->sn);
581 if (WARN_ON_ONCE(sde->head_sn != txp->sn))
582 dd_dev_err(sde->dd, "expected %llu got %llu\n",
583 sde->head_sn, txp->sn);
584 sde->head_sn++;
585#endif
586 sdma_txclean(sde->dd, txp);
587 trace_hfi1_sdma_progress(sde, head, tail, txp);
588 if (txp->complete)
589 (*txp->complete)(
590 txp,
591 SDMA_TXREQ_S_ABORTED,
592 drained);
593 if (wait && drained)
594 iowait_drain_wakeup(wait);
595 /* see if there is another txp */
596 txp = get_txhead(sde);
597 }
598 progress++;
599 }
600 if (progress)
601 sdma_desc_avail(sde, sdma_descq_freecnt(sde));
602}
603
604static void sdma_sw_clean_up_task(unsigned long opaque)
605{
606 struct sdma_engine *sde = (struct sdma_engine *) opaque;
607 unsigned long flags;
608
609 spin_lock_irqsave(&sde->tail_lock, flags);
610 write_seqlock(&sde->head_lock);
611
612 /*
613 * At this point, the following should always be true:
614 * - We are halted, so no more descriptors are getting retired.
615 * - We are not running, so no one is submitting new work.
616 * - Only we can send the e40_sw_cleaned, so we can't start
617 * running again until we say so. So, the active list and
618 * descq are ours to play with.
619 */
620
621
622 /*
623 * In the error clean up sequence, software clean must be called
624 * before the hardware clean so we can use the hardware head in
625 * the progress routine. A hardware clean or SPC unfreeze will
626 * reset the hardware head.
627 *
628 * Process all retired requests. The progress routine will use the
629 * latest physical hardware head - we are not running so speed does
630 * not matter.
631 */
632 sdma_make_progress(sde, 0);
633
634 sdma_flush(sde);
635
636 /*
637 * Reset our notion of head and tail.
638 * Note that the HW registers have been reset via an earlier
639 * clean up.
640 */
641 sde->descq_tail = 0;
642 sde->descq_head = 0;
643 sde->desc_avail = sdma_descq_freecnt(sde);
644 *sde->head_dma = 0;
645
646 __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
647
648 write_sequnlock(&sde->head_lock);
649 spin_unlock_irqrestore(&sde->tail_lock, flags);
650}
651
652static void sdma_sw_tear_down(struct sdma_engine *sde)
653{
654 struct sdma_state *ss = &sde->state;
655
656 /* Releasing this reference means the state machine has stopped. */
657 sdma_put(ss);
658
659 /* stop waiting for all unfreeze events to complete */
660 atomic_set(&sde->dd->sdma_unfreeze_count, -1);
661 wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
662}
663
664static void sdma_start_hw_clean_up(struct sdma_engine *sde)
665{
666 tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
667}
668
669static void sdma_start_sw_clean_up(struct sdma_engine *sde)
670{
671 tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
672}
673
674static void sdma_set_state(struct sdma_engine *sde,
675 enum sdma_states next_state)
676{
677 struct sdma_state *ss = &sde->state;
678 const struct sdma_set_state_action *action = sdma_action_table;
679 unsigned op = 0;
680
681 trace_hfi1_sdma_state(
682 sde,
683 sdma_state_names[ss->current_state],
684 sdma_state_names[next_state]);
685
686 /* debugging bookkeeping */
687 ss->previous_state = ss->current_state;
688 ss->previous_op = ss->current_op;
689 ss->current_state = next_state;
690
691 if (ss->previous_state != sdma_state_s99_running
692 && next_state == sdma_state_s99_running)
693 sdma_flush(sde);
694
695 if (action[next_state].op_enable)
696 op |= SDMA_SENDCTRL_OP_ENABLE;
697
698 if (action[next_state].op_intenable)
699 op |= SDMA_SENDCTRL_OP_INTENABLE;
700
701 if (action[next_state].op_halt)
702 op |= SDMA_SENDCTRL_OP_HALT;
703
704 if (action[next_state].op_cleanup)
705 op |= SDMA_SENDCTRL_OP_CLEANUP;
706
707 if (action[next_state].go_s99_running_tofalse)
708 ss->go_s99_running = 0;
709
710 if (action[next_state].go_s99_running_totrue)
711 ss->go_s99_running = 1;
712
713 ss->current_op = op;
714 sdma_sendctrl(sde, ss->current_op);
715}
716
717/**
718 * sdma_get_descq_cnt() - called when device probed
719 *
720 * Return a validated descq count.
721 *
722 * This is currently only used in the verbs initialization to build the tx
723 * list.
724 *
725 * This will probably be deleted in favor of a more scalable approach to
726 * alloc tx's.
727 *
728 */
729u16 sdma_get_descq_cnt(void)
730{
731 u16 count = sdma_descq_cnt;
732
733 if (!count)
734 return SDMA_DESCQ_CNT;
735	/* count must be a power of 2 between 64 and 32768, inclusive.
736	 * Otherwise return default.
737 */
738 if (!is_power_of_2(count))
739 return SDMA_DESCQ_CNT;
740	if (count < 64 || count > 32768)
741 return SDMA_DESCQ_CNT;
742 return count;
743}
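/*
 * Editor's note (illustrative, not part of the driver): with the checks
 * above, a hypothetical module parameter of sdma_descq_cnt=2048 is used
 * as-is, while 0 (unset), 1000 (not a power of 2) and 32 (below 64) all
 * fall back to SDMA_DESCQ_CNT.
 */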
744/**
745 * sdma_select_engine_vl() - select sdma engine
746 * @dd: devdata
747 * @selector: a spreading factor
748 * @vl: this vl
749 *
750 *
751 * This function returns an engine based on the selector and a vl. The
752 * mapping fields are protected by RCU.
753 */
754struct sdma_engine *sdma_select_engine_vl(
755 struct hfi1_devdata *dd,
756 u32 selector,
757 u8 vl)
758{
759 struct sdma_vl_map *m;
760 struct sdma_map_elem *e;
761 struct sdma_engine *rval;
762
763 if (WARN_ON(vl > 8))
764 return NULL;
765
766 rcu_read_lock();
767 m = rcu_dereference(dd->sdma_map);
768 if (unlikely(!m)) {
769 rcu_read_unlock();
770 return NULL;
771 }
772 e = m->map[vl & m->mask];
773 rval = e->sde[selector & e->mask];
774 rcu_read_unlock();
775
776 trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
777 return rval;
778}
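#if 0
/*
 * Editor's illustrative sketch (not part of the driver): a caller spreads
 * flows across engines by passing any stable per-flow value as the
 * selector; 'flow_hash' below is such a hypothetical value.  The lookup
 * above reduces to map[vl & vl_mask]->sde[selector & engine_mask], so the
 * same hash and VL always land on the same engine.
 */
static struct sdma_engine *example_engine_for_flow(struct hfi1_devdata *dd,
						   u32 flow_hash, u8 vl)
{
	/* NULL means the map is not published yet or vl was out of range */
	return sdma_select_engine_vl(dd, flow_hash, vl);
}
#endif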
779
780/**
781 * sdma_select_engine_sc() - select sdma engine
782 * @dd: devdata
783 * @selector: a spreading factor
784 * @sc5: the 5 bit sc
785 *
786 *
787 * This function returns an engine based on the selector and an sc.
788 */
789struct sdma_engine *sdma_select_engine_sc(
790 struct hfi1_devdata *dd,
791 u32 selector,
792 u8 sc5)
793{
794 u8 vl = sc_to_vlt(dd, sc5);
795
796 return sdma_select_engine_vl(dd, selector, vl);
797}
798
799/*
800 * Free the indicated map struct
801 */
802static void sdma_map_free(struct sdma_vl_map *m)
803{
804 int i;
805
806 for (i = 0; m && i < m->actual_vls; i++)
807 kfree(m->map[i]);
808 kfree(m);
809}
810
811/*
812 * Handle RCU callback
813 */
814static void sdma_map_rcu_callback(struct rcu_head *list)
815{
816 struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
817
818 sdma_map_free(m);
819}
820
821/**
822 * sdma_map_init - called when # vls change
823 * @dd: hfi1_devdata
824 * @port: port number
825 * @num_vls: number of vls
826 * @vl_engines: per vl engine mapping (optional)
827 *
828 * This routine changes the mapping based on the number of vls.
829 *
830 * vl_engines is used to specify a non-uniform vl/engine loading. NULL
831 * implies auto computing the loading and giving each VL a uniform
832 * distribution of engines per VL.
833 *
834 * The auto algorithm computes the sde_per_vl and the number of extra
835 * engines. Any extra engines are added from the last VL on down.
836 *
837 * rcu locking is used here to control access to the mapping fields.
838 *
839 * If either num_vls or num_sdma is not a power of 2, the array sizes
840 * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
841 * up to the next highest power of 2 and the first entry is reused
842 * in a round robin fashion.
843 *
844 * If an error occurs, the map change is abandoned and the existing
845 * mapping is left unchanged.
846 *
847 */
848int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
849{
850 int i, j;
851 int extra, sde_per_vl;
852 int engine = 0;
853 u8 lvl_engines[OPA_MAX_VLS];
854 struct sdma_vl_map *oldmap, *newmap;
855
856 if (!(dd->flags & HFI1_HAS_SEND_DMA))
857 return 0;
858
859 if (!vl_engines) {
860 /* truncate divide */
861 sde_per_vl = dd->num_sdma / num_vls;
862 /* extras */
863 extra = dd->num_sdma % num_vls;
864 vl_engines = lvl_engines;
865 /* add extras from last vl down */
866 for (i = num_vls - 1; i >= 0; i--, extra--)
867 vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
868 }
869 /* build new map */
870 newmap = kzalloc(
871 sizeof(struct sdma_vl_map) +
872 roundup_pow_of_two(num_vls) *
873 sizeof(struct sdma_map_elem *),
874 GFP_KERNEL);
875 if (!newmap)
876 goto bail;
877 newmap->actual_vls = num_vls;
878 newmap->vls = roundup_pow_of_two(num_vls);
879 newmap->mask = (1 << ilog2(newmap->vls)) - 1;
880 for (i = 0; i < newmap->vls; i++) {
881 /* save for wrap around */
882 int first_engine = engine;
883
884 if (i < newmap->actual_vls) {
885 int sz = roundup_pow_of_two(vl_engines[i]);
886
887 /* only allocate once */
888 newmap->map[i] = kzalloc(
889 sizeof(struct sdma_map_elem) +
890 sz * sizeof(struct sdma_engine *),
891 GFP_KERNEL);
892 if (!newmap->map[i])
893 goto bail;
894 newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
895 /* assign engines */
896 for (j = 0; j < sz; j++) {
897 newmap->map[i]->sde[j] =
898 &dd->per_sdma[engine];
899 if (++engine >= first_engine + vl_engines[i])
900 /* wrap back to first engine */
901 engine = first_engine;
902 }
903 } else {
904 /* just re-use entry without allocating */
905 newmap->map[i] = newmap->map[i % num_vls];
906 }
907 engine = first_engine + vl_engines[i];
908 }
909 /* newmap in hand, save old map */
910 spin_lock_irq(&dd->sde_map_lock);
911 oldmap = rcu_dereference_protected(dd->sdma_map,
912 lockdep_is_held(&dd->sde_map_lock));
913
914 /* publish newmap */
915 rcu_assign_pointer(dd->sdma_map, newmap);
916
917 spin_unlock_irq(&dd->sde_map_lock);
918 /* success, free any old map after grace period */
919 if (oldmap)
920 call_rcu(&oldmap->list, sdma_map_rcu_callback);
921 return 0;
922bail:
923 /* free any partial allocation */
924 sdma_map_free(newmap);
925 return -ENOMEM;
926}
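/*
 * Editor's note (a worked example, not part of the driver): a hypothetical
 * configuration of 16 engines and 3 VLs gives sde_per_vl = 5 and extra = 1,
 * so vl_engines = {5, 5, 6} (the extra engine goes to the last VL).
 * newmap->vls is rounded up to 4 (mask 3) and map entry 3 simply reuses
 * entry 0.  Each per-VL element is rounded up to 8 slots and filled round
 * robin: VL0 gets engines 0-4 (then 0, 1, 2 again), VL1 gets 5-9 (then
 * 5, 6, 7), and VL2 gets 10-15 plus 10, 11 to pad out its 8 slots.
 */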
927
928/*
929 * Clean up allocated memory.
930 *
931 * This routine can be called regardless of the success of sdma_init().
932 *
933 */
934static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
935{
936 size_t i;
937 struct sdma_engine *sde;
938
939 if (dd->sdma_pad_dma) {
940 dma_free_coherent(&dd->pcidev->dev, 4,
941 (void *)dd->sdma_pad_dma,
942 dd->sdma_pad_phys);
943 dd->sdma_pad_dma = NULL;
944 dd->sdma_pad_phys = 0;
945 }
946 if (dd->sdma_heads_dma) {
947 dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
948 (void *)dd->sdma_heads_dma,
949 dd->sdma_heads_phys);
950 dd->sdma_heads_dma = NULL;
951 dd->sdma_heads_phys = 0;
952 }
953 for (i = 0; dd->per_sdma && i < num_engines; ++i) {
954 sde = &dd->per_sdma[i];
955
956 sde->head_dma = NULL;
957 sde->head_phys = 0;
958
959 if (sde->descq) {
960 dma_free_coherent(
961 &dd->pcidev->dev,
962 sde->descq_cnt * sizeof(u64[2]),
963 sde->descq,
964 sde->descq_phys
965 );
966 sde->descq = NULL;
967 sde->descq_phys = 0;
968 }
969 if (is_vmalloc_addr(sde->tx_ring))
970 vfree(sde->tx_ring);
971 else
972 kfree(sde->tx_ring);
973 sde->tx_ring = NULL;
974 }
975 spin_lock_irq(&dd->sde_map_lock);
976 kfree(rcu_access_pointer(dd->sdma_map));
977 RCU_INIT_POINTER(dd->sdma_map, NULL);
978 spin_unlock_irq(&dd->sde_map_lock);
979 synchronize_rcu();
980 kfree(dd->per_sdma);
981 dd->per_sdma = NULL;
982}
983
984/**
985 * sdma_init() - called when device probed
986 * @dd: hfi1_devdata
987 * @port: port number (currently only zero)
988 *
989 * sdma_init initializes the specified number of engines.
990 *
991 * The code initializes each sde and its csrs. Interrupts
992 * are not required to be enabled.
993 *
994 * Returns:
995 * 0 - success, -errno on failure
996 */
997int sdma_init(struct hfi1_devdata *dd, u8 port)
998{
999 unsigned this_idx;
1000 struct sdma_engine *sde;
1001 u16 descq_cnt;
1002 void *curr_head;
1003 struct hfi1_pportdata *ppd = dd->pport + port;
1004 u32 per_sdma_credits;
1005 uint idle_cnt = sdma_idle_cnt;
1006 size_t num_engines = dd->chip_sdma_engines;
1007
1008 if (!HFI1_CAP_IS_KSET(SDMA)) {
1009 HFI1_CAP_CLEAR(SDMA_AHG);
1010 return 0;
1011 }
1012 if (mod_num_sdma &&
1013 /* can't exceed chip support */
1014 mod_num_sdma <= dd->chip_sdma_engines &&
1015 /* count must be >= vls */
1016 mod_num_sdma >= num_vls)
1017 num_engines = mod_num_sdma;
1018
1019 dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
1020 dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
1021 dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
1022 dd->chip_sdma_mem_size);
1023
1024 per_sdma_credits =
1025 dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE);
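	/*
	 * Editor's note (illustrative, hypothetical numbers): the send-side
	 * SDMA memory is split evenly across the engines, so with, say,
	 * 1 MB of SDMA memory, 16 engines and a 64-byte block size this
	 * works out to 1048576 / (16 * 64) = 1024 credits per engine,
	 * later programmed into SD(MEMORY) by init_sdma_regs().
	 */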
1026
1027 /* set up freeze waitqueue */
1028 init_waitqueue_head(&dd->sdma_unfreeze_wq);
1029 atomic_set(&dd->sdma_unfreeze_count, 0);
1030
1031 descq_cnt = sdma_get_descq_cnt();
1032 dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
1033 num_engines, descq_cnt);
1034
1035 /* alloc memory for array of send engines */
1036 dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
1037 if (!dd->per_sdma)
1038 return -ENOMEM;
1039
1040 idle_cnt = ns_to_cclock(dd, idle_cnt);
1041 /* Allocate memory for SendDMA descriptor FIFOs */
1042 for (this_idx = 0; this_idx < num_engines; ++this_idx) {
1043 sde = &dd->per_sdma[this_idx];
1044 sde->dd = dd;
1045 sde->ppd = ppd;
1046 sde->this_idx = this_idx;
1047 sde->descq_cnt = descq_cnt;
1048 sde->desc_avail = sdma_descq_freecnt(sde);
1049 sde->sdma_shift = ilog2(descq_cnt);
1050 sde->sdma_mask = (1 << sde->sdma_shift) - 1;
1051 sde->descq_full_count = 0;
1052
1053 /* Create a mask for all 3 chip interrupt sources */
1054 sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx)
1055 | (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx)
1056 | (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
1057 /* Create a mask specifically for sdma_idle */
1058 sde->idle_mask =
1059 (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
1060 /* Create a mask specifically for sdma_progress */
1061 sde->progress_mask =
1062 (u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx);
1063 spin_lock_init(&sde->tail_lock);
1064 seqlock_init(&sde->head_lock);
1065 spin_lock_init(&sde->senddmactrl_lock);
1066 spin_lock_init(&sde->flushlist_lock);
1067		/* ensure there is always a zero bit */
1068 sde->ahg_bits = 0xfffffffe00000000ULL;
1069
1070 sdma_set_state(sde, sdma_state_s00_hw_down);
1071
1072 /* set up reference counting */
1073 kref_init(&sde->state.kref);
1074 init_completion(&sde->state.comp);
1075
1076 INIT_LIST_HEAD(&sde->flushlist);
1077 INIT_LIST_HEAD(&sde->dmawait);
1078
1079 sde->tail_csr =
1080 get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
1081
1082 if (idle_cnt)
1083 dd->default_desc1 =
1084 SDMA_DESC1_HEAD_TO_HOST_FLAG;
1085 else
1086 dd->default_desc1 =
1087 SDMA_DESC1_INT_REQ_FLAG;
1088
1089 tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
1090 (unsigned long)sde);
1091
1092 tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
1093 (unsigned long)sde);
1094 INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
1095 INIT_WORK(&sde->flush_worker, sdma_field_flush);
1096
1097 sde->progress_check_head = 0;
1098
1099 init_timer(&sde->err_progress_check_timer);
1100 sde->err_progress_check_timer.function =
1101 sdma_err_progress_check;
1102 sde->err_progress_check_timer.data = (unsigned long)sde;
1103
1104 sde->descq = dma_zalloc_coherent(
1105 &dd->pcidev->dev,
1106 descq_cnt * sizeof(u64[2]),
1107 &sde->descq_phys,
1108 GFP_KERNEL
1109 );
1110 if (!sde->descq)
1111 goto bail;
1112 sde->tx_ring =
1113 kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
1114 GFP_KERNEL);
1115 if (!sde->tx_ring)
1116 sde->tx_ring =
1117 vzalloc(
1118 sizeof(struct sdma_txreq *) *
1119 descq_cnt);
1120 if (!sde->tx_ring)
1121 goto bail;
1122 }
1123
1124 dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
1125 /* Allocate memory for DMA of head registers to memory */
1126 dd->sdma_heads_dma = dma_zalloc_coherent(
1127 &dd->pcidev->dev,
1128 dd->sdma_heads_size,
1129 &dd->sdma_heads_phys,
1130 GFP_KERNEL
1131 );
1132 if (!dd->sdma_heads_dma) {
1133 dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
1134 goto bail;
1135 }
1136
1137 /* Allocate memory for pad */
1138 dd->sdma_pad_dma = dma_zalloc_coherent(
1139 &dd->pcidev->dev,
1140 sizeof(u32),
1141 &dd->sdma_pad_phys,
1142 GFP_KERNEL
1143 );
1144 if (!dd->sdma_pad_dma) {
1145 dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
1146 goto bail;
1147 }
1148
1149 /* assign each engine to different cacheline and init registers */
1150 curr_head = (void *)dd->sdma_heads_dma;
1151 for (this_idx = 0; this_idx < num_engines; ++this_idx) {
1152 unsigned long phys_offset;
1153
1154 sde = &dd->per_sdma[this_idx];
1155
1156 sde->head_dma = curr_head;
1157 curr_head += L1_CACHE_BYTES;
1158 phys_offset = (unsigned long)sde->head_dma -
1159 (unsigned long)dd->sdma_heads_dma;
1160 sde->head_phys = dd->sdma_heads_phys + phys_offset;
1161 init_sdma_regs(sde, per_sdma_credits, idle_cnt);
1162 }
1163 dd->flags |= HFI1_HAS_SEND_DMA;
1164 dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
1165 dd->num_sdma = num_engines;
1166 if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
1167 goto bail;
1168 dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
1169 return 0;
1170
1171bail:
1172 sdma_clean(dd, num_engines);
1173 return -ENOMEM;
1174}
1175
1176/**
1177 * sdma_all_running() - called when the link goes up
1178 * @dd: hfi1_devdata
1179 *
1180 * This routine moves all engines to the running state.
1181 */
1182void sdma_all_running(struct hfi1_devdata *dd)
1183{
1184 struct sdma_engine *sde;
1185 unsigned int i;
1186
1187 /* move all engines to running */
1188 for (i = 0; i < dd->num_sdma; ++i) {
1189 sde = &dd->per_sdma[i];
1190 sdma_process_event(sde, sdma_event_e30_go_running);
1191 }
1192}
1193
1194/**
1195 * sdma_all_idle() - called when the link goes down
1196 * @dd: hfi1_devdata
1197 *
1198 * This routine moves all engines to the idle state.
1199 */
1200void sdma_all_idle(struct hfi1_devdata *dd)
1201{
1202 struct sdma_engine *sde;
1203 unsigned int i;
1204
1205 /* idle all engines */
1206 for (i = 0; i < dd->num_sdma; ++i) {
1207 sde = &dd->per_sdma[i];
1208 sdma_process_event(sde, sdma_event_e70_go_idle);
1209 }
1210}
1211
1212/**
1213 * sdma_start() - called to kick off state processing for all engines
1214 * @dd: hfi1_devdata
1215 *
1216 * This routine is for kicking off the state processing for all required
1217 * sdma engines. Interrupts need to be working at this point.
1218 *
1219 */
1220void sdma_start(struct hfi1_devdata *dd)
1221{
1222 unsigned i;
1223 struct sdma_engine *sde;
1224
1225 /* kick off the engines state processing */
1226 for (i = 0; i < dd->num_sdma; ++i) {
1227 sde = &dd->per_sdma[i];
1228 sdma_process_event(sde, sdma_event_e10_go_hw_start);
1229 }
1230}
1231
1232/**
1233 * sdma_exit() - used when module is removed
1234 * @dd: hfi1_devdata
1235 */
1236void sdma_exit(struct hfi1_devdata *dd)
1237{
1238 unsigned this_idx;
1239 struct sdma_engine *sde;
1240
1241 for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
1242 ++this_idx) {
1243
1244 sde = &dd->per_sdma[this_idx];
1245 if (!list_empty(&sde->dmawait))
1246 dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
1247 sde->this_idx);
1248 sdma_process_event(sde, sdma_event_e00_go_hw_down);
1249
1250 del_timer_sync(&sde->err_progress_check_timer);
1251
1252 /*
1253 * This waits for the state machine to exit so it is not
1254 * necessary to kill the sdma_sw_clean_up_task to make sure
1255 * it is not running.
1256 */
1257 sdma_finalput(&sde->state);
1258 }
1259 sdma_clean(dd, dd->num_sdma);
1260}
1261
1262/*
1263 * unmap the indicated descriptor
1264 */
1265static inline void sdma_unmap_desc(
1266 struct hfi1_devdata *dd,
1267 struct sdma_desc *descp)
1268{
1269 switch (sdma_mapping_type(descp)) {
1270 case SDMA_MAP_SINGLE:
1271 dma_unmap_single(
1272 &dd->pcidev->dev,
1273 sdma_mapping_addr(descp),
1274 sdma_mapping_len(descp),
1275 DMA_TO_DEVICE);
1276 break;
1277 case SDMA_MAP_PAGE:
1278 dma_unmap_page(
1279 &dd->pcidev->dev,
1280 sdma_mapping_addr(descp),
1281 sdma_mapping_len(descp),
1282 DMA_TO_DEVICE);
1283 break;
1284 }
1285}
1286
1287/*
1288 * return the mode as indicated by the first
1289 * descriptor in the tx.
1290 */
1291static inline u8 ahg_mode(struct sdma_txreq *tx)
1292{
1293 return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
1294 >> SDMA_DESC1_HEADER_MODE_SHIFT;
1295}
1296
1297/**
1298 * sdma_txclean() - clean tx of mappings, descp *kmalloc's
1299 * @dd: hfi1_devdata for unmapping
1300 * @tx: tx request to clean
1301 *
1302 * This is used in the progress routine to clean the tx or
1303 * by the ULP to toss an in-process tx build.
1304 *
1305 * The code can be called multiple times without issue.
1306 *
1307 */
1308void sdma_txclean(
1309 struct hfi1_devdata *dd,
1310 struct sdma_txreq *tx)
1311{
1312 u16 i;
1313
1314 if (tx->num_desc) {
1315 u8 skip = 0, mode = ahg_mode(tx);
1316
1317 /* unmap first */
1318 sdma_unmap_desc(dd, &tx->descp[0]);
1319 /* determine number of AHG descriptors to skip */
1320 if (mode > SDMA_AHG_APPLY_UPDATE1)
1321 skip = mode >> 1;
1322 for (i = 1 + skip; i < tx->num_desc; i++)
1323 sdma_unmap_desc(dd, &tx->descp[i]);
1324 tx->num_desc = 0;
1325 }
1326 kfree(tx->coalesce_buf);
1327 tx->coalesce_buf = NULL;
1328 /* kmalloc'ed descp */
1329 if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
1330 tx->desc_limit = ARRAY_SIZE(tx->descs);
1331 kfree(tx->descp);
1332 }
1333}
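#if 0
/*
 * Editor's illustrative sketch (not part of the driver): a ULP tossing a
 * partially built request from a hypothetical error path.  sdma_txclean()
 * unmaps whatever descriptors were added and, per the comment above, is
 * safe to call again on the same tx.
 */
static void example_abort_tx_build(struct hfi1_devdata *dd,
				   struct sdma_txreq *tx)
{
	sdma_txclean(dd, tx);	/* release DMA mappings and coalesce buffer */
	sdma_txclean(dd, tx);	/* second call is a harmless no-op */
}
#endif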
1334
1335static inline u16 sdma_gethead(struct sdma_engine *sde)
1336{
1337 struct hfi1_devdata *dd = sde->dd;
1338 int use_dmahead;
1339 u16 hwhead;
1340
1341#ifdef CONFIG_SDMA_VERBOSITY
1342 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1343 sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1344#endif
1345
1346retry:
1347 use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
1348 (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
1349 hwhead = use_dmahead ?
1350 (u16) le64_to_cpu(*sde->head_dma) :
1351 (u16) read_sde_csr(sde, SD(HEAD));
1352
1353 if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
1354 u16 cnt;
1355 u16 swtail;
1356 u16 swhead;
1357 int sane;
1358
1359 swhead = sde->descq_head & sde->sdma_mask;
1360 /* this code is really bad for cache line trading */
1361 swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
1362 cnt = sde->descq_cnt;
1363
1364 if (swhead < swtail)
1365 /* not wrapped */
1366 sane = (hwhead >= swhead) & (hwhead <= swtail);
1367 else if (swhead > swtail)
1368 /* wrapped around */
1369 sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
1370 (hwhead <= swtail);
1371 else
1372 /* empty */
1373 sane = (hwhead == swhead);
1374
1375 if (unlikely(!sane)) {
1376 dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
1377 sde->this_idx,
1378 use_dmahead ? "dma" : "kreg",
1379 hwhead, swhead, swtail, cnt);
1380 if (use_dmahead) {
1381 /* try one more time, using csr */
1382 use_dmahead = 0;
1383 goto retry;
1384 }
1385 /* proceed as if no progress */
1386 hwhead = swhead;
1387 }
1388 }
1389 return hwhead;
1390}
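/*
 * Editor's note (a worked example, not part of the driver): the sanity
 * check above accepts only head values the hardware could legitimately
 * report.  With a hypothetical descq_cnt of 256, swhead = 200 and
 * swtail = 10 (the ring has wrapped), hwhead = 220 or hwhead = 5 is sane,
 * while hwhead = 100 is not; in that case the value is logged, re-read
 * once from the CSR, and finally treated as "no progress" if still insane.
 */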
1391
1392/*
1393 * This is called when there are send DMA descriptors that might be
1394 * available.
1395 *
1396 * This is called with head_lock held.
1397 */
1398static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
1399{
1400 struct iowait *wait, *nw;
1401 struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
1402 unsigned i, n = 0, seq;
1403 struct sdma_txreq *stx;
1404 struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
1405
1406#ifdef CONFIG_SDMA_VERBOSITY
1407 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
1408 slashstrip(__FILE__), __LINE__, __func__);
1409 dd_dev_err(sde->dd, "avail: %u\n", avail);
1410#endif
1411
1412 do {
1413 seq = read_seqbegin(&dev->iowait_lock);
1414 if (!list_empty(&sde->dmawait)) {
1415 /* at least one item */
1416 write_seqlock(&dev->iowait_lock);
1417 /* Harvest waiters wanting DMA descriptors */
1418 list_for_each_entry_safe(
1419 wait,
1420 nw,
1421 &sde->dmawait,
1422 list) {
1423 u16 num_desc = 0;
1424
1425 if (!wait->wakeup)
1426 continue;
1427 if (n == ARRAY_SIZE(waits))
1428 break;
1429 if (!list_empty(&wait->tx_head)) {
1430 stx = list_first_entry(
1431 &wait->tx_head,
1432 struct sdma_txreq,
1433 list);
1434 num_desc = stx->num_desc;
1435 }
1436 if (num_desc > avail)
1437 break;
1438 avail -= num_desc;
1439 list_del_init(&wait->list);
1440 waits[n++] = wait;
1441 }
1442 write_sequnlock(&dev->iowait_lock);
1443 break;
1444 }
1445 } while (read_seqretry(&dev->iowait_lock, seq));
1446
1447 for (i = 0; i < n; i++)
1448 waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
1449}
1450
1451/* head_lock must be held */
1452static void sdma_make_progress(struct sdma_engine *sde, u64 status)
1453{
1454 struct sdma_txreq *txp = NULL;
1455 int progress = 0;
1456 u16 hwhead, swhead, swtail;
1457 int idle_check_done = 0;
1458
1459 hwhead = sdma_gethead(sde);
1460
1461 /* The reason for some of the complexity of this code is that
1462 * not all descriptors have corresponding txps. So, we have to
1463 * be able to skip over descs until we wander into the range of
1464 * the next txp on the list.
1465 */
1466
1467retry:
1468 txp = get_txhead(sde);
1469 swhead = sde->descq_head & sde->sdma_mask;
1470 trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
1471 while (swhead != hwhead) {
1472 /* advance head, wrap if needed */
1473 swhead = ++sde->descq_head & sde->sdma_mask;
1474
1475 /* if now past this txp's descs, do the callback */
1476 if (txp && txp->next_descq_idx == swhead) {
1477 int drained = 0;
1478 /* protect against complete modifying */
1479 struct iowait *wait = txp->wait;
1480
1481 /* remove from list */
1482 sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
1483 if (wait)
1484 drained = atomic_dec_and_test(&wait->sdma_busy);
1485#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
1486 trace_hfi1_sdma_out_sn(sde, txp->sn);
1487 if (WARN_ON_ONCE(sde->head_sn != txp->sn))
1488 dd_dev_err(sde->dd, "expected %llu got %llu\n",
1489 sde->head_sn, txp->sn);
1490 sde->head_sn++;
1491#endif
1492 sdma_txclean(sde->dd, txp);
1493 if (txp->complete)
1494 (*txp->complete)(
1495 txp,
1496 SDMA_TXREQ_S_OK,
1497 drained);
1498 if (wait && drained)
1499 iowait_drain_wakeup(wait);
1500 /* see if there is another txp */
1501 txp = get_txhead(sde);
1502 }
1503 trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
1504 progress++;
1505 }
1506
1507 /*
1508 * The SDMA idle interrupt is not guaranteed to be ordered with respect
1509	 * to updates to the dma_head location in host memory. The head
1510 * value read might not be fully up to date. If there are pending
1511 * descriptors and the SDMA idle interrupt fired then read from the
1512 * CSR SDMA head instead to get the latest value from the hardware.
1513 * The hardware SDMA head should be read at most once in this invocation
1514	 * of sdma_make_progress(..), which is ensured by the idle_check_done flag.
1515 */
1516 if ((status & sde->idle_mask) && !idle_check_done) {
1517 swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
1518 if (swtail != hwhead) {
1519 hwhead = (u16)read_sde_csr(sde, SD(HEAD));
1520 idle_check_done = 1;
1521 goto retry;
1522 }
1523 }
1524
1525 sde->last_status = status;
1526 if (progress)
1527 sdma_desc_avail(sde, sdma_descq_freecnt(sde));
1528}
1529
1530/*
1531 * sdma_engine_interrupt() - interrupt handler for engine
1532 * @sde: sdma engine
1533 * @status: sdma interrupt reason
1534 *
1535 * Status is a mask of the 3 possible interrupts for this engine. It will
1536 * contain bits _only_ for this SDMA engine. It will contain at least one
1537 * bit, it may contain more.
1538 */
1539void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
1540{
1541 trace_hfi1_sdma_engine_interrupt(sde, status);
1542 write_seqlock(&sde->head_lock);
1543 sdma_set_desc_cnt(sde, sde->descq_cnt / 2);
1544 sdma_make_progress(sde, status);
1545 write_sequnlock(&sde->head_lock);
1546}
1547
1548/**
1549 * sdma_engine_error() - error handler for engine
1550 * @sde: sdma engine
1551 * @status: sdma interrupt reason
1552 */
1553void sdma_engine_error(struct sdma_engine *sde, u64 status)
1554{
1555 unsigned long flags;
1556
1557#ifdef CONFIG_SDMA_VERBOSITY
1558 dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
1559 sde->this_idx,
1560 (unsigned long long)status,
1561 sdma_state_names[sde->state.current_state]);
1562#endif
1563 spin_lock_irqsave(&sde->tail_lock, flags);
1564 write_seqlock(&sde->head_lock);
1565 if (status & ALL_SDMA_ENG_HALT_ERRS)
1566 __sdma_process_event(sde, sdma_event_e60_hw_halted);
1567 if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
1568 dd_dev_err(sde->dd,
1569 "SDMA (%u) engine error: 0x%llx state %s\n",
1570 sde->this_idx,
1571 (unsigned long long)status,
1572 sdma_state_names[sde->state.current_state]);
1573 dump_sdma_state(sde);
1574 }
1575 write_sequnlock(&sde->head_lock);
1576 spin_unlock_irqrestore(&sde->tail_lock, flags);
1577}
1578
1579static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
1580{
1581 u64 set_senddmactrl = 0;
1582 u64 clr_senddmactrl = 0;
1583 unsigned long flags;
1584
1585#ifdef CONFIG_SDMA_VERBOSITY
1586 dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
1587 sde->this_idx,
1588 (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
1589 (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
1590 (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
1591 (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
1592#endif
1593
1594 if (op & SDMA_SENDCTRL_OP_ENABLE)
1595 set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
1596 else
1597 clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
1598
1599 if (op & SDMA_SENDCTRL_OP_INTENABLE)
1600 set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
1601 else
1602 clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
1603
1604 if (op & SDMA_SENDCTRL_OP_HALT)
1605 set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
1606 else
1607 clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
1608
1609 spin_lock_irqsave(&sde->senddmactrl_lock, flags);
1610
1611 sde->p_senddmactrl |= set_senddmactrl;
1612 sde->p_senddmactrl &= ~clr_senddmactrl;
1613
1614 if (op & SDMA_SENDCTRL_OP_CLEANUP)
1615 write_sde_csr(sde, SD(CTRL),
1616 sde->p_senddmactrl |
1617 SD(CTRL_SDMA_CLEANUP_SMASK));
1618 else
1619 write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
1620
1621 spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
1622
1623#ifdef CONFIG_SDMA_VERBOSITY
1624 sdma_dumpstate(sde);
1625#endif
1626}
1627
1628static void sdma_setlengen(struct sdma_engine *sde)
1629{
1630#ifdef CONFIG_SDMA_VERBOSITY
1631 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1632 sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1633#endif
1634
1635 /*
1636 * Set SendDmaLenGen and clear-then-set the MSB of the generation
1637 * count to enable generation checking and load the internal
1638 * generation counter.
1639 */
1640 write_sde_csr(sde, SD(LEN_GEN),
1641 (sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)
1642 );
1643 write_sde_csr(sde, SD(LEN_GEN),
1644 ((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT))
1645 | (4ULL << SD(LEN_GEN_GENERATION_SHIFT))
1646 );
1647}
1648
1649static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
1650{
1651 /* Commit writes to memory and advance the tail on the chip */
1652 smp_wmb(); /* see get_txhead() */
1653 writeq(tail, sde->tail_csr);
1654}
1655
1656/*
1657 * This is called when changing to state s10_hw_start_up_halt_wait as
1658 * a result of send buffer errors or send DMA descriptor errors.
1659 */
1660static void sdma_hw_start_up(struct sdma_engine *sde)
1661{
1662 u64 reg;
1663
1664#ifdef CONFIG_SDMA_VERBOSITY
1665 dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1666 sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1667#endif
1668
1669 sdma_setlengen(sde);
1670 sdma_update_tail(sde, 0); /* Set SendDmaTail */
1671 *sde->head_dma = 0;
1672
1673 reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
1674 SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
1675 write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
1676}
1677
1678#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
1679(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
1680
1681#define SET_STATIC_RATE_CONTROL_SMASK(r) \
1682(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
1683/*
1684 * set_sdma_integrity
1685 *
1686 * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
1687 */
1688static void set_sdma_integrity(struct sdma_engine *sde)
1689{
1690 struct hfi1_devdata *dd = sde->dd;
1691 u64 reg;
1692
1693 if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
1694 return;
1695
1696 reg = hfi1_pkt_base_sdma_integrity(dd);
1697
1698 if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
1699 CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
1700 else
1701 SET_STATIC_RATE_CONTROL_SMASK(reg);
1702
1703 write_sde_csr(sde, SD(CHECK_ENABLE), reg);
1704}
1705
1706
1707static void init_sdma_regs(
1708 struct sdma_engine *sde,
1709 u32 credits,
1710 uint idle_cnt)
1711{
1712 u8 opval, opmask;
1713#ifdef CONFIG_SDMA_VERBOSITY
1714 struct hfi1_devdata *dd = sde->dd;
1715
1716 dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1717 sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1718#endif
1719
1720 write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
1721 sdma_setlengen(sde);
1722 sdma_update_tail(sde, 0); /* Set SendDmaTail */
1723 write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
1724 write_sde_csr(sde, SD(DESC_CNT), 0);
1725 write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
1726 write_sde_csr(sde, SD(MEMORY),
1727 ((u64)credits <<
1728 SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
1729 ((u64)(credits * sde->this_idx) <<
1730 SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
1731 write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
1732 set_sdma_integrity(sde);
1733 opmask = OPCODE_CHECK_MASK_DISABLED;
1734 opval = OPCODE_CHECK_VAL_DISABLED;
1735 write_sde_csr(sde, SD(CHECK_OPCODE),
1736 (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
1737 (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
1738}
1739
1740#ifdef CONFIG_SDMA_VERBOSITY
1741
1742#define sdma_dumpstate_helper0(reg) do { \
1743 csr = read_csr(sde->dd, reg); \
1744 dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \
1745 } while (0)
1746
1747#define sdma_dumpstate_helper(reg) do { \
1748 csr = read_sde_csr(sde, reg); \
1749 dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
1750 #reg, sde->this_idx, csr); \
1751 } while (0)
1752
1753#define sdma_dumpstate_helper2(reg) do { \
1754 csr = read_csr(sde->dd, reg + (8 * i)); \
1755 dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \
1756 #reg, i, csr); \
1757 } while (0)
1758
1759void sdma_dumpstate(struct sdma_engine *sde)
1760{
1761 u64 csr;
1762 unsigned i;
1763
1764 sdma_dumpstate_helper(SD(CTRL));
1765 sdma_dumpstate_helper(SD(STATUS));
1766 sdma_dumpstate_helper0(SD(ERR_STATUS));
1767 sdma_dumpstate_helper0(SD(ERR_MASK));
1768 sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
1769 sdma_dumpstate_helper(SD(ENG_ERR_MASK));
1770
1771 for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
1772 sdma_dumpstate_helper2(CCE_INT_STATUS);
1773 sdma_dumpstate_helper2(CCE_INT_MASK);
1774 sdma_dumpstate_helper2(CCE_INT_BLOCKED);
1775 }
1776
1777 sdma_dumpstate_helper(SD(TAIL));
1778 sdma_dumpstate_helper(SD(HEAD));
1779 sdma_dumpstate_helper(SD(PRIORITY_THLD));
1780 sdma_dumpstate_helper(SD(IDLE_CNT));
1781 sdma_dumpstate_helper(SD(RELOAD_CNT));
1782 sdma_dumpstate_helper(SD(DESC_CNT));
1783 sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
1784 sdma_dumpstate_helper(SD(MEMORY));
1785 sdma_dumpstate_helper0(SD(ENGINES));
1786 sdma_dumpstate_helper0(SD(MEM_SIZE));
1787 /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */
1788 sdma_dumpstate_helper(SD(BASE_ADDR));
1789 sdma_dumpstate_helper(SD(LEN_GEN));
1790 sdma_dumpstate_helper(SD(HEAD_ADDR));
1791 sdma_dumpstate_helper(SD(CHECK_ENABLE));
1792 sdma_dumpstate_helper(SD(CHECK_VL));
1793 sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
1794 sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
1795 sdma_dumpstate_helper(SD(CHECK_SLID));
1796 sdma_dumpstate_helper(SD(CHECK_OPCODE));
1797}
1798#endif
1799
1800static void dump_sdma_state(struct sdma_engine *sde)
1801{
1802 struct hw_sdma_desc *descq;
1803 struct hw_sdma_desc *descqp;
1804 u64 desc[2];
1805 u64 addr;
1806 u8 gen;
1807 u16 len;
1808 u16 head, tail, cnt;
1809
1810 head = sde->descq_head & sde->sdma_mask;
1811 tail = sde->descq_tail & sde->sdma_mask;
1812 cnt = sdma_descq_freecnt(sde);
1813 descq = sde->descq;
1814
1815 dd_dev_err(sde->dd,
1816 "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
1817 sde->this_idx,
1818 head,
1819 tail,
1820 cnt,
1821 !list_empty(&sde->flushlist));
1822
1823 /* print info for each entry in the descriptor queue */
1824 while (head != tail) {
1825 char flags[6] = { 'x', 'x', 'x', 'x', 0 };
1826
1827 descqp = &sde->descq[head];
1828 desc[0] = le64_to_cpu(descqp->qw[0]);
1829 desc[1] = le64_to_cpu(descqp->qw[1]);
1830 flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
1831 flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
1832 'H' : '-';
1833 flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
1834 flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
1835 addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
1836 & SDMA_DESC0_PHY_ADDR_MASK;
1837 gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
1838 & SDMA_DESC1_GENERATION_MASK;
1839 len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
1840 & SDMA_DESC0_BYTE_COUNT_MASK;
1841 dd_dev_err(sde->dd,
1842 "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
1843 head, flags, addr, gen, len);
1844 dd_dev_err(sde->dd,
1845 "\tdesc0:0x%016llx desc1 0x%016llx\n",
1846 desc[0], desc[1]);
1847 if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
1848 dd_dev_err(sde->dd,
1849 "\taidx: %u amode: %u alen: %u\n",
1850 (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
1851			   >> SDMA_DESC1_HEADER_INDEX_SHIFT),
1852 (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
1853 >> SDMA_DESC1_HEADER_MODE_SHIFT),
1854 (u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK)
1855 >> SDMA_DESC1_HEADER_DWS_SHIFT));
1856 head++;
1857 head &= sde->sdma_mask;
1858 }
1859}
1860
1861#define SDE_FMT \
1862 "SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
1863/**
1864 * sdma_seqfile_dump_sde() - debugfs dump of sde
1865 * @s: seq file
1866 * @sde: send dma engine to dump
1867 *
1868 * This routine dumps the sde to the indicated seq file.
1869 */
1870void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
1871{
1872 u16 head, tail;
1873 struct hw_sdma_desc *descqp;
1874 u64 desc[2];
1875 u64 addr;
1876 u8 gen;
1877 u16 len;
1878
1879 head = sde->descq_head & sde->sdma_mask;
1880 tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
1881 seq_printf(s, SDE_FMT, sde->this_idx,
1882 sdma_state_name(sde->state.current_state),
1883 (unsigned long long)read_sde_csr(sde, SD(CTRL)),
1884 (unsigned long long)read_sde_csr(sde, SD(STATUS)),
1885 (unsigned long long)read_sde_csr(sde,
1886 SD(ENG_ERR_STATUS)),
1887 (unsigned long long)read_sde_csr(sde, SD(TAIL)),
1888 tail,
1889 (unsigned long long)read_sde_csr(sde, SD(HEAD)),
1890 head,
1891 (unsigned long long)le64_to_cpu(*sde->head_dma),
1892 (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
1893 (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
1894 (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
1895 (unsigned long long)sde->last_status,
1896 (unsigned long long)sde->ahg_bits,
1897 sde->tx_tail,
1898 sde->tx_head,
1899 sde->descq_tail,
1900 sde->descq_head,
1901 !list_empty(&sde->flushlist),
1902 sde->descq_full_count,
1903 (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
1904
1905 /* print info for each entry in the descriptor queue */
1906 while (head != tail) {
1907 char flags[6] = { 'x', 'x', 'x', 'x', 0 };
1908
1909 descqp = &sde->descq[head];
1910 desc[0] = le64_to_cpu(descqp->qw[0]);
1911 desc[1] = le64_to_cpu(descqp->qw[1]);
1912 flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
1913 flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
1914 'H' : '-';
1915 flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
1916 flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
1917 addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
1918 & SDMA_DESC0_PHY_ADDR_MASK;
1919 gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
1920 & SDMA_DESC1_GENERATION_MASK;
1921 len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
1922 & SDMA_DESC0_BYTE_COUNT_MASK;
1923 seq_printf(s,
1924 "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
1925 head, flags, addr, gen, len);
1926 if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
1927 seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
1928 (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
1929			   >> SDMA_DESC1_HEADER_INDEX_SHIFT),
1930 (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
1931 >> SDMA_DESC1_HEADER_MODE_SHIFT));
1932 head = (head + 1) & sde->sdma_mask;
1933 }
1934}
1935
1936/*
1937 * add the generation number into
1938 * the qw1 and return
1939 */
1940static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
1941{
1942 u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
1943
1944 qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
1945 qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
1946 << SDMA_DESC1_GENERATION_SHIFT;
1947 return qw1;
1948}
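/*
 * Editor's note (a worked example, not part of the driver): the generation
 * is simply the two bits of the free-running tail just above the ring
 * index.  With a hypothetical descq_cnt of 2048 (sdma_shift = 11), tails
 * 0-2047 carry generation 0, 2048-4095 generation 1, and so on, wrapping
 * 0..3; the hardware compares this 2-bit value against its internal
 * generation counter (see sdma_setlengen()) to catch stale descriptors.
 */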
1949
1950/*
1951 * This routine submits the indicated tx
1952 *
1953 * Space has already been guaranteed and
1954 * tail side of ring is locked.
1955 *
1956 * The hardware tail update is done
1957 * in the caller and that is facilitated
1958 * by returning the new tail.
1959 *
1960 * There is special case logic for ahg
1961 * to not add the generation number for
1962 * up to 2 descriptors that follow the
1963 * first descriptor.
1964 *
1965 */
1966static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
1967{
1968 int i;
1969 u16 tail;
1970 struct sdma_desc *descp = tx->descp;
1971 u8 skip = 0, mode = ahg_mode(tx);
1972
1973 tail = sde->descq_tail & sde->sdma_mask;
1974 sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
1975 sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
1976 trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
1977 tail, &sde->descq[tail]);
1978 tail = ++sde->descq_tail & sde->sdma_mask;
1979 descp++;
1980 if (mode > SDMA_AHG_APPLY_UPDATE1)
1981 skip = mode >> 1;
1982 for (i = 1; i < tx->num_desc; i++, descp++) {
1983 u64 qw1;
1984
1985 sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
1986 if (skip) {
1987 /* edits don't have generation */
1988 qw1 = descp->qw[1];
1989 skip--;
1990 } else {
1991 /* replace generation with real one for non-edits */
1992 qw1 = add_gen(sde, descp->qw[1]);
1993 }
1994 sde->descq[tail].qw[1] = cpu_to_le64(qw1);
1995 trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
1996 tail, &sde->descq[tail]);
1997 tail = ++sde->descq_tail & sde->sdma_mask;
1998 }
1999 tx->next_descq_idx = tail;
2000#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2001 tx->sn = sde->tail_sn++;
2002 trace_hfi1_sdma_in_sn(sde, tx->sn);
2003 WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
2004#endif
2005 sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
2006 sde->desc_avail -= tx->num_desc;
2007 return tail;
2008}
2009
2010/*
2011 * Check for progress
2012 */
2013static int sdma_check_progress(
2014 struct sdma_engine *sde,
2015 struct iowait *wait,
2016 struct sdma_txreq *tx)
2017{
2018 int ret;
2019
2020 sde->desc_avail = sdma_descq_freecnt(sde);
2021 if (tx->num_desc <= sde->desc_avail)
2022 return -EAGAIN;
2023 /* pulse the head_lock */
2024 if (wait && wait->sleep) {
2025 unsigned seq;
2026
2027 seq = raw_seqcount_begin(
2028 (const seqcount_t *)&sde->head_lock.seqcount);
2029 ret = wait->sleep(sde, wait, tx, seq);
2030 if (ret == -EAGAIN)
2031 sde->desc_avail = sdma_descq_freecnt(sde);
2032 } else
2033 ret = -EBUSY;
2034 return ret;
2035}
2036
2037/**
2038 * sdma_send_txreq() - submit a tx req to ring
2039 * @sde: sdma engine to use
2040 * @wait: wait structure to use when full (may be NULL)
2041 * @tx: sdma_txreq to submit
2042 *
2043 * The call submits the tx into the ring. If an iowait structure is non-NULL,
2044 * the packet will be queued to the list in wait.
2045 *
2046 * Return:
2047 * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
2048 * ring (wait == NULL)
2049 * -EIOCBQUEUED - tx queued to iowait, -ECOMM - bad sdma state
2050 */
2051int sdma_send_txreq(struct sdma_engine *sde,
2052 struct iowait *wait,
2053 struct sdma_txreq *tx)
2054{
2055 int ret = 0;
2056 u16 tail;
2057 unsigned long flags;
2058
2059 /* user should have supplied entire packet */
2060 if (unlikely(tx->tlen))
2061 return -EINVAL;
2062 tx->wait = wait;
2063 spin_lock_irqsave(&sde->tail_lock, flags);
2064retry:
2065 if (unlikely(!__sdma_running(sde)))
2066 goto unlock_noconn;
2067 if (unlikely(tx->num_desc > sde->desc_avail))
2068 goto nodesc;
2069 tail = submit_tx(sde, tx);
2070 if (wait)
2071 atomic_inc(&wait->sdma_busy);
2072 sdma_update_tail(sde, tail);
2073unlock:
2074 spin_unlock_irqrestore(&sde->tail_lock, flags);
2075 return ret;
2076unlock_noconn:
2077 if (wait)
2078 atomic_inc(&wait->sdma_busy);
2079 tx->next_descq_idx = 0;
2080#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2081 tx->sn = sde->tail_sn++;
2082 trace_hfi1_sdma_in_sn(sde, tx->sn);
2083#endif
2084 spin_lock(&sde->flushlist_lock);
2085 list_add_tail(&tx->list, &sde->flushlist);
2086 spin_unlock(&sde->flushlist_lock);
2087 if (wait) {
2088 wait->tx_count++;
2089 wait->count += tx->num_desc;
2090 }
2091 schedule_work(&sde->flush_worker);
2092 ret = -ECOMM;
2093 goto unlock;
2094nodesc:
2095 ret = sdma_check_progress(sde, wait, tx);
2096 if (ret == -EAGAIN) {
2097 ret = 0;
2098 goto retry;
2099 }
2100 sde->descq_full_count++;
2101 goto unlock;
2102}
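#if 0
/*
 * Editor's illustrative sketch (not part of the driver): a minimal caller,
 * assuming 'tx' was fully built elsewhere (tx->tlen == 0) and that 'wait'
 * is an initialized iowait (or NULL).  Function and variable names here
 * are hypothetical.
 */
static int example_send_one(struct sdma_engine *sde, struct iowait *wait,
			    struct sdma_txreq *tx)
{
	int ret = sdma_send_txreq(sde, wait, tx);

	switch (ret) {
	case 0:			/* on the ring; ->complete() fires later */
	case -EIOCBQUEUED:	/* ring full; parked on 'wait' for resubmit */
		return 0;
	case -ECOMM:		/* engine not running; tx will be flushed */
	case -EBUSY:		/* ring full and no iowait; caller retries */
	case -EINVAL:		/* tx->tlen != 0: packet not fully built */
	default:
		return ret;
	}
}
#endif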
2103
2104/**
2105 * sdma_send_txlist() - submit a list of tx req to ring
2106 * @sde: sdma engine to use
2107 * @wait: wait structure to use when full (may be NULL)
2108 * @tx_list: list of sdma_txreqs to submit
2109 *
2110 * The call submits the list into the ring.
2111 *
2112 * If the iowait structure is non-NULL and not equal to the iowait list,
2113 * the unprocessed part of the list will be appended to the list in wait.
2114 *
2115 * In all cases, the tx_list will be updated so the head of the tx_list is
2116 * the list of descriptors that have yet to be transmitted.
2117 *
2118 * The intent of this call is to provide a more efficient
2119 * way of submitting multiple packets to SDMA while holding the tail
2120 * side lock.
2121 *
2122 * Return:
2123 * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring
2124 * (wait == NULL)
2125 * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
2126 */
2127int sdma_send_txlist(struct sdma_engine *sde,
2128 struct iowait *wait,
2129 struct list_head *tx_list)
2130{
2131 struct sdma_txreq *tx, *tx_next;
2132 int ret = 0;
2133 unsigned long flags;
2134 u16 tail = INVALID_TAIL;
2135 int count = 0;
2136
2137 spin_lock_irqsave(&sde->tail_lock, flags);
2138retry:
2139 list_for_each_entry_safe(tx, tx_next, tx_list, list) {
2140 tx->wait = wait;
2141 if (unlikely(!__sdma_running(sde)))
2142 goto unlock_noconn;
2143 if (unlikely(tx->num_desc > sde->desc_avail))
2144 goto nodesc;
2145 if (unlikely(tx->tlen)) {
2146 ret = -EINVAL;
2147 goto update_tail;
2148 }
2149 list_del_init(&tx->list);
2150 tail = submit_tx(sde, tx);
2151 count++;
2152 if (tail != INVALID_TAIL &&
2153 (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
2154 sdma_update_tail(sde, tail);
2155 tail = INVALID_TAIL;
2156 }
2157 }
2158update_tail:
2159 if (wait)
2160 atomic_add(count, &wait->sdma_busy);
2161 if (tail != INVALID_TAIL)
2162 sdma_update_tail(sde, tail);
2163 spin_unlock_irqrestore(&sde->tail_lock, flags);
2164 return ret;
2165unlock_noconn:
2166 spin_lock(&sde->flushlist_lock);
2167 list_for_each_entry_safe(tx, tx_next, tx_list, list) {
2168 tx->wait = wait;
2169 list_del_init(&tx->list);
2170 if (wait)
2171 atomic_inc(&wait->sdma_busy);
2172 tx->next_descq_idx = 0;
2173#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2174 tx->sn = sde->tail_sn++;
2175 trace_hfi1_sdma_in_sn(sde, tx->sn);
2176#endif
2177 list_add_tail(&tx->list, &sde->flushlist);
2178 if (wait) {
2179 wait->tx_count++;
2180 wait->count += tx->num_desc;
2181 }
2182 }
2183 spin_unlock(&sde->flushlist_lock);
2184 schedule_work(&sde->flush_worker);
2185 ret = -ECOMM;
2186 goto update_tail;
2187nodesc:
2188 ret = sdma_check_progress(sde, wait, tx);
2189 if (ret == -EAGAIN) {
2190 ret = 0;
2191 goto retry;
2192 }
2193 sde->descq_full_count++;
2194 goto update_tail;
2195}
2196
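As a hedged illustration of the batching described in the kernel-doc above (the names tx1, tx2, sde and wait are hypothetical, and error handling is elided), a caller might collect already-built txreqs on a local list and submit them in one locked pass:

	LIST_HEAD(txlist);
	int ret;

	/* each tx has already gone through sdma_txinit() and sdma_txadd_*() */
	list_add_tail(&tx1->list, &txlist);
	list_add_tail(&tx2->list, &txlist);

	ret = sdma_send_txlist(sde, wait, &txlist);
	/* per the kernel-doc, anything left at the head of txlist was not submitted */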
2197static void sdma_process_event(struct sdma_engine *sde,
2198 enum sdma_events event)
2199{
2200 unsigned long flags;
2201
2202 spin_lock_irqsave(&sde->tail_lock, flags);
2203 write_seqlock(&sde->head_lock);
2204
2205 __sdma_process_event(sde, event);
2206
2207 if (sde->state.current_state == sdma_state_s99_running)
2208 sdma_desc_avail(sde, sdma_descq_freecnt(sde));
2209
2210 write_sequnlock(&sde->head_lock);
2211 spin_unlock_irqrestore(&sde->tail_lock, flags);
2212}
2213
2214static void __sdma_process_event(struct sdma_engine *sde,
2215 enum sdma_events event)
2216{
2217 struct sdma_state *ss = &sde->state;
2218 int need_progress = 0;
2219
2220 /* CONFIG SDMA temporary */
2221#ifdef CONFIG_SDMA_VERBOSITY
2222 dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
2223 sdma_state_names[ss->current_state],
2224 sdma_event_names[event]);
2225#endif
2226
2227 switch (ss->current_state) {
2228 case sdma_state_s00_hw_down:
2229 switch (event) {
2230 case sdma_event_e00_go_hw_down:
2231 break;
2232 case sdma_event_e30_go_running:
2233 /*
2234			 * If down, but running requested (usually a result
2235			 * of a link up), then we need to start up.
2236			 * This can happen when hw down is requested while
2237			 * bringing the link up with traffic active on
2238			 * a 7220, for example. */
2239 ss->go_s99_running = 1;
2240 /* fall through and start dma engine */
2241 case sdma_event_e10_go_hw_start:
2242 /* This reference means the state machine is started */
2243 sdma_get(&sde->state);
2244 sdma_set_state(sde,
2245 sdma_state_s10_hw_start_up_halt_wait);
2246 break;
2247 case sdma_event_e15_hw_halt_done:
2248 break;
2249 case sdma_event_e25_hw_clean_up_done:
2250 break;
2251 case sdma_event_e40_sw_cleaned:
2252 sdma_sw_tear_down(sde);
2253 break;
2254 case sdma_event_e50_hw_cleaned:
2255 break;
2256 case sdma_event_e60_hw_halted:
2257 break;
2258 case sdma_event_e70_go_idle:
2259 break;
2260 case sdma_event_e80_hw_freeze:
2261 break;
2262 case sdma_event_e81_hw_frozen:
2263 break;
2264 case sdma_event_e82_hw_unfreeze:
2265 break;
2266 case sdma_event_e85_link_down:
2267 break;
2268 case sdma_event_e90_sw_halted:
2269 break;
2270 }
2271 break;
2272
2273 case sdma_state_s10_hw_start_up_halt_wait:
2274 switch (event) {
2275 case sdma_event_e00_go_hw_down:
2276 sdma_set_state(sde, sdma_state_s00_hw_down);
2277 sdma_sw_tear_down(sde);
2278 break;
2279 case sdma_event_e10_go_hw_start:
2280 break;
2281 case sdma_event_e15_hw_halt_done:
2282 sdma_set_state(sde,
2283 sdma_state_s15_hw_start_up_clean_wait);
2284 sdma_start_hw_clean_up(sde);
2285 break;
2286 case sdma_event_e25_hw_clean_up_done:
2287 break;
2288 case sdma_event_e30_go_running:
2289 ss->go_s99_running = 1;
2290 break;
2291 case sdma_event_e40_sw_cleaned:
2292 break;
2293 case sdma_event_e50_hw_cleaned:
2294 break;
2295 case sdma_event_e60_hw_halted:
2296 sdma_start_err_halt_wait(sde);
2297 break;
2298 case sdma_event_e70_go_idle:
2299 ss->go_s99_running = 0;
2300 break;
2301 case sdma_event_e80_hw_freeze:
2302 break;
2303 case sdma_event_e81_hw_frozen:
2304 break;
2305 case sdma_event_e82_hw_unfreeze:
2306 break;
2307 case sdma_event_e85_link_down:
2308 break;
2309 case sdma_event_e90_sw_halted:
2310 break;
2311 }
2312 break;
2313
2314 case sdma_state_s15_hw_start_up_clean_wait:
2315 switch (event) {
2316 case sdma_event_e00_go_hw_down:
2317 sdma_set_state(sde, sdma_state_s00_hw_down);
2318 sdma_sw_tear_down(sde);
2319 break;
2320 case sdma_event_e10_go_hw_start:
2321 break;
2322 case sdma_event_e15_hw_halt_done:
2323 break;
2324 case sdma_event_e25_hw_clean_up_done:
2325 sdma_hw_start_up(sde);
2326 sdma_set_state(sde, ss->go_s99_running ?
2327 sdma_state_s99_running :
2328 sdma_state_s20_idle);
2329 break;
2330 case sdma_event_e30_go_running:
2331 ss->go_s99_running = 1;
2332 break;
2333 case sdma_event_e40_sw_cleaned:
2334 break;
2335 case sdma_event_e50_hw_cleaned:
2336 break;
2337 case sdma_event_e60_hw_halted:
2338 break;
2339 case sdma_event_e70_go_idle:
2340 ss->go_s99_running = 0;
2341 break;
2342 case sdma_event_e80_hw_freeze:
2343 break;
2344 case sdma_event_e81_hw_frozen:
2345 break;
2346 case sdma_event_e82_hw_unfreeze:
2347 break;
2348 case sdma_event_e85_link_down:
2349 break;
2350 case sdma_event_e90_sw_halted:
2351 break;
2352 }
2353 break;
2354
2355 case sdma_state_s20_idle:
2356 switch (event) {
2357 case sdma_event_e00_go_hw_down:
2358 sdma_set_state(sde, sdma_state_s00_hw_down);
2359 sdma_sw_tear_down(sde);
2360 break;
2361 case sdma_event_e10_go_hw_start:
2362 break;
2363 case sdma_event_e15_hw_halt_done:
2364 break;
2365 case sdma_event_e25_hw_clean_up_done:
2366 break;
2367 case sdma_event_e30_go_running:
2368 sdma_set_state(sde, sdma_state_s99_running);
2369 ss->go_s99_running = 1;
2370 break;
2371 case sdma_event_e40_sw_cleaned:
2372 break;
2373 case sdma_event_e50_hw_cleaned:
2374 break;
2375 case sdma_event_e60_hw_halted:
2376 sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
2377 sdma_start_err_halt_wait(sde);
2378 break;
2379 case sdma_event_e70_go_idle:
2380 break;
2381 case sdma_event_e85_link_down:
2382 /* fall through */
2383 case sdma_event_e80_hw_freeze:
2384 sdma_set_state(sde, sdma_state_s80_hw_freeze);
2385 atomic_dec(&sde->dd->sdma_unfreeze_count);
2386 wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2387 break;
2388 case sdma_event_e81_hw_frozen:
2389 break;
2390 case sdma_event_e82_hw_unfreeze:
2391 break;
2392 case sdma_event_e90_sw_halted:
2393 break;
2394 }
2395 break;
2396
2397 case sdma_state_s30_sw_clean_up_wait:
2398 switch (event) {
2399 case sdma_event_e00_go_hw_down:
2400 sdma_set_state(sde, sdma_state_s00_hw_down);
2401 break;
2402 case sdma_event_e10_go_hw_start:
2403 break;
2404 case sdma_event_e15_hw_halt_done:
2405 break;
2406 case sdma_event_e25_hw_clean_up_done:
2407 break;
2408 case sdma_event_e30_go_running:
2409 ss->go_s99_running = 1;
2410 break;
2411 case sdma_event_e40_sw_cleaned:
2412 sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
2413 sdma_start_hw_clean_up(sde);
2414 break;
2415 case sdma_event_e50_hw_cleaned:
2416 break;
2417 case sdma_event_e60_hw_halted:
2418 break;
2419 case sdma_event_e70_go_idle:
2420 ss->go_s99_running = 0;
2421 break;
2422 case sdma_event_e80_hw_freeze:
2423 break;
2424 case sdma_event_e81_hw_frozen:
2425 break;
2426 case sdma_event_e82_hw_unfreeze:
2427 break;
2428 case sdma_event_e85_link_down:
2429 ss->go_s99_running = 0;
2430 break;
2431 case sdma_event_e90_sw_halted:
2432 break;
2433 }
2434 break;
2435
2436 case sdma_state_s40_hw_clean_up_wait:
2437 switch (event) {
2438 case sdma_event_e00_go_hw_down:
2439 sdma_set_state(sde, sdma_state_s00_hw_down);
2440 sdma_start_sw_clean_up(sde);
2441 break;
2442 case sdma_event_e10_go_hw_start:
2443 break;
2444 case sdma_event_e15_hw_halt_done:
2445 break;
2446 case sdma_event_e25_hw_clean_up_done:
2447 sdma_hw_start_up(sde);
2448 sdma_set_state(sde, ss->go_s99_running ?
2449 sdma_state_s99_running :
2450 sdma_state_s20_idle);
2451 break;
2452 case sdma_event_e30_go_running:
2453 ss->go_s99_running = 1;
2454 break;
2455 case sdma_event_e40_sw_cleaned:
2456 break;
2457 case sdma_event_e50_hw_cleaned:
2458 break;
2459 case sdma_event_e60_hw_halted:
2460 break;
2461 case sdma_event_e70_go_idle:
2462 ss->go_s99_running = 0;
2463 break;
2464 case sdma_event_e80_hw_freeze:
2465 break;
2466 case sdma_event_e81_hw_frozen:
2467 break;
2468 case sdma_event_e82_hw_unfreeze:
2469 break;
2470 case sdma_event_e85_link_down:
2471 ss->go_s99_running = 0;
2472 break;
2473 case sdma_event_e90_sw_halted:
2474 break;
2475 }
2476 break;
2477
2478 case sdma_state_s50_hw_halt_wait:
2479 switch (event) {
2480 case sdma_event_e00_go_hw_down:
2481 sdma_set_state(sde, sdma_state_s00_hw_down);
2482 sdma_start_sw_clean_up(sde);
2483 break;
2484 case sdma_event_e10_go_hw_start:
2485 break;
2486 case sdma_event_e15_hw_halt_done:
2487 sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
2488 sdma_start_sw_clean_up(sde);
2489 break;
2490 case sdma_event_e25_hw_clean_up_done:
2491 break;
2492 case sdma_event_e30_go_running:
2493 ss->go_s99_running = 1;
2494 break;
2495 case sdma_event_e40_sw_cleaned:
2496 break;
2497 case sdma_event_e50_hw_cleaned:
2498 break;
2499 case sdma_event_e60_hw_halted:
2500 sdma_start_err_halt_wait(sde);
2501 break;
2502 case sdma_event_e70_go_idle:
2503 ss->go_s99_running = 0;
2504 break;
2505 case sdma_event_e80_hw_freeze:
2506 break;
2507 case sdma_event_e81_hw_frozen:
2508 break;
2509 case sdma_event_e82_hw_unfreeze:
2510 break;
2511 case sdma_event_e85_link_down:
2512 ss->go_s99_running = 0;
2513 break;
2514 case sdma_event_e90_sw_halted:
2515 break;
2516 }
2517 break;
2518
2519 case sdma_state_s60_idle_halt_wait:
2520 switch (event) {
2521 case sdma_event_e00_go_hw_down:
2522 sdma_set_state(sde, sdma_state_s00_hw_down);
2523 sdma_start_sw_clean_up(sde);
2524 break;
2525 case sdma_event_e10_go_hw_start:
2526 break;
2527 case sdma_event_e15_hw_halt_done:
2528 sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
2529 sdma_start_sw_clean_up(sde);
2530 break;
2531 case sdma_event_e25_hw_clean_up_done:
2532 break;
2533 case sdma_event_e30_go_running:
2534 ss->go_s99_running = 1;
2535 break;
2536 case sdma_event_e40_sw_cleaned:
2537 break;
2538 case sdma_event_e50_hw_cleaned:
2539 break;
2540 case sdma_event_e60_hw_halted:
2541 sdma_start_err_halt_wait(sde);
2542 break;
2543 case sdma_event_e70_go_idle:
2544 ss->go_s99_running = 0;
2545 break;
2546 case sdma_event_e80_hw_freeze:
2547 break;
2548 case sdma_event_e81_hw_frozen:
2549 break;
2550 case sdma_event_e82_hw_unfreeze:
2551 break;
2552 case sdma_event_e85_link_down:
2553 break;
2554 case sdma_event_e90_sw_halted:
2555 break;
2556 }
2557 break;
2558
2559 case sdma_state_s80_hw_freeze:
2560 switch (event) {
2561 case sdma_event_e00_go_hw_down:
2562 sdma_set_state(sde, sdma_state_s00_hw_down);
2563 sdma_start_sw_clean_up(sde);
2564 break;
2565 case sdma_event_e10_go_hw_start:
2566 break;
2567 case sdma_event_e15_hw_halt_done:
2568 break;
2569 case sdma_event_e25_hw_clean_up_done:
2570 break;
2571 case sdma_event_e30_go_running:
2572 ss->go_s99_running = 1;
2573 break;
2574 case sdma_event_e40_sw_cleaned:
2575 break;
2576 case sdma_event_e50_hw_cleaned:
2577 break;
2578 case sdma_event_e60_hw_halted:
2579 break;
2580 case sdma_event_e70_go_idle:
2581 ss->go_s99_running = 0;
2582 break;
2583 case sdma_event_e80_hw_freeze:
2584 break;
2585 case sdma_event_e81_hw_frozen:
2586 sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
2587 sdma_start_sw_clean_up(sde);
2588 break;
2589 case sdma_event_e82_hw_unfreeze:
2590 break;
2591 case sdma_event_e85_link_down:
2592 break;
2593 case sdma_event_e90_sw_halted:
2594 break;
2595 }
2596 break;
2597
2598 case sdma_state_s82_freeze_sw_clean:
2599 switch (event) {
2600 case sdma_event_e00_go_hw_down:
2601 sdma_set_state(sde, sdma_state_s00_hw_down);
2602 sdma_start_sw_clean_up(sde);
2603 break;
2604 case sdma_event_e10_go_hw_start:
2605 break;
2606 case sdma_event_e15_hw_halt_done:
2607 break;
2608 case sdma_event_e25_hw_clean_up_done:
2609 break;
2610 case sdma_event_e30_go_running:
2611 ss->go_s99_running = 1;
2612 break;
2613 case sdma_event_e40_sw_cleaned:
2614 /* notify caller this engine is done cleaning */
2615 atomic_dec(&sde->dd->sdma_unfreeze_count);
2616 wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2617 break;
2618 case sdma_event_e50_hw_cleaned:
2619 break;
2620 case sdma_event_e60_hw_halted:
2621 break;
2622 case sdma_event_e70_go_idle:
2623 ss->go_s99_running = 0;
2624 break;
2625 case sdma_event_e80_hw_freeze:
2626 break;
2627 case sdma_event_e81_hw_frozen:
2628 break;
2629 case sdma_event_e82_hw_unfreeze:
2630 sdma_hw_start_up(sde);
2631 sdma_set_state(sde, ss->go_s99_running ?
2632 sdma_state_s99_running :
2633 sdma_state_s20_idle);
2634 break;
2635 case sdma_event_e85_link_down:
2636 break;
2637 case sdma_event_e90_sw_halted:
2638 break;
2639 }
2640 break;
2641
2642 case sdma_state_s99_running:
2643 switch (event) {
2644 case sdma_event_e00_go_hw_down:
2645 sdma_set_state(sde, sdma_state_s00_hw_down);
2646 sdma_start_sw_clean_up(sde);
2647 break;
2648 case sdma_event_e10_go_hw_start:
2649 break;
2650 case sdma_event_e15_hw_halt_done:
2651 break;
2652 case sdma_event_e25_hw_clean_up_done:
2653 break;
2654 case sdma_event_e30_go_running:
2655 break;
2656 case sdma_event_e40_sw_cleaned:
2657 break;
2658 case sdma_event_e50_hw_cleaned:
2659 break;
2660 case sdma_event_e60_hw_halted:
2661 need_progress = 1;
2662 sdma_err_progress_check_schedule(sde);
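			/* fall through */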
2663 case sdma_event_e90_sw_halted:
2664 /*
2665 * SW initiated halt does not perform engines
2666 * progress check
2667 */
2668 sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
2669 sdma_start_err_halt_wait(sde);
2670 break;
2671 case sdma_event_e70_go_idle:
2672 sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
2673 break;
2674 case sdma_event_e85_link_down:
2675 ss->go_s99_running = 0;
2676 /* fall through */
2677 case sdma_event_e80_hw_freeze:
2678 sdma_set_state(sde, sdma_state_s80_hw_freeze);
2679 atomic_dec(&sde->dd->sdma_unfreeze_count);
2680 wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2681 break;
2682 case sdma_event_e81_hw_frozen:
2683 break;
2684 case sdma_event_e82_hw_unfreeze:
2685 break;
2686 }
2687 break;
2688 }
2689
2690 ss->last_event = event;
2691 if (need_progress)
2692 sdma_make_progress(sde, 0);
2693}
2694
2695/*
2696 * _extend_sdma_tx_descs() - helper to extend txreq
2697 *
2698 * This is called once the initial nominal allocation
2699 * of descriptors in the sdma_txreq is exhausted.
2700 *
2701 * The code will bump the allocation up to the max
2702 * of MAX_DESC (64) descriptors. There doesn't seem
2703 * to be much point in an interim step.
2704 *
2705 */
2706int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
2707{
2708 int i;
2709
2710 tx->descp = kmalloc_array(
2711 MAX_DESC,
2712 sizeof(struct sdma_desc),
2713 GFP_ATOMIC);
2714 if (!tx->descp)
2715 return -ENOMEM;
2716 tx->desc_limit = MAX_DESC;
2717 /* copy ones already built */
2718 for (i = 0; i < tx->num_desc; i++)
2719 tx->descp[i] = tx->descs[i];
2720 return 0;
2721}
2722
2723/* Update sdes when the lmc changes */
2724void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
2725{
2726 struct sdma_engine *sde;
2727 int i;
2728 u64 sreg;
2729
2730 sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
2731 SD(CHECK_SLID_MASK_SHIFT)) |
2732 (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
2733 SD(CHECK_SLID_VALUE_SHIFT));
2734
2735 for (i = 0; i < dd->num_sdma; i++) {
2736 hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
2737 i, (u32)sreg);
2738 sde = &dd->per_sdma[i];
2739 write_sde_csr(sde, SD(CHECK_SLID), sreg);
2740 }
2741}
2742
2743/* tx not dword sized - pad */
2744int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
2745{
2746 int rval = 0;
2747
2748 if ((unlikely(tx->num_desc == tx->desc_limit))) {
2749 rval = _extend_sdma_tx_descs(dd, tx);
2750 if (rval)
2751 return rval;
2752 }
2753 /* finish the one just added */
2754 tx->num_desc++;
2755 make_tx_sdma_desc(
2756 tx,
2757 SDMA_MAP_NONE,
2758 dd->sdma_pad_phys,
2759 sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
2760 _sdma_close_tx(dd, tx);
2761 return rval;
2762}
2763
2764/*
2765 * Add ahg to the sdma_txreq
2766 *
2767 * The logic will consume up to 3
2768 * descriptors at the beginning of
2769 * sdma_txreq.
2770 */
2771void _sdma_txreq_ahgadd(
2772 struct sdma_txreq *tx,
2773 u8 num_ahg,
2774 u8 ahg_entry,
2775 u32 *ahg,
2776 u8 ahg_hlen)
2777{
2778 u32 i, shift = 0, desc = 0;
2779 u8 mode;
2780
2781 WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
2782 /* compute mode */
2783 if (num_ahg == 1)
2784 mode = SDMA_AHG_APPLY_UPDATE1;
2785 else if (num_ahg <= 5)
2786 mode = SDMA_AHG_APPLY_UPDATE2;
2787 else
2788 mode = SDMA_AHG_APPLY_UPDATE3;
2789 tx->num_desc++;
2790	/* initialize the consumed descriptors to zero */
2791 switch (mode) {
2792 case SDMA_AHG_APPLY_UPDATE3:
2793 tx->num_desc++;
2794 tx->descs[2].qw[0] = 0;
2795 tx->descs[2].qw[1] = 0;
2796 /* FALLTHROUGH */
2797 case SDMA_AHG_APPLY_UPDATE2:
2798 tx->num_desc++;
2799 tx->descs[1].qw[0] = 0;
2800 tx->descs[1].qw[1] = 0;
2801 break;
2802 }
2803 ahg_hlen >>= 2;
2804 tx->descs[0].qw[1] |=
2805 (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
2806 << SDMA_DESC1_HEADER_INDEX_SHIFT) |
2807 (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
2808 << SDMA_DESC1_HEADER_DWS_SHIFT) |
2809 (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
2810 << SDMA_DESC1_HEADER_MODE_SHIFT) |
2811 (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
2812 << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
2813 for (i = 0; i < (num_ahg - 1); i++) {
2814 if (!shift && !(i & 2))
2815 desc++;
2816 tx->descs[desc].qw[!!(i & 2)] |=
2817 (((u64)ahg[i + 1])
2818 << shift);
2819 shift = (shift + 32) & 63;
2820 }
2821}
2822
2823/**
2824 * sdma_ahg_alloc - allocate an AHG entry
2825 * @sde: engine to allocate from
2826 *
2827 * Return:
2828 * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
2829 * -ENOSPC if an entry is not available
2830 */
2831int sdma_ahg_alloc(struct sdma_engine *sde)
2832{
2833 int nr;
2834 int oldbit;
2835
2836 if (!sde) {
2837 trace_hfi1_ahg_allocate(sde, -EINVAL);
2838 return -EINVAL;
2839 }
2840 while (1) {
2841 nr = ffz(ACCESS_ONCE(sde->ahg_bits));
2842 if (nr > 31) {
2843 trace_hfi1_ahg_allocate(sde, -ENOSPC);
2844 return -ENOSPC;
2845 }
2846 oldbit = test_and_set_bit(nr, &sde->ahg_bits);
2847 if (!oldbit)
2848 break;
2849 cpu_relax();
2850 }
2851 trace_hfi1_ahg_allocate(sde, nr);
2852 return nr;
2853}
2854
2855/**
2856 * sdma_ahg_free - free an AHG entry
2857 * @sde: engine to return AHG entry
2858 * @ahg_index: index to free
2859 *
2860 * This routine frees the indicated AHG entry.
2861 */
2862void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
2863{
2864 if (!sde)
2865 return;
2866 trace_hfi1_ahg_deallocate(sde, ahg_index);
2867 if (ahg_index < 0 || ahg_index > 31)
2868 return;
2869 clear_bit(ahg_index, &sde->ahg_bits);
2870}
2871
2872/*
2873 * SPC freeze handling for SDMA engines. Called when the driver knows
2874 * the SPC is going into a freeze but before the freeze is fully
2875 * settled. Generally an error interrupt.
2876 *
2877 * This event will pull the engine out of running so no more entries can be
2878 * added to the engine's queue.
2879 */
2880void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
2881{
2882 int i;
2883 enum sdma_events event = link_down ? sdma_event_e85_link_down :
2884 sdma_event_e80_hw_freeze;
2885
2886 /* set up the wait but do not wait here */
2887 atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
2888
2889 /* tell all engines to stop running and wait */
2890 for (i = 0; i < dd->num_sdma; i++)
2891 sdma_process_event(&dd->per_sdma[i], event);
2892
2893 /* sdma_freeze() will wait for all engines to have stopped */
2894}
2895
2896/*
2897 * SPC freeze handling for SDMA engines. Called when the driver knows
2898 * the SPC is fully frozen.
2899 */
2900void sdma_freeze(struct hfi1_devdata *dd)
2901{
2902 int i;
2903 int ret;
2904
2905 /*
2906 * Make sure all engines have moved out of the running state before
2907 * continuing.
2908 */
2909 ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
2910 atomic_read(&dd->sdma_unfreeze_count) <= 0);
2911 /* interrupted or count is negative, then unloading - just exit */
2912 if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
2913 return;
2914
2915 /* set up the count for the next wait */
2916 atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
2917
2918 /* tell all engines that the SPC is frozen, they can start cleaning */
2919 for (i = 0; i < dd->num_sdma; i++)
2920 sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
2921
2922 /*
2923 * Wait for everyone to finish software clean before exiting. The
2924 * software clean will read engine CSRs, so must be completed before
2925 * the next step, which will clear the engine CSRs.
2926 */
2927 (void) wait_event_interruptible(dd->sdma_unfreeze_wq,
2928 atomic_read(&dd->sdma_unfreeze_count) <= 0);
2929 /* no need to check results - done no matter what */
2930}
2931
2932/*
2933 * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen.
2934 *
2935 * The SPC freeze acts like a SDMA halt and a hardware clean combined. All
2936 * that is left is a software clean. We could do it after the SPC is fully
2937 * frozen, but then we'd have to add another state to wait for the unfreeze.
2938 * Instead, just defer the software clean until the unfreeze step.
2939 */
2940void sdma_unfreeze(struct hfi1_devdata *dd)
2941{
2942 int i;
2943
2944	/* tell all engines to start freeze clean up */
2945 for (i = 0; i < dd->num_sdma; i++)
2946 sdma_process_event(&dd->per_sdma[i],
2947 sdma_event_e82_hw_unfreeze);
2948}
2949
2950/**
2951 * _sdma_engine_progress_schedule() - schedule progress on engine
2952 * @sde: sdma_engine to schedule progress
2953 *
2954 */
2955void _sdma_engine_progress_schedule(
2956 struct sdma_engine *sde)
2957{
2958 trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
2959 /* assume we have selected a good cpu */
2960 write_csr(sde->dd,
2961 CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask);
2962}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
new file mode 100644
index 000000000000..1e613fcd8f4c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.h
@@ -0,0 +1,1123 @@
1#ifndef _HFI1_SDMA_H
2#define _HFI1_SDMA_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53#include <linux/types.h>
54#include <linux/list.h>
55#include <asm/byteorder.h>
56#include <linux/workqueue.h>
57#include <linux/rculist.h>
58
59#include "hfi.h"
60#include "verbs.h"
61
62/* increased for AHG */
63#define NUM_DESC 6
64/* Hardware limit */
65#define MAX_DESC 64
66/* Hardware limit for SDMA packet size */
67#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
68
69
70#define SDMA_TXREQ_S_OK 0
71#define SDMA_TXREQ_S_SENDERROR 1
72#define SDMA_TXREQ_S_ABORTED 2
73#define SDMA_TXREQ_S_SHUTDOWN 3
74
75/* flags bits */
76#define SDMA_TXREQ_F_URGENT 0x0001
77#define SDMA_TXREQ_F_AHG_COPY 0x0002
78#define SDMA_TXREQ_F_USE_AHG 0x0004
79
80#define SDMA_MAP_NONE 0
81#define SDMA_MAP_SINGLE 1
82#define SDMA_MAP_PAGE 2
83
84#define SDMA_AHG_VALUE_MASK 0xffff
85#define SDMA_AHG_VALUE_SHIFT 0
86#define SDMA_AHG_INDEX_MASK 0xf
87#define SDMA_AHG_INDEX_SHIFT 16
88#define SDMA_AHG_FIELD_LEN_MASK 0xf
89#define SDMA_AHG_FIELD_LEN_SHIFT 20
90#define SDMA_AHG_FIELD_START_MASK 0x1f
91#define SDMA_AHG_FIELD_START_SHIFT 24
92#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1
93#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
94
95/* AHG modes */
96
97/*
98 * Be aware the ordering and values
99 * for SDMA_AHG_APPLY_UPDATE[123]
100 * are assumed in generating a skip
101 * count in submit_tx() in sdma.c
102 */
103#define SDMA_AHG_NO_AHG 0
104#define SDMA_AHG_COPY 1
105#define SDMA_AHG_APPLY_UPDATE1 2
106#define SDMA_AHG_APPLY_UPDATE2 3
107#define SDMA_AHG_APPLY_UPDATE3 4
108
109/*
110 * Bits defined in the send DMA descriptor.
111 */
112#define SDMA_DESC0_FIRST_DESC_FLAG (1ULL<<63)
113#define SDMA_DESC0_LAST_DESC_FLAG (1ULL<<62)
114#define SDMA_DESC0_BYTE_COUNT_SHIFT 48
115#define SDMA_DESC0_BYTE_COUNT_WIDTH 14
116#define SDMA_DESC0_BYTE_COUNT_MASK \
117 ((1ULL<<SDMA_DESC0_BYTE_COUNT_WIDTH)-1ULL)
118#define SDMA_DESC0_BYTE_COUNT_SMASK \
119 (SDMA_DESC0_BYTE_COUNT_MASK<<SDMA_DESC0_BYTE_COUNT_SHIFT)
120#define SDMA_DESC0_PHY_ADDR_SHIFT 0
121#define SDMA_DESC0_PHY_ADDR_WIDTH 48
122#define SDMA_DESC0_PHY_ADDR_MASK \
123 ((1ULL<<SDMA_DESC0_PHY_ADDR_WIDTH)-1ULL)
124#define SDMA_DESC0_PHY_ADDR_SMASK \
125 (SDMA_DESC0_PHY_ADDR_MASK<<SDMA_DESC0_PHY_ADDR_SHIFT)
126
127#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
128#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
129#define SDMA_DESC1_HEADER_UPDATE1_MASK \
130 ((1ULL<<SDMA_DESC1_HEADER_UPDATE1_WIDTH)-1ULL)
131#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
132 (SDMA_DESC1_HEADER_UPDATE1_MASK<<SDMA_DESC1_HEADER_UPDATE1_SHIFT)
133#define SDMA_DESC1_HEADER_MODE_SHIFT 13
134#define SDMA_DESC1_HEADER_MODE_WIDTH 3
135#define SDMA_DESC1_HEADER_MODE_MASK \
136 ((1ULL<<SDMA_DESC1_HEADER_MODE_WIDTH)-1ULL)
137#define SDMA_DESC1_HEADER_MODE_SMASK \
138 (SDMA_DESC1_HEADER_MODE_MASK<<SDMA_DESC1_HEADER_MODE_SHIFT)
139#define SDMA_DESC1_HEADER_INDEX_SHIFT 8
140#define SDMA_DESC1_HEADER_INDEX_WIDTH 5
141#define SDMA_DESC1_HEADER_INDEX_MASK \
142 ((1ULL<<SDMA_DESC1_HEADER_INDEX_WIDTH)-1ULL)
143#define SDMA_DESC1_HEADER_INDEX_SMASK \
144 (SDMA_DESC1_HEADER_INDEX_MASK<<SDMA_DESC1_HEADER_INDEX_SHIFT)
145#define SDMA_DESC1_HEADER_DWS_SHIFT 4
146#define SDMA_DESC1_HEADER_DWS_WIDTH 4
147#define SDMA_DESC1_HEADER_DWS_MASK \
148 ((1ULL<<SDMA_DESC1_HEADER_DWS_WIDTH)-1ULL)
149#define SDMA_DESC1_HEADER_DWS_SMASK \
150 (SDMA_DESC1_HEADER_DWS_MASK<<SDMA_DESC1_HEADER_DWS_SHIFT)
151#define SDMA_DESC1_GENERATION_SHIFT 2
152#define SDMA_DESC1_GENERATION_WIDTH 2
153#define SDMA_DESC1_GENERATION_MASK \
154 ((1ULL<<SDMA_DESC1_GENERATION_WIDTH)-1ULL)
155#define SDMA_DESC1_GENERATION_SMASK \
156 (SDMA_DESC1_GENERATION_MASK<<SDMA_DESC1_GENERATION_SHIFT)
157#define SDMA_DESC1_INT_REQ_FLAG (1ULL<<1)
158#define SDMA_DESC1_HEAD_TO_HOST_FLAG (1ULL<<0)
159
160enum sdma_states {
161 sdma_state_s00_hw_down,
162 sdma_state_s10_hw_start_up_halt_wait,
163 sdma_state_s15_hw_start_up_clean_wait,
164 sdma_state_s20_idle,
165 sdma_state_s30_sw_clean_up_wait,
166 sdma_state_s40_hw_clean_up_wait,
167 sdma_state_s50_hw_halt_wait,
168 sdma_state_s60_idle_halt_wait,
169 sdma_state_s80_hw_freeze,
170 sdma_state_s82_freeze_sw_clean,
171 sdma_state_s99_running,
172};
173
174enum sdma_events {
175 sdma_event_e00_go_hw_down,
176 sdma_event_e10_go_hw_start,
177 sdma_event_e15_hw_halt_done,
178 sdma_event_e25_hw_clean_up_done,
179 sdma_event_e30_go_running,
180 sdma_event_e40_sw_cleaned,
181 sdma_event_e50_hw_cleaned,
182 sdma_event_e60_hw_halted,
183 sdma_event_e70_go_idle,
184 sdma_event_e80_hw_freeze,
185 sdma_event_e81_hw_frozen,
186 sdma_event_e82_hw_unfreeze,
187 sdma_event_e85_link_down,
188 sdma_event_e90_sw_halted,
189};
190
191struct sdma_set_state_action {
192 unsigned op_enable:1;
193 unsigned op_intenable:1;
194 unsigned op_halt:1;
195 unsigned op_cleanup:1;
196 unsigned go_s99_running_tofalse:1;
197 unsigned go_s99_running_totrue:1;
198};
199
200struct sdma_state {
201 struct kref kref;
202 struct completion comp;
203 enum sdma_states current_state;
204 unsigned current_op;
205 unsigned go_s99_running;
206 /* debugging/development */
207 enum sdma_states previous_state;
208 unsigned previous_op;
209 enum sdma_events last_event;
210};
211
212/**
213 * DOC: sdma exported routines
214 *
215 * These sdma routines fit into three categories:
216 * - The SDMA API for building and submitting packets
217 * to the ring
218 *
219 * - Initialization and tear down routines to buildup
220 * and tear down SDMA
221 *
222 * - ISR entrances to handle interrupts, state changes
223 * and errors
224 */
225
226/**
227 * DOC: sdma PSM/verbs API
228 *
229 * The sdma API is designed to be used by both PSM
230 * and verbs to supply packets to the SDMA ring.
231 *
232 * The usage of the API is as follows:
233 *
234 * Embed a struct iowait in the QP or
235 * PQ. The iowait should be initialized with a
236 * call to iowait_init().
237 *
238 * The user of the API should create an allocation method
239 * for their version of the txreq. Slabs, pre-allocated lists,
240 * and dma pools can be used. Once the user's overload of
241 * the sdma_txreq has been allocated, the sdma_txreq member
242 * must be initialized with sdma_txinit() or sdma_txinit_ahg().
243 *
244 * The user's txreq must be declared with the struct sdma_txreq as its first member.
245 *
246 * The tx request, once initialized, is manipulated with calls to
247 * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
248 * for each disjoint memory location. It is the user's responsibility
249 * to understand the packet boundaries and page boundaries to do the
250 * appropriate number of sdma_txadd_* calls. The user
251 * must be prepared to deal with failures from these routines,
252 * caused by either memory allocation or dma_mapping errors.
253 *
254 * The mapping specifics for each memory location are recorded
255 * in the tx. Memory locations added with sdma_txadd_page()
256 * and sdma_txadd_kvaddr() are automatically mapped when added
257 * to the tx and unmapped as part of the progress processing in the
258 * SDMA interrupt handling.
259 *
260 * sdma_txadd_daddr() is used to add a dma_addr_t memory location to the
261 * tx. An example of a use case would be a pre-allocated
262 * set of headers allocated via dma_pool_alloc() or
263 * dma_alloc_coherent(). For these memory locations, it
264 * is the responsibility of the user to handle that unmapping.
265 * (This would usually be at an unload or job termination.)
266 *
267 * The routine sdma_send_txreq() is used to submit
268 * a tx to the ring after the appropriate number of
269 * sdma_txadd_* have been done.
270 *
271 * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
272 * can be used to submit a list of packets.
273 *
274 * The user is free to use the link overhead in the struct sdma_txreq as
275 * long as the tx isn't in flight.
276 *
277 * The extreme degenerate case of the number of descriptors
278 * exceeding the ring size is automatically handled as
279 * memory locations are added. An overflow of the descriptor
280 * array that is part of the sdma_txreq is also automatically
281 * handled.
282 *
283 */
284
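To make the flow above concrete, the following is a minimal sketch of a user txreq that has the struct sdma_txreq as its first member, built from a single kernel buffer and handed to an engine. All names (example_*, hdr, hdrlen) are hypothetical, kmalloc()/GFP_ATOMIC is an arbitrary allocation choice, and handling of the -ECOMM/-EIOCBQUEUED/-EBUSY submit results is elided.

/* illustrative only: user overload with the sdma_txreq as the first member */
struct example_txreq {
	struct sdma_txreq txreq;
};

static void example_complete(struct sdma_txreq *stx, int status, int drained)
{
	/* may run from ISR/tasklet/thread context: no sleeping here */
	kfree(container_of(stx, struct example_txreq, txreq));
}

static int example_send(struct hfi1_devdata *dd, struct sdma_engine *sde,
			struct iowait *wait, void *hdr, u16 hdrlen)
{
	struct example_txreq *etx;
	int ret;

	etx = kmalloc(sizeof(*etx), GFP_ATOMIC);
	if (!etx)
		return -ENOMEM;
	/* tlen must cover every byte added below */
	ret = sdma_txinit(&etx->txreq, 0, hdrlen, example_complete);
	if (ret)
		goto bail;
	/* one kernel-virtual fragment; mapping/unmapping is handled by the API */
	ret = sdma_txadd_kvaddr(dd, &etx->txreq, hdr, hdrlen);
	if (ret)
		goto bail;
	return sdma_send_txreq(sde, wait, &etx->txreq);
bail:
	kfree(etx);
	return ret;
}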
285/**
286 * DOC: Infrastructure calls
287 *
288 * sdma_init() is used to initialize data structures and
289 * CSRs for the desired number of SDMA engines.
290 *
291 * sdma_start() is used to kick the SDMA engines initialized
292 * with sdma_init(). Interrupts must be enabled at this
293 * point since aspects of the state machine are interrupt
294 * driven.
295 *
296 * sdma_engine_error() and sdma_engine_interrupt() are
297 * entrances for interrupts.
298 *
299 * sdma_map_init() is for the management of the mapping
300 * table when the number of vls is changed.
301 *
302 */
303
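A minimal sketch of that ordering, assuming a probed struct hfi1_devdata *dd, a port number, and a num_vls value supplied by the caller, with interrupts already enabled before the start call (error paths and the surrounding chip init are elided):

	int ret;

	ret = sdma_init(dd, port);	/* build descriptor rings and engine state */
	if (ret)
		return ret;
	sdma_start(dd);			/* interrupt driven from this point on */

	/* later, if the number of active VLs changes: */
	ret = sdma_map_init(dd, port, num_vls, NULL);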
304/*
305 * struct hw_sdma_desc - raw 128 bit SDMA descriptor
306 *
307 * This is the raw descriptor in the SDMA ring
308 */
309struct hw_sdma_desc {
310 /* private: don't use directly */
311 __le64 qw[2];
312};
313
314/*
315 * struct sdma_desc - canonical fragment descriptor
316 *
317 * This is the descriptor carried in the tx request
318 * corresponding to each fragment.
319 *
320 */
321struct sdma_desc {
322 /* private: don't use directly */
323 u64 qw[2];
324};
325
326struct sdma_txreq;
327typedef void (*callback_t)(struct sdma_txreq *, int, int);
328
329/**
330 * struct sdma_txreq - the sdma_txreq structure (one per packet)
331 * @list: for use by user and by queuing for wait
332 *
333 * This is the representation of a packet which consists of some
334 * number of fragments. Storage is provided to within the structure.
335 * for all fragments.
336 *
337 * The storage for the descriptors are automatically extended as needed
338 * when the currently allocation is exceeded.
339 *
340 * The user (Verbs or PSM) may overload this structure with fields
341 * specific to their use by putting this struct first in their struct.
342 * The method of allocation of the overloaded structure is user dependent.
343 *
344 * The list is the only public field in the structure.
345 *
346 */
347
348struct sdma_txreq {
349 struct list_head list;
350 /* private: */
351 struct sdma_desc *descp;
352 /* private: */
353 void *coalesce_buf;
354 /* private: */
355 struct iowait *wait;
356 /* private: */
357 callback_t complete;
358#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
359 u64 sn;
360#endif
361 /* private: - used in coalesce/pad processing */
362 u16 packet_len;
363 /* private: - down-counted to trigger last */
364 u16 tlen;
365 /* private: flags */
366 u16 flags;
367 /* private: */
368 u16 num_desc;
369 /* private: */
370 u16 desc_limit;
371 /* private: */
372 u16 next_descq_idx;
373 /* private: */
374 struct sdma_desc descs[NUM_DESC];
375};
376
377struct verbs_txreq {
378 struct hfi1_pio_header phdr;
379 struct sdma_txreq txreq;
380 struct hfi1_qp *qp;
381 struct hfi1_swqe *wqe;
382 struct hfi1_mregion *mr;
383 struct hfi1_sge_state *ss;
384 struct sdma_engine *sde;
385 u16 hdr_dwords;
386 u16 hdr_inx;
387};
388
389/**
390 * struct sdma_engine - Data pertaining to each SDMA engine.
391 * @dd: a back-pointer to the device data
392 * @ppd: per port back-pointer
393 * @imask: mask for irq manipulation
394 * @idle_mask: mask for determining if an interrupt is due to sdma_idle
395 *
396 * This structure has the state for each sdma_engine.
397 *
398 * Accessing non-public fields is not supported
399 * since the private members are subject to change.
400 */
401struct sdma_engine {
402 /* read mostly */
403 struct hfi1_devdata *dd;
404 struct hfi1_pportdata *ppd;
405 /* private: */
406 void __iomem *tail_csr;
407 u64 imask; /* clear interrupt mask */
408 u64 idle_mask;
409 u64 progress_mask;
410 /* private: */
411 struct workqueue_struct *wq;
412 /* private: */
413 volatile __le64 *head_dma; /* DMA'ed by chip */
414 /* private: */
415 dma_addr_t head_phys;
416 /* private: */
417 struct hw_sdma_desc *descq;
418 /* private: */
419 unsigned descq_full_count;
420 struct sdma_txreq **tx_ring;
421 /* private: */
422 dma_addr_t descq_phys;
423 /* private */
424 u32 sdma_mask;
425 /* private */
426 struct sdma_state state;
427 /* private: */
428 u8 sdma_shift;
429 /* private: */
430 u8 this_idx; /* zero relative engine */
431 /* protect changes to senddmactrl shadow */
432 spinlock_t senddmactrl_lock;
433 /* private: */
434 u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */
435
436 /* read/write using tail_lock */
437 spinlock_t tail_lock ____cacheline_aligned_in_smp;
438#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
439 /* private: */
440 u64 tail_sn;
441#endif
442 /* private: */
443 u32 descq_tail;
444 /* private: */
445 unsigned long ahg_bits;
446 /* private: */
447 u16 desc_avail;
448 /* private: */
449 u16 tx_tail;
450 /* private: */
451 u16 descq_cnt;
452
453 /* read/write using head_lock */
454 /* private: */
455 seqlock_t head_lock ____cacheline_aligned_in_smp;
456#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
457 /* private: */
458 u64 head_sn;
459#endif
460 /* private: */
461 u32 descq_head;
462 /* private: */
463 u16 tx_head;
464 /* private: */
465 u64 last_status;
466
467 /* private: */
468 struct list_head dmawait;
469
470 /* CONFIG SDMA for now, just blindly duplicate */
471 /* private: */
472 struct tasklet_struct sdma_hw_clean_up_task
473 ____cacheline_aligned_in_smp;
474
475 /* private: */
476 struct tasklet_struct sdma_sw_clean_up_task
477 ____cacheline_aligned_in_smp;
478 /* private: */
479 struct work_struct err_halt_worker;
480 /* private */
481 struct timer_list err_progress_check_timer;
482 u32 progress_check_head;
483 /* private: */
484 struct work_struct flush_worker;
485 spinlock_t flushlist_lock;
486 /* private: */
487 struct list_head flushlist;
488};
489
490
491int sdma_init(struct hfi1_devdata *dd, u8 port);
492void sdma_start(struct hfi1_devdata *dd);
493void sdma_exit(struct hfi1_devdata *dd);
494void sdma_all_running(struct hfi1_devdata *dd);
495void sdma_all_idle(struct hfi1_devdata *dd);
496void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
497void sdma_freeze(struct hfi1_devdata *dd);
498void sdma_unfreeze(struct hfi1_devdata *dd);
499void sdma_wait(struct hfi1_devdata *dd);
500
501/**
502 * sdma_empty() - idle engine test
503 * @engine: sdma engine
504 *
505 * Currently used by verbs as a latency optimization.
506 *
507 * Return:
508 * 1 - empty, 0 - non-empty
509 */
510static inline int sdma_empty(struct sdma_engine *sde)
511{
512 return sde->descq_tail == sde->descq_head;
513}
514
515static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
516{
517 return sde->descq_cnt -
518 (sde->descq_tail -
519 ACCESS_ONCE(sde->descq_head)) - 1;
520}
521
522static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
523{
524 return sde->descq_cnt - sdma_descq_freecnt(sde);
525}
526
527/*
528 * Either head_lock or tail lock required to see
529 * a steady state.
530 */
531static inline int __sdma_running(struct sdma_engine *engine)
532{
533 return engine->state.current_state == sdma_state_s99_running;
534}
535
536
537/**
538 * sdma_running() - state suitability test
539 * @engine: sdma engine
540 *
541 * sdma_running probes the internal state to determine if it is suitable
542 * for submitting packets.
543 *
544 * Return:
545 * 1 - ok to submit, 0 - not ok to submit
546 *
547 */
548static inline int sdma_running(struct sdma_engine *engine)
549{
550 unsigned long flags;
551 int ret;
552
553 spin_lock_irqsave(&engine->tail_lock, flags);
554 ret = __sdma_running(engine);
555 spin_unlock_irqrestore(&engine->tail_lock, flags);
556 return ret;
557}
558
559void _sdma_txreq_ahgadd(
560 struct sdma_txreq *tx,
561 u8 num_ahg,
562 u8 ahg_entry,
563 u32 *ahg,
564 u8 ahg_hlen);
565
566
567/**
568 * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
569 * @tx: tx request to initialize
570 * @flags: flags to key last descriptor additions
571 * @tlen: total packet length (pbc + headers + data)
572 * @ahg_entry: ahg entry to use (0 - 31)
573 * @num_ahg: ahg descriptor for first descriptor (0 - 9)
574 * @ahg: array of AHG descriptors (up to 9 entries)
575 * @ahg_hlen: number of bytes from ASIC entry to use
576 * @cb: callback
577 *
578 * The allocation of the sdma_txreq and its enclosing structure is user
579 * dependent. This routine must be called to initialize the user independent
580 * fields.
581 *
582 * The currently supported flags are SDMA_TXREQ_F_URGENT,
583 * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
584 *
585 * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
586 * completion is desired as soon as possible.
587 *
588 * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
589 * copied to the chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add
590 * the AHG descriptors into the first 1 to 3 descriptors.
591 *
592 * Completions of submitted requests can be gotten on selected
593 * txreqs by giving a completion routine callback to sdma_txinit() or
594 * sdma_txinit_ahg(). The environment in which the callback runs
595 * can be from an ISR, a tasklet, or a thread, so no sleeping
596 * kernel routines can be used. Aspects of the sdma ring may
597 * be locked so care should be taken with locking.
598 *
599 * The callback pointer can be NULL to avoid any callback for the packet
600 * being submitted. The callback will be provided this tx, a status, and a flag.
601 *
602 * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
603 * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
604 *
605 * The flag, if an iowait was used, indicates that the iowait
606 * sdma_busy count has reached zero.
607 *
608 * The user data portion of tlen should be precise. The sdma_txadd_* entrances
609 * will pad with a descriptor referencing 1 - 3 bytes when the number of bytes
610 * specified in tlen has been supplied to the sdma_txreq.
611 *
612 * ahg_hlen is used to determine the number of on-chip entry bytes to
613 * use as the header. This is for cases where the stored header is
614 * larger than the header to be used in a packet. This is typical
615 * for verbs, where an RDMA_WRITE_FIRST header is larger than the header
616 * in an RDMA_WRITE_MIDDLE packet.
617 *
618 */
619static inline int sdma_txinit_ahg(
620 struct sdma_txreq *tx,
621 u16 flags,
622 u16 tlen,
623 u8 ahg_entry,
624 u8 num_ahg,
625 u32 *ahg,
626 u8 ahg_hlen,
627 void (*cb)(struct sdma_txreq *, int, int))
628{
629 if (tlen == 0)
630 return -ENODATA;
631 if (tlen > MAX_SDMA_PKT_SIZE)
632 return -EMSGSIZE;
633 tx->desc_limit = ARRAY_SIZE(tx->descs);
634 tx->descp = &tx->descs[0];
635 INIT_LIST_HEAD(&tx->list);
636 tx->num_desc = 0;
637 tx->flags = flags;
638 tx->complete = cb;
639 tx->coalesce_buf = NULL;
640 tx->wait = NULL;
641 tx->tlen = tx->packet_len = tlen;
642 tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
643 tx->descs[0].qw[1] = 0;
644 if (flags & SDMA_TXREQ_F_AHG_COPY)
645 tx->descs[0].qw[1] |=
646 (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
647 << SDMA_DESC1_HEADER_INDEX_SHIFT) |
648 (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
649 << SDMA_DESC1_HEADER_MODE_SHIFT);
650 else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
651 _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
652 return 0;
653}
654
655/**
656 * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
657 * @tx: tx request to initialize
658 * @flags: flags to key last descriptor additions
659 * @tlen: total packet length (pbc + headers + data)
660 * @cb: callback pointer
661 *
662 * The allocation of the sdma_txreq and its enclosing structure is user
663 * dependent. This routine must be called to initialize the user
664 * independent fields.
665 *
666 * The only currently supported flag is SDMA_TXREQ_F_URGENT.
667 *
668 * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
669 * completion is desired as soon as possible.
670 *
671 * Completions of submitted requests can be gotten on selected
672 * txreqs by giving a completion routine callback to sdma_txinit() or
673 * sdma_txinit_ahg(). The environment in which the callback runs
674 * can be from an ISR, a tasklet, or a thread, so no sleeping
675 * kernel routines can be used. The head side of the sdma ring may
676 * be locked so care should be taken with locking.
677 *
678 * The callback pointer can be NULL to avoid any callback for the packet
679 * being submitted.
680 *
681 * The callback, if non-NULL, will be provided this tx and a status. The
682 * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
683 * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
684 *
685 */
686static inline int sdma_txinit(
687 struct sdma_txreq *tx,
688 u16 flags,
689 u16 tlen,
690 void (*cb)(struct sdma_txreq *, int, int))
691{
692 return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
693}
694
695/* helpers - don't use */
696static inline int sdma_mapping_type(struct sdma_desc *d)
697{
698 return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
699 >> SDMA_DESC1_GENERATION_SHIFT;
700}
701
702static inline size_t sdma_mapping_len(struct sdma_desc *d)
703{
704 return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
705 >> SDMA_DESC0_BYTE_COUNT_SHIFT;
706}
707
708static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
709{
710 return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
711 >> SDMA_DESC0_PHY_ADDR_SHIFT;
712}
713
714static inline void make_tx_sdma_desc(
715 struct sdma_txreq *tx,
716 int type,
717 dma_addr_t addr,
718 size_t len)
719{
720 struct sdma_desc *desc = &tx->descp[tx->num_desc];
721
722 if (!tx->num_desc) {
723 /* qw[0] zero; qw[1] first, ahg mode already in from init */
724 desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
725 << SDMA_DESC1_GENERATION_SHIFT;
726 } else {
727 desc->qw[0] = 0;
728 desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
729 << SDMA_DESC1_GENERATION_SHIFT;
730 }
731 desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
732 << SDMA_DESC0_PHY_ADDR_SHIFT) |
733 (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
734 << SDMA_DESC0_BYTE_COUNT_SHIFT);
735}
736
737/* helper to extend txreq */
738int _extend_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
739int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
740void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
741
742/* helpers used by public routines */
743static inline void _sdma_close_tx(struct hfi1_devdata *dd,
744 struct sdma_txreq *tx)
745{
746 tx->descp[tx->num_desc].qw[0] |=
747 SDMA_DESC0_LAST_DESC_FLAG;
748 tx->descp[tx->num_desc].qw[1] |=
749 dd->default_desc1;
750 if (tx->flags & SDMA_TXREQ_F_URGENT)
751 tx->descp[tx->num_desc].qw[1] |=
752 (SDMA_DESC1_HEAD_TO_HOST_FLAG|
753 SDMA_DESC1_INT_REQ_FLAG);
754}
755
756static inline int _sdma_txadd_daddr(
757 struct hfi1_devdata *dd,
758 int type,
759 struct sdma_txreq *tx,
760 dma_addr_t addr,
761 u16 len)
762{
763 int rval = 0;
764
765 if ((unlikely(tx->num_desc == tx->desc_limit))) {
766 rval = _extend_sdma_tx_descs(dd, tx);
767 if (rval)
768 return rval;
769 }
770 make_tx_sdma_desc(
771 tx,
772 type,
773 addr, len);
774 WARN_ON(len > tx->tlen);
775 tx->tlen -= len;
776 /* special cases for last */
777 if (!tx->tlen) {
778 if (tx->packet_len & (sizeof(u32) - 1))
779 rval = _pad_sdma_tx_descs(dd, tx);
780 else
781 _sdma_close_tx(dd, tx);
782 }
783 tx->num_desc++;
784 return rval;
785}
786
787/**
788 * sdma_txadd_page() - add a page to the sdma_txreq
789 * @dd: the device to use for mapping
790 * @tx: tx request to which the page is added
791 * @page: page to map
792 * @offset: offset within the page
793 * @len: length in bytes
794 *
795 * This is used to add a page/offset/length descriptor.
796 *
797 * The mapping/unmapping of the page/offset/len is automatically handled.
798 *
799 * Return:
800 * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
801 * extend descriptor array or couldn't allocate coalesce
802 * buffer.
803 *
804 */
805static inline int sdma_txadd_page(
806 struct hfi1_devdata *dd,
807 struct sdma_txreq *tx,
808 struct page *page,
809 unsigned long offset,
810 u16 len)
811{
812 dma_addr_t addr =
813 dma_map_page(
814 &dd->pcidev->dev,
815 page,
816 offset,
817 len,
818 DMA_TO_DEVICE);
819 if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
820 sdma_txclean(dd, tx);
821 return -ENOSPC;
822 }
823 return _sdma_txadd_daddr(
824 dd, SDMA_MAP_PAGE, tx, addr, len);
825}
826
827/**
828 * sdma_txadd_daddr() - add a dma address to the sdma_txreq
829 * @dd: the device to use for mapping
830 * @tx: sdma_txreq to which the page is added
831 * @addr: dma address mapped by caller
832 * @len: length in bytes
833 *
834 * This is used to add a descriptor for memory that is already dma mapped.
835 *
836 * In this case, there is no unmapping as part of the progress processing for
837 * this memory location.
838 *
839 * Return:
840 * 0 - success, -ENOMEM - couldn't extend descriptor array
841 */
842
843static inline int sdma_txadd_daddr(
844 struct hfi1_devdata *dd,
845 struct sdma_txreq *tx,
846 dma_addr_t addr,
847 u16 len)
848{
849 return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
850}
851
852/**
853 * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
854 * @dd: the device to use for mapping
855 * @tx: sdma_txreq to which the page is added
856 * @kvaddr: the kernel virtual address
857 * @len: length in bytes
858 *
859 * This is used to add a descriptor referenced by the indicated kvaddr and
860 * len.
861 *
862 * The mapping/unmapping of the kvaddr and len is automatically handled.
863 *
864 * Return:
865 * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend
866 * descriptor array
867 */
868static inline int sdma_txadd_kvaddr(
869 struct hfi1_devdata *dd,
870 struct sdma_txreq *tx,
871 void *kvaddr,
872 u16 len)
873{
874 dma_addr_t addr =
875 dma_map_single(
876 &dd->pcidev->dev,
877 kvaddr,
878 len,
879 DMA_TO_DEVICE);
880 if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
881 sdma_txclean(dd, tx);
882 return -ENOSPC;
883 }
884 return _sdma_txadd_daddr(
885 dd, SDMA_MAP_SINGLE, tx, addr, len);
886}
887
888struct iowait;
889
890int sdma_send_txreq(struct sdma_engine *sde,
891 struct iowait *wait,
892 struct sdma_txreq *tx);
893int sdma_send_txlist(struct sdma_engine *sde,
894 struct iowait *wait,
895 struct list_head *tx_list);
896
897int sdma_ahg_alloc(struct sdma_engine *sde);
898void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
899
900/**
901 * sdma_build_ahg - build ahg descriptor
902 * @data: 16 bit update value
903 * @dwindex: index of the header dword to update
904 * @startbit: starting bit of the field within the dword
905 * @bits: field width in bits
906 *
907 * Build and return a 32 bit descriptor.
908 */
909static inline u32 sdma_build_ahg_descriptor(
910 u16 data,
911 u8 dwindex,
912 u8 startbit,
913 u8 bits)
914{
915 return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
916 ((startbit & SDMA_AHG_FIELD_START_MASK) <<
917 SDMA_AHG_FIELD_START_SHIFT) |
918 ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
919 SDMA_AHG_FIELD_LEN_SHIFT) |
920 ((dwindex & SDMA_AHG_INDEX_MASK) <<
921 SDMA_AHG_INDEX_SHIFT) |
922 ((data & SDMA_AHG_VALUE_MASK) <<
923 SDMA_AHG_VALUE_SHIFT));
924}
925
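For instance, an update descriptor that writes the 8-bit value 0x12 starting at bit 16 of header dword 4 (a purely hypothetical field location) would be built as below and passed in the ahg[] array given to sdma_txinit_ahg():

	u32 ahg[1];

	/* hypothetical field: 8 bits wide, at bit 16 of header dword 4 */
	ahg[0] = sdma_build_ahg_descriptor(0x12, 4, 16, 8);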
926/**
927 * sdma_progress - use seq number to detect head progress
928 * @sde: sdma_engine to check
929 * @seq: base seq count
930 * @tx: txreq for which we need to check descriptor availability
931 *
932 * This is used in the appropriate spot in the sleep routine
933 * to check for potential ring progress. This routine gets the
934 * seqcount before queuing the iowait structure for progress.
935 *
936 * If the seqcount indicates that progress needs to be checked,
937 * re-submission is detected by checking whether the descriptor
938 * queue has enough descriptors for the txreq.
939 */
940static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
941 struct sdma_txreq *tx)
942{
943 if (read_seqretry(&sde->head_lock, seq)) {
944 sde->desc_avail = sdma_descq_freecnt(sde);
945 if (tx->num_desc > sde->desc_avail)
946 return 0;
947 return 1;
948 }
949 return 0;
950}
951
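A hedged sketch of the intended call site follows; the sleep callback signature is inferred from the wait->sleep() call in sdma_check_progress(), and the queuing of the iowait itself (locking, list handling) is only indicated by a comment.

static int example_sleep(struct sdma_engine *sde, struct iowait *wait,
			 struct sdma_txreq *tx, unsigned seq)
{
	if (sdma_progress(sde, seq, tx))
		/* the head moved and there is now room: ask the caller to retry */
		return -EAGAIN;
	/*
	 * No progress: queue the iowait (e.g. on sde->dmawait under the
	 * appropriate lock) and report the tx as queued, matching the
	 * -EIOCBQUEUED return documented for sdma_send_txreq().
	 */
	return -EIOCBQUEUED;
}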
952/**
953 * sdma_iowait_schedule() - schedule an iowait on an engine's workqueue
954 * @sde: sdma_engine whose workqueue to use
955 * @wait: wait struct to schedule
956 *
957 * This function schedules the iowait
958 * structure embedded in the QP or PQ on the engine's workqueue.
959 *
960 */
961static inline void sdma_iowait_schedule(
962 struct sdma_engine *sde,
963 struct iowait *wait)
964{
965 iowait_schedule(wait, sde->wq);
966}
967
968/* for use by interrupt handling */
969void sdma_engine_error(struct sdma_engine *sde, u64 status);
970void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
971
972/*
973 *
974 * The diagram below details the relationship of the mapping structures
975 *
976 * Since the mapping now allows for non-uniform engines per vl, the
977 * number of engines for a vl is either the vl_engines[vl] or
978 * a computation based on num_sdma/num_vls:
979 *
980 * For example:
981 * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
982 *
983 * n = roundup to next highest power of 2 using nactual
984 *
985 * In the case where num_sdma/num_vls doesn't divide
986 * evenly, the extras are added from the last vl downward.
987 *
988 * For the case where n > nactual, the engines are assigned
989 * in a round robin fashion wrapping back to the first engine
990 * for a particular vl.
991 *
992 * dd->sdma_map
993 * | sdma_map_elem[0]
994 * | +--------------------+
995 * v | mask |
996 * sdma_vl_map |--------------------|
997 * +--------------------------+ | sde[0] -> eng 1 |
998 * | list (RCU) | |--------------------|
999 * |--------------------------| ->| sde[1] -> eng 2 |
1000 * | mask | --/ |--------------------|
1001 * |--------------------------| -/ | * |
1002 * | actual_vls (max 8) | -/ |--------------------|
1003 * |--------------------------| --/ | sde[n] -> eng n |
1004 * | vls (max 8) | -/ +--------------------+
1005 * |--------------------------| --/
1006 * | map[0] |-/
1007 * |--------------------------| +--------------------+
1008 * | map[1] |--- | mask |
1009 * |--------------------------| \---- |--------------------|
1010 * | * | \-- | sde[0] -> eng 1+n |
1011 * | * | \---- |--------------------|
1012 * | * | \->| sde[1] -> eng 2+n |
1013 * |--------------------------| |--------------------|
1014 * | map[vls - 1] |- | * |
1015 * +--------------------------+ \- |--------------------|
1016 * \- | sde[m] -> eng m+n |
1017 * \ +--------------------+
1018 * \-
1019 * \
1020 * \- +--------------------+
1021 * \- | mask |
1022 * \ |--------------------|
1023 * \- | sde[0] -> eng 1+m+n|
1024 * \- |--------------------|
1025 * >| sde[1] -> eng 2+m+n|
1026 * |--------------------|
1027 * | * |
1028 * |--------------------|
1029 * | sde[o] -> eng o+m+n|
1030 * +--------------------+
1031 *
1032 */
1033
1034/**
1035 * struct sdma_map_elem - mapping for a vl
1036 * @mask - selector mask
1037 * @sde - array of engines for this vl
1038 *
1039 * The mask is used to "mod" the selector
1040 * to produce an index into the trailing
1041 * array of sdes.
1042 */
1043struct sdma_map_elem {
1044 u32 mask;
1045 struct sdma_engine *sde[0];
1046};
1047
1048/**
1049 * struct sdma_map_el - mapping for a vl
1050 * @list - rcu head for free callback
1051 * @mask - vl mask to "mod" the vl to produce an index to map array
1052 * @actual_vls - number of vls
1053 * @vls - number of vls rounded to next power of 2
1054 * @map - array of sdma_map_elem entries
1055 *
1056 * This is the parent mapping structure. The trailing
1057 * members of the struct point to sdma_map_elem entries, which
1058 * in turn point to an array of sde's for that vl.
1059 */
1060struct sdma_vl_map {
1061 struct rcu_head list;
1062 u32 mask;
1063 u8 actual_vls;
1064 u8 vls;
1065 struct sdma_map_elem *map[0];
1066};
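
As an illustration of the mask/"mod" lookup described above (a sketch only, not the driver's actual sdma_select_engine_vl() implementation):

/* Sketch: resolve (vl, selector) to an engine through the two-level map. */
static inline struct sdma_engine *pick_engine_sketch(struct sdma_vl_map *m,
						     u32 selector, u8 vl)
{
	struct sdma_map_elem *e;

	/* fold the vl into the map[] index using the power-of-2 mask */
	e = m->map[vl & m->mask];
	/* the per-vl mask then "mods" the selector into the sde[] array */
	return e->sde[selector & e->mask];
}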
1067
1068int sdma_map_init(
1069 struct hfi1_devdata *dd,
1070 u8 port,
1071 u8 num_vls,
1072 u8 *vl_engines);
1073
1074/* slow path */
1075void _sdma_engine_progress_schedule(struct sdma_engine *sde);
1076
1077/**
1078 * sdma_engine_progress_schedule() - schedule progress on engine
1079 * @sde: sdma_engine to schedule progress
1080 *
1081 * This is the fast path.
1082 *
1083 */
1084static inline void sdma_engine_progress_schedule(
1085 struct sdma_engine *sde)
1086{
1087 if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
1088 return;
1089 _sdma_engine_progress_schedule(sde);
1090}
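
For example (illustrative): with descq_cnt = 2048, the slow-path
_sdma_engine_progress_schedule() is only invoked once at least
2048 / 8 = 256 descriptors are in process on the engine; below that
threshold the fast path returns without doing anything.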
1091
1092struct sdma_engine *sdma_select_engine_sc(
1093 struct hfi1_devdata *dd,
1094 u32 selector,
1095 u8 sc5);
1096
1097struct sdma_engine *sdma_select_engine_vl(
1098 struct hfi1_devdata *dd,
1099 u32 selector,
1100 u8 vl);
1101
1102void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
1103
1104#ifdef CONFIG_SDMA_VERBOSITY
1105void sdma_dumpstate(struct sdma_engine *);
1106#endif
1107static inline char *slashstrip(char *s)
1108{
1109 char *r = s;
1110
1111 while (*s)
1112 if (*s++ == '/')
1113 r = s;
1114 return r;
1115}
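
As a usage note (illustrative), slashstrip() returns the final path
component of its argument, e.g. slashstrip("hfi1/sdma.c") yields "sdma.c",
which is handy for trimming __FILE__-style paths in debug output.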
1116
1117u16 sdma_get_descq_cnt(void);
1118
1119extern uint mod_num_sdma;
1120
1121void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
1122
1123#endif
diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/staging/rdma/hfi1/srq.c
new file mode 100644
index 000000000000..67786d417493
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/srq.c
@@ -0,0 +1,397 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/err.h>
52#include <linux/slab.h>
53#include <linux/vmalloc.h>
54
55#include "verbs.h"
56
57/**
58 * hfi1_post_srq_receive - post a receive on a shared receive queue
59 * @ibsrq: the SRQ to post the receive on
60 * @wr: the list of work requests to post
61 * @bad_wr: A pointer to the first WR to cause a problem is put here
62 *
63 * This may be called from interrupt context.
64 */
65int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
66 struct ib_recv_wr **bad_wr)
67{
68 struct hfi1_srq *srq = to_isrq(ibsrq);
69 struct hfi1_rwq *wq;
70 unsigned long flags;
71 int ret;
72
73 for (; wr; wr = wr->next) {
74 struct hfi1_rwqe *wqe;
75 u32 next;
76 int i;
77
78 if ((unsigned) wr->num_sge > srq->rq.max_sge) {
79 *bad_wr = wr;
80 ret = -EINVAL;
81 goto bail;
82 }
83
84 spin_lock_irqsave(&srq->rq.lock, flags);
85 wq = srq->rq.wq;
86 next = wq->head + 1;
87 if (next >= srq->rq.size)
88 next = 0;
89 if (next == wq->tail) {
90 spin_unlock_irqrestore(&srq->rq.lock, flags);
91 *bad_wr = wr;
92 ret = -ENOMEM;
93 goto bail;
94 }
95
96 wqe = get_rwqe_ptr(&srq->rq, wq->head);
97 wqe->wr_id = wr->wr_id;
98 wqe->num_sge = wr->num_sge;
99 for (i = 0; i < wr->num_sge; i++)
100 wqe->sg_list[i] = wr->sg_list[i];
101 /* Make sure queue entry is written before the head index. */
102 smp_wmb();
103 wq->head = next;
104 spin_unlock_irqrestore(&srq->rq.lock, flags);
105 }
106 ret = 0;
107
108bail:
109 return ret;
110}
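
The smp_wmb() above publishes the WQE contents before the head index is
advanced; the consumer in the driver's receive path (not in this file) has
to pair it with a read barrier. A rough sketch of that pairing, with
hypothetical local names:

	u32 head = wq->head;		/* observe the producer's head first */
	smp_rmb();			/* pairs with the smp_wmb() in the poster */
	wqe = get_rwqe_ptr(rq, tail);	/* now safe to read the published entry */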
111
112/**
113 * hfi1_create_srq - create a shared receive queue
114 * @ibpd: the protection domain of the SRQ to create
115 * @srq_init_attr: the attributes of the SRQ
116 * @udata: data from libibverbs when creating a user SRQ
117 */
118struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
119 struct ib_srq_init_attr *srq_init_attr,
120 struct ib_udata *udata)
121{
122 struct hfi1_ibdev *dev = to_idev(ibpd->device);
123 struct hfi1_srq *srq;
124 u32 sz;
125 struct ib_srq *ret;
126
127 if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
128 ret = ERR_PTR(-ENOSYS);
129 goto done;
130 }
131
132 if (srq_init_attr->attr.max_sge == 0 ||
133 srq_init_attr->attr.max_sge > hfi1_max_srq_sges ||
134 srq_init_attr->attr.max_wr == 0 ||
135 srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) {
136 ret = ERR_PTR(-EINVAL);
137 goto done;
138 }
139
140 srq = kmalloc(sizeof(*srq), GFP_KERNEL);
141 if (!srq) {
142 ret = ERR_PTR(-ENOMEM);
143 goto done;
144 }
145
146 /*
147 * Need to use vmalloc() if we want to support large #s of entries.
148 */
149 srq->rq.size = srq_init_attr->attr.max_wr + 1;
150 srq->rq.max_sge = srq_init_attr->attr.max_sge;
151 sz = sizeof(struct ib_sge) * srq->rq.max_sge +
152 sizeof(struct hfi1_rwqe);
153 srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz);
154 if (!srq->rq.wq) {
155 ret = ERR_PTR(-ENOMEM);
156 goto bail_srq;
157 }
158
159 /*
160 * Return the address of the RWQ as the offset to mmap.
161 * See hfi1_mmap() for details.
162 */
163 if (udata && udata->outlen >= sizeof(__u64)) {
164 int err;
165 u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz;
166
167 srq->ip =
168 hfi1_create_mmap_info(dev, s, ibpd->uobject->context,
169 srq->rq.wq);
170 if (!srq->ip) {
171 ret = ERR_PTR(-ENOMEM);
172 goto bail_wq;
173 }
174
175 err = ib_copy_to_udata(udata, &srq->ip->offset,
176 sizeof(srq->ip->offset));
177 if (err) {
178 ret = ERR_PTR(err);
179 goto bail_ip;
180 }
181 } else
182 srq->ip = NULL;
183
184 /*
185 * ib_create_srq() will initialize srq->ibsrq.
186 */
187 spin_lock_init(&srq->rq.lock);
188 srq->rq.wq->head = 0;
189 srq->rq.wq->tail = 0;
190 srq->limit = srq_init_attr->attr.srq_limit;
191
192 spin_lock(&dev->n_srqs_lock);
193 if (dev->n_srqs_allocated == hfi1_max_srqs) {
194 spin_unlock(&dev->n_srqs_lock);
195 ret = ERR_PTR(-ENOMEM);
196 goto bail_ip;
197 }
198
199 dev->n_srqs_allocated++;
200 spin_unlock(&dev->n_srqs_lock);
201
202 if (srq->ip) {
203 spin_lock_irq(&dev->pending_lock);
204 list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
205 spin_unlock_irq(&dev->pending_lock);
206 }
207
208 ret = &srq->ibsrq;
209 goto done;
210
211bail_ip:
212 kfree(srq->ip);
213bail_wq:
214 vfree(srq->rq.wq);
215bail_srq:
216 kfree(srq);
217done:
218 return ret;
219}
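
The offset copied into udata above is what a user-space provider later
passes to mmap() on the verbs context file descriptor; see hfi1_mmap().
A sketch of the consuming side, where context_fd, resp_offset and
rwq_size are hypothetical values the provider obtained when creating
the SRQ:

#include <sys/mman.h>

/* Sketch only: map the shared receive work queue into user space. */
static void *map_srq_rwq(int context_fd, off_t resp_offset, size_t rwq_size)
{
	void *rwq = mmap(NULL, rwq_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, context_fd, resp_offset);

	return rwq == MAP_FAILED ? NULL : rwq;
}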
220
221/**
222 * hfi1_modify_srq - modify a shared receive queue
223 * @ibsrq: the SRQ to modify
224 * @attr: the new attributes of the SRQ
225 * @attr_mask: indicates which attributes to modify
226 * @udata: user data for libibverbs.so
227 */
228int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
229 enum ib_srq_attr_mask attr_mask,
230 struct ib_udata *udata)
231{
232 struct hfi1_srq *srq = to_isrq(ibsrq);
233 struct hfi1_rwq *wq;
234 int ret = 0;
235
236 if (attr_mask & IB_SRQ_MAX_WR) {
237 struct hfi1_rwq *owq;
238 struct hfi1_rwqe *p;
239 u32 sz, size, n, head, tail;
240
241 /* Check that the requested sizes are below the limits. */
242 if ((attr->max_wr > hfi1_max_srq_wrs) ||
243 ((attr_mask & IB_SRQ_LIMIT) ?
244 attr->srq_limit : srq->limit) > attr->max_wr) {
245 ret = -EINVAL;
246 goto bail;
247 }
248
249 sz = sizeof(struct hfi1_rwqe) +
250 srq->rq.max_sge * sizeof(struct ib_sge);
251 size = attr->max_wr + 1;
252 wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz);
253 if (!wq) {
254 ret = -ENOMEM;
255 goto bail;
256 }
257
258 /* Check that we can write the offset to mmap. */
259 if (udata && udata->inlen >= sizeof(__u64)) {
260 __u64 offset_addr;
261 __u64 offset = 0;
262
263 ret = ib_copy_from_udata(&offset_addr, udata,
264 sizeof(offset_addr));
265 if (ret)
266 goto bail_free;
267 udata->outbuf =
268 (void __user *) (unsigned long) offset_addr;
269 ret = ib_copy_to_udata(udata, &offset,
270 sizeof(offset));
271 if (ret)
272 goto bail_free;
273 }
274
275 spin_lock_irq(&srq->rq.lock);
276 /*
277 * validate head and tail pointer values and compute
278 * the number of remaining WQEs.
279 */
280 owq = srq->rq.wq;
281 head = owq->head;
282 tail = owq->tail;
283 if (head >= srq->rq.size || tail >= srq->rq.size) {
284 ret = -EINVAL;
285 goto bail_unlock;
286 }
287 n = head;
288 if (n < tail)
289 n += srq->rq.size - tail;
290 else
291 n -= tail;
292 if (size <= n) {
293 ret = -EINVAL;
294 goto bail_unlock;
295 }
296 n = 0;
297 p = wq->wq;
298 while (tail != head) {
299 struct hfi1_rwqe *wqe;
300 int i;
301
302 wqe = get_rwqe_ptr(&srq->rq, tail);
303 p->wr_id = wqe->wr_id;
304 p->num_sge = wqe->num_sge;
305 for (i = 0; i < wqe->num_sge; i++)
306 p->sg_list[i] = wqe->sg_list[i];
307 n++;
308 p = (struct hfi1_rwqe *)((char *)p + sz);
309 if (++tail >= srq->rq.size)
310 tail = 0;
311 }
312 srq->rq.wq = wq;
313 srq->rq.size = size;
314 wq->head = n;
315 wq->tail = 0;
316 if (attr_mask & IB_SRQ_LIMIT)
317 srq->limit = attr->srq_limit;
318 spin_unlock_irq(&srq->rq.lock);
319
320 vfree(owq);
321
322 if (srq->ip) {
323 struct hfi1_mmap_info *ip = srq->ip;
324 struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device);
325 u32 s = sizeof(struct hfi1_rwq) + size * sz;
326
327 hfi1_update_mmap_info(dev, ip, s, wq);
328
329 /*
330 * Return the offset to mmap.
331 * See hfi1_mmap() for details.
332 */
333 if (udata && udata->inlen >= sizeof(__u64)) {
334 ret = ib_copy_to_udata(udata, &ip->offset,
335 sizeof(ip->offset));
336 if (ret)
337 goto bail;
338 }
339
340 /*
341 * Put user mapping info onto the pending list
342 * unless it already is on the list.
343 */
344 spin_lock_irq(&dev->pending_lock);
345 if (list_empty(&ip->pending_mmaps))
346 list_add(&ip->pending_mmaps,
347 &dev->pending_mmaps);
348 spin_unlock_irq(&dev->pending_lock);
349 }
350 } else if (attr_mask & IB_SRQ_LIMIT) {
351 spin_lock_irq(&srq->rq.lock);
352 if (attr->srq_limit >= srq->rq.size)
353 ret = -EINVAL;
354 else
355 srq->limit = attr->srq_limit;
356 spin_unlock_irq(&srq->rq.lock);
357 }
358 goto bail;
359
360bail_unlock:
361 spin_unlock_irq(&srq->rq.lock);
362bail_free:
363 vfree(wq);
364bail:
365 return ret;
366}
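
A worked example of the occupancy check in the resize path above: with
srq->rq.size = 8, head = 2 and tail = 6, the number of queued entries is
n = 2 + (8 - 6) = 4, so a request shrinking the ring to size = 4
(max_wr = 3) fails the "size <= n" test with -EINVAL, while any larger
size copies the 4 live WQEs into the new ring and sets head = 4, tail = 0.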
367
368int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
369{
370 struct hfi1_srq *srq = to_isrq(ibsrq);
371
372 attr->max_wr = srq->rq.size - 1;
373 attr->max_sge = srq->rq.max_sge;
374 attr->srq_limit = srq->limit;
375 return 0;
376}
377
378/**
379 * hfi1_destroy_srq - destroy a shared receive queue
380 * @ibsrq: the SRQ to destroy
381 */
382int hfi1_destroy_srq(struct ib_srq *ibsrq)
383{
384 struct hfi1_srq *srq = to_isrq(ibsrq);
385 struct hfi1_ibdev *dev = to_idev(ibsrq->device);
386
387 spin_lock(&dev->n_srqs_lock);
388 dev->n_srqs_allocated--;
389 spin_unlock(&dev->n_srqs_lock);
390 if (srq->ip)
391 kref_put(&srq->ip->ref, hfi1_release_mmap_info);
392 else
393 vfree(srq->rq.wq);
394 kfree(srq);
395
396 return 0;
397}
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
new file mode 100644
index 000000000000..b78c72861ef9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -0,0 +1,739 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/ctype.h>
51
52#include "hfi.h"
53#include "mad.h"
54#include "trace.h"
55
56
57/*
58 * Start of per-port congestion control structures and support code
59 */
60
61/*
62 * Congestion control table size followed by table entries
63 */
64static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
65 struct bin_attribute *bin_attr,
66 char *buf, loff_t pos, size_t count)
67{
68 int ret;
69 struct hfi1_pportdata *ppd =
70 container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
71 struct cc_state *cc_state;
72
73 ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
74 + sizeof(__be16);
75
76 if (pos > ret)
77 return -EINVAL;
78
79 if (count > ret - pos)
80 count = ret - pos;
81
82 if (!count)
83 return count;
84
85 rcu_read_lock();
86 cc_state = get_cc_state(ppd);
87 if (cc_state == NULL) {
88 rcu_read_unlock();
89 return -EINVAL;
90 }
91 memcpy(buf, &cc_state->cct, count);
92 rcu_read_unlock();
93
94 return count;
95}
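
A sketch of how user space might read this binary attribute; the sysfs
path is an assumption based on the "CCMgtA" kobject created in
hfi1_create_port_files() later in this file, and the blob layout is the
table size followed by the entries, per the comment above:

#include <fcntl.h>
#include <unistd.h>

/* Sketch only: read the raw congestion control table blob. */
static ssize_t read_cc_table(void *buf, size_t len)
{
	int fd = open("/sys/class/infiniband/hfi1_0/ports/1/CCMgtA/cc_table_bin",
		      O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = read(fd, buf, len);
	close(fd);
	return n;
}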
96
97static void port_release(struct kobject *kobj)
98{
99 /* nothing to do since memory is freed by hfi1_free_devdata() */
100}
101
102static struct kobj_type port_cc_ktype = {
103 .release = port_release,
104};
105
106static struct bin_attribute cc_table_bin_attr = {
107 .attr = {.name = "cc_table_bin", .mode = 0444},
108 .read = read_cc_table_bin,
109 .size = PAGE_SIZE,
110};
111
112/*
113 * Congestion settings: port control, control map and an array of 16
114 * entries for the congestion entries - increase, timer, event log
115 * trigger threshold and the minimum injection rate delay.
116 */
117static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
118 struct bin_attribute *bin_attr,
119 char *buf, loff_t pos, size_t count)
120{
121 int ret;
122 struct hfi1_pportdata *ppd =
123 container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
124 struct cc_state *cc_state;
125
126 ret = sizeof(struct opa_congestion_setting_attr_shadow);
127
128 if (pos > ret)
129 return -EINVAL;
130 if (count > ret - pos)
131 count = ret - pos;
132
133 if (!count)
134 return count;
135
136 rcu_read_lock();
137 cc_state = get_cc_state(ppd);
138 if (cc_state == NULL) {
139 rcu_read_unlock();
140 return -EINVAL;
141 }
142 memcpy(buf, &cc_state->cong_setting, count);
143 rcu_read_unlock();
144
145 return count;
146}
147
148static struct bin_attribute cc_setting_bin_attr = {
149 .attr = {.name = "cc_settings_bin", .mode = 0444},
150 .read = read_cc_setting_bin,
151 .size = PAGE_SIZE,
152};
153
154/* Start sc2vl */
155#define HFI1_SC2VL_ATTR(N) \
156 static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
157 .attr = { .name = __stringify(N), .mode = 0444 }, \
158 .sc = N \
159 }
160
161struct hfi1_sc2vl_attr {
162 struct attribute attr;
163 int sc;
164};
165
166HFI1_SC2VL_ATTR(0);
167HFI1_SC2VL_ATTR(1);
168HFI1_SC2VL_ATTR(2);
169HFI1_SC2VL_ATTR(3);
170HFI1_SC2VL_ATTR(4);
171HFI1_SC2VL_ATTR(5);
172HFI1_SC2VL_ATTR(6);
173HFI1_SC2VL_ATTR(7);
174HFI1_SC2VL_ATTR(8);
175HFI1_SC2VL_ATTR(9);
176HFI1_SC2VL_ATTR(10);
177HFI1_SC2VL_ATTR(11);
178HFI1_SC2VL_ATTR(12);
179HFI1_SC2VL_ATTR(13);
180HFI1_SC2VL_ATTR(14);
181HFI1_SC2VL_ATTR(15);
182HFI1_SC2VL_ATTR(16);
183HFI1_SC2VL_ATTR(17);
184HFI1_SC2VL_ATTR(18);
185HFI1_SC2VL_ATTR(19);
186HFI1_SC2VL_ATTR(20);
187HFI1_SC2VL_ATTR(21);
188HFI1_SC2VL_ATTR(22);
189HFI1_SC2VL_ATTR(23);
190HFI1_SC2VL_ATTR(24);
191HFI1_SC2VL_ATTR(25);
192HFI1_SC2VL_ATTR(26);
193HFI1_SC2VL_ATTR(27);
194HFI1_SC2VL_ATTR(28);
195HFI1_SC2VL_ATTR(29);
196HFI1_SC2VL_ATTR(30);
197HFI1_SC2VL_ATTR(31);
198
199
200static struct attribute *sc2vl_default_attributes[] = {
201 &hfi1_sc2vl_attr_0.attr,
202 &hfi1_sc2vl_attr_1.attr,
203 &hfi1_sc2vl_attr_2.attr,
204 &hfi1_sc2vl_attr_3.attr,
205 &hfi1_sc2vl_attr_4.attr,
206 &hfi1_sc2vl_attr_5.attr,
207 &hfi1_sc2vl_attr_6.attr,
208 &hfi1_sc2vl_attr_7.attr,
209 &hfi1_sc2vl_attr_8.attr,
210 &hfi1_sc2vl_attr_9.attr,
211 &hfi1_sc2vl_attr_10.attr,
212 &hfi1_sc2vl_attr_11.attr,
213 &hfi1_sc2vl_attr_12.attr,
214 &hfi1_sc2vl_attr_13.attr,
215 &hfi1_sc2vl_attr_14.attr,
216 &hfi1_sc2vl_attr_15.attr,
217 &hfi1_sc2vl_attr_16.attr,
218 &hfi1_sc2vl_attr_17.attr,
219 &hfi1_sc2vl_attr_18.attr,
220 &hfi1_sc2vl_attr_19.attr,
221 &hfi1_sc2vl_attr_20.attr,
222 &hfi1_sc2vl_attr_21.attr,
223 &hfi1_sc2vl_attr_22.attr,
224 &hfi1_sc2vl_attr_23.attr,
225 &hfi1_sc2vl_attr_24.attr,
226 &hfi1_sc2vl_attr_25.attr,
227 &hfi1_sc2vl_attr_26.attr,
228 &hfi1_sc2vl_attr_27.attr,
229 &hfi1_sc2vl_attr_28.attr,
230 &hfi1_sc2vl_attr_29.attr,
231 &hfi1_sc2vl_attr_30.attr,
232 &hfi1_sc2vl_attr_31.attr,
233 NULL
234};
235
236static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
237 char *buf)
238{
239 struct hfi1_sc2vl_attr *sattr =
240 container_of(attr, struct hfi1_sc2vl_attr, attr);
241 struct hfi1_pportdata *ppd =
242 container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
243 struct hfi1_devdata *dd = ppd->dd;
244
245 return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
246}
247
248static const struct sysfs_ops hfi1_sc2vl_ops = {
249 .show = sc2vl_attr_show,
250};
251
252static struct kobj_type hfi1_sc2vl_ktype = {
253 .release = port_release,
254 .sysfs_ops = &hfi1_sc2vl_ops,
255 .default_attrs = sc2vl_default_attributes
256};
257
258/* End sc2vl */
259
260/* Start sl2sc */
261#define HFI1_SL2SC_ATTR(N) \
262 static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \
263 .attr = { .name = __stringify(N), .mode = 0444 }, \
264 .sl = N \
265 }
266
267struct hfi1_sl2sc_attr {
268 struct attribute attr;
269 int sl;
270};
271
272HFI1_SL2SC_ATTR(0);
273HFI1_SL2SC_ATTR(1);
274HFI1_SL2SC_ATTR(2);
275HFI1_SL2SC_ATTR(3);
276HFI1_SL2SC_ATTR(4);
277HFI1_SL2SC_ATTR(5);
278HFI1_SL2SC_ATTR(6);
279HFI1_SL2SC_ATTR(7);
280HFI1_SL2SC_ATTR(8);
281HFI1_SL2SC_ATTR(9);
282HFI1_SL2SC_ATTR(10);
283HFI1_SL2SC_ATTR(11);
284HFI1_SL2SC_ATTR(12);
285HFI1_SL2SC_ATTR(13);
286HFI1_SL2SC_ATTR(14);
287HFI1_SL2SC_ATTR(15);
288HFI1_SL2SC_ATTR(16);
289HFI1_SL2SC_ATTR(17);
290HFI1_SL2SC_ATTR(18);
291HFI1_SL2SC_ATTR(19);
292HFI1_SL2SC_ATTR(20);
293HFI1_SL2SC_ATTR(21);
294HFI1_SL2SC_ATTR(22);
295HFI1_SL2SC_ATTR(23);
296HFI1_SL2SC_ATTR(24);
297HFI1_SL2SC_ATTR(25);
298HFI1_SL2SC_ATTR(26);
299HFI1_SL2SC_ATTR(27);
300HFI1_SL2SC_ATTR(28);
301HFI1_SL2SC_ATTR(29);
302HFI1_SL2SC_ATTR(30);
303HFI1_SL2SC_ATTR(31);
304
305
306static struct attribute *sl2sc_default_attributes[] = {
307 &hfi1_sl2sc_attr_0.attr,
308 &hfi1_sl2sc_attr_1.attr,
309 &hfi1_sl2sc_attr_2.attr,
310 &hfi1_sl2sc_attr_3.attr,
311 &hfi1_sl2sc_attr_4.attr,
312 &hfi1_sl2sc_attr_5.attr,
313 &hfi1_sl2sc_attr_6.attr,
314 &hfi1_sl2sc_attr_7.attr,
315 &hfi1_sl2sc_attr_8.attr,
316 &hfi1_sl2sc_attr_9.attr,
317 &hfi1_sl2sc_attr_10.attr,
318 &hfi1_sl2sc_attr_11.attr,
319 &hfi1_sl2sc_attr_12.attr,
320 &hfi1_sl2sc_attr_13.attr,
321 &hfi1_sl2sc_attr_14.attr,
322 &hfi1_sl2sc_attr_15.attr,
323 &hfi1_sl2sc_attr_16.attr,
324 &hfi1_sl2sc_attr_17.attr,
325 &hfi1_sl2sc_attr_18.attr,
326 &hfi1_sl2sc_attr_19.attr,
327 &hfi1_sl2sc_attr_20.attr,
328 &hfi1_sl2sc_attr_21.attr,
329 &hfi1_sl2sc_attr_22.attr,
330 &hfi1_sl2sc_attr_23.attr,
331 &hfi1_sl2sc_attr_24.attr,
332 &hfi1_sl2sc_attr_25.attr,
333 &hfi1_sl2sc_attr_26.attr,
334 &hfi1_sl2sc_attr_27.attr,
335 &hfi1_sl2sc_attr_28.attr,
336 &hfi1_sl2sc_attr_29.attr,
337 &hfi1_sl2sc_attr_30.attr,
338 &hfi1_sl2sc_attr_31.attr,
339 NULL
340};
341
342static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
343 char *buf)
344{
345 struct hfi1_sl2sc_attr *sattr =
346 container_of(attr, struct hfi1_sl2sc_attr, attr);
347 struct hfi1_pportdata *ppd =
348 container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
349 struct hfi1_ibport *ibp = &ppd->ibport_data;
350
351 return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
352}
353
354static const struct sysfs_ops hfi1_sl2sc_ops = {
355 .show = sl2sc_attr_show,
356};
357
358static struct kobj_type hfi1_sl2sc_ktype = {
359 .release = port_release,
360 .sysfs_ops = &hfi1_sl2sc_ops,
361 .default_attrs = sl2sc_default_attributes
362};
363
364/* End sl2sc */
365
366/* Start vl2mtu */
367
368#define HFI1_VL2MTU_ATTR(N) \
369 static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
370 .attr = { .name = __stringify(N), .mode = 0444 }, \
371 .vl = N \
372 }
373
374struct hfi1_vl2mtu_attr {
375 struct attribute attr;
376 int vl;
377};
378
379HFI1_VL2MTU_ATTR(0);
380HFI1_VL2MTU_ATTR(1);
381HFI1_VL2MTU_ATTR(2);
382HFI1_VL2MTU_ATTR(3);
383HFI1_VL2MTU_ATTR(4);
384HFI1_VL2MTU_ATTR(5);
385HFI1_VL2MTU_ATTR(6);
386HFI1_VL2MTU_ATTR(7);
387HFI1_VL2MTU_ATTR(8);
388HFI1_VL2MTU_ATTR(9);
389HFI1_VL2MTU_ATTR(10);
390HFI1_VL2MTU_ATTR(11);
391HFI1_VL2MTU_ATTR(12);
392HFI1_VL2MTU_ATTR(13);
393HFI1_VL2MTU_ATTR(14);
394HFI1_VL2MTU_ATTR(15);
395
396static struct attribute *vl2mtu_default_attributes[] = {
397 &hfi1_vl2mtu_attr_0.attr,
398 &hfi1_vl2mtu_attr_1.attr,
399 &hfi1_vl2mtu_attr_2.attr,
400 &hfi1_vl2mtu_attr_3.attr,
401 &hfi1_vl2mtu_attr_4.attr,
402 &hfi1_vl2mtu_attr_5.attr,
403 &hfi1_vl2mtu_attr_6.attr,
404 &hfi1_vl2mtu_attr_7.attr,
405 &hfi1_vl2mtu_attr_8.attr,
406 &hfi1_vl2mtu_attr_9.attr,
407 &hfi1_vl2mtu_attr_10.attr,
408 &hfi1_vl2mtu_attr_11.attr,
409 &hfi1_vl2mtu_attr_12.attr,
410 &hfi1_vl2mtu_attr_13.attr,
411 &hfi1_vl2mtu_attr_14.attr,
412 &hfi1_vl2mtu_attr_15.attr,
413 NULL
414};
415
416static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
417 char *buf)
418{
419 struct hfi1_vl2mtu_attr *vlattr =
420 container_of(attr, struct hfi1_vl2mtu_attr, attr);
421 struct hfi1_pportdata *ppd =
422 container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
423 struct hfi1_devdata *dd = ppd->dd;
424
425 return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
426}
427
428static const struct sysfs_ops hfi1_vl2mtu_ops = {
429 .show = vl2mtu_attr_show,
430};
431
432static struct kobj_type hfi1_vl2mtu_ktype = {
433 .release = port_release,
434 .sysfs_ops = &hfi1_vl2mtu_ops,
435 .default_attrs = vl2mtu_default_attributes
436};
437
438
439/* end of per-port file structures and support code */
440
441/*
442 * Start of per-unit (or driver, in some cases, but replicated
443 * per unit) functions (these get a device *)
444 */
445static ssize_t show_rev(struct device *device, struct device_attribute *attr,
446 char *buf)
447{
448 struct hfi1_ibdev *dev =
449 container_of(device, struct hfi1_ibdev, ibdev.dev);
450
451 return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
452}
453
454static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
455 char *buf)
456{
457 struct hfi1_ibdev *dev =
458 container_of(device, struct hfi1_ibdev, ibdev.dev);
459 struct hfi1_devdata *dd = dd_from_dev(dev);
460 int ret;
461
462 if (!dd->boardname)
463 ret = -EINVAL;
464 else
465 ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
466 return ret;
467}
468
469static ssize_t show_boardversion(struct device *device,
470 struct device_attribute *attr, char *buf)
471{
472 struct hfi1_ibdev *dev =
473 container_of(device, struct hfi1_ibdev, ibdev.dev);
474 struct hfi1_devdata *dd = dd_from_dev(dev);
475
476 /* The string printed here is already newline-terminated. */
477 return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
478}
479
480
481static ssize_t show_nctxts(struct device *device,
482 struct device_attribute *attr, char *buf)
483{
484 struct hfi1_ibdev *dev =
485 container_of(device, struct hfi1_ibdev, ibdev.dev);
486 struct hfi1_devdata *dd = dd_from_dev(dev);
487
488 /*
489 * Return the smaller of send and receive contexts.
490 * Normally, user level applications would require both a send
491 * and a receive context, so returning the smaller of the two counts
492 * gives a more accurate picture of the total contexts available.
493 */
494 return scnprintf(buf, PAGE_SIZE, "%u\n",
495 min(dd->num_rcv_contexts - dd->first_user_ctxt,
496 (u32)dd->sc_sizes[SC_USER].count));
497}
498
499static ssize_t show_nfreectxts(struct device *device,
500 struct device_attribute *attr, char *buf)
501{
502 struct hfi1_ibdev *dev =
503 container_of(device, struct hfi1_ibdev, ibdev.dev);
504 struct hfi1_devdata *dd = dd_from_dev(dev);
505
506 /* Return the number of free user ports (contexts) available. */
507 return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
508}
509
510static ssize_t show_serial(struct device *device,
511 struct device_attribute *attr, char *buf)
512{
513 struct hfi1_ibdev *dev =
514 container_of(device, struct hfi1_ibdev, ibdev.dev);
515 struct hfi1_devdata *dd = dd_from_dev(dev);
516
517 return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
518
519}
520
521static ssize_t store_chip_reset(struct device *device,
522 struct device_attribute *attr, const char *buf,
523 size_t count)
524{
525 struct hfi1_ibdev *dev =
526 container_of(device, struct hfi1_ibdev, ibdev.dev);
527 struct hfi1_devdata *dd = dd_from_dev(dev);
528 int ret;
529
530 if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
531 ret = -EINVAL;
532 goto bail;
533 }
534
535 ret = hfi1_reset_device(dd->unit);
536bail:
537 return ret < 0 ? ret : count;
538}
539
540/*
541 * Convert the reported temperature from an integer (reported in
542 * units of 0.25C) to a decimal string with two fractional digits.
543 */
544#define temp2str(temp, buf, size, idx) \
545 scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \
546 ((temp) >> 2), ((temp) & 0x3) * 25)
547
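
A worked example (illustrative): a raw reading of 0x66 (102) gives
102 >> 2 = 25 whole degrees and (102 & 0x3) * 25 = 50 hundredths, so
temp2str() emits "25.50".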
548/*
549 * Dump tempsense values, in decimal, to ease parsing by shell scripts.
550 */
551static ssize_t show_tempsense(struct device *device,
552 struct device_attribute *attr, char *buf)
553{
554 struct hfi1_ibdev *dev =
555 container_of(device, struct hfi1_ibdev, ibdev.dev);
556 struct hfi1_devdata *dd = dd_from_dev(dev);
557 struct hfi1_temp temp;
558 int ret = -ENXIO;
559
560 ret = hfi1_tempsense_rd(dd, &temp);
561 if (!ret) {
562 int idx = 0;
563
564 idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
565 idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
566 idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
567 idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
568 idx += scnprintf(buf + idx, PAGE_SIZE - idx,
569 "%u %u %u\n", temp.triggers & 0x1,
570 temp.triggers & 0x2, temp.triggers & 0x4);
571 ret = idx;
572 }
573 return ret;
574}
575
576/*
577 * end of per-unit (or driver, in some cases, but replicated
578 * per unit) functions
579 */
580
581/* start of per-unit file structures and support code */
582static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
583static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
584static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
585static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
586static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
587static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
588static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
589static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
590
591static struct device_attribute *hfi1_attributes[] = {
592 &dev_attr_hw_rev,
593 &dev_attr_board_id,
594 &dev_attr_nctxts,
595 &dev_attr_nfreectxts,
596 &dev_attr_serial,
597 &dev_attr_boardversion,
598 &dev_attr_tempsense,
599 &dev_attr_chip_reset,
600};
601
602int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
603 struct kobject *kobj)
604{
605 struct hfi1_pportdata *ppd;
606 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
607 int ret;
608
609 if (!port_num || port_num > dd->num_pports) {
610 dd_dev_err(dd,
611 "Skipping infiniband class with invalid port %u\n",
612 port_num);
613 return -ENODEV;
614 }
615 ppd = &dd->pport[port_num - 1];
616
617 ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
618 "sc2vl");
619 if (ret) {
620 dd_dev_err(dd,
621 "Skipping sc2vl sysfs info, (err %d) port %u\n",
622 ret, port_num);
623 goto bail;
624 }
625 kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
626
627 ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
628 "sl2sc");
629 if (ret) {
630 dd_dev_err(dd,
631 "Skipping sl2sc sysfs info, (err %d) port %u\n",
632 ret, port_num);
633 goto bail_sc2vl;
634 }
635 kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
636
637 ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
638 "vl2mtu");
639 if (ret) {
640 dd_dev_err(dd,
641 "Skipping vl2mtu sysfs info, (err %d) port %u\n",
642 ret, port_num);
643 goto bail_sl2sc;
644 }
645 kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
646
647
648 ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
649 kobj, "CCMgtA");
650 if (ret) {
651 dd_dev_err(dd,
652 "Skipping Congestion Control sysfs info, (err %d) port %u\n",
653 ret, port_num);
654 goto bail_vl2mtu;
655 }
656
657 kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
658
659 ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
660 &cc_setting_bin_attr);
661 if (ret) {
662 dd_dev_err(dd,
663 "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
664 ret, port_num);
665 goto bail_cc;
666 }
667
668 ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
669 &cc_table_bin_attr);
670 if (ret) {
671 dd_dev_err(dd,
672 "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
673 ret, port_num);
674 goto bail_cc_entry_bin;
675 }
676
677 dd_dev_info(dd,
678 "IB%u: Congestion Control Agent enabled for port %d\n",
679 dd->unit, port_num);
680
681 return 0;
682
683bail_cc_entry_bin:
684 sysfs_remove_bin_file(&ppd->pport_cc_kobj,
685 &cc_setting_bin_attr);
686bail_cc:
687 kobject_put(&ppd->pport_cc_kobj);
688bail_vl2mtu:
689 kobject_put(&ppd->vl2mtu_kobj);
690bail_sl2sc:
691 kobject_put(&ppd->sl2sc_kobj);
692bail_sc2vl:
693 kobject_put(&ppd->sc2vl_kobj);
694bail:
695 return ret;
696}
697
698/*
699 * Register and create our files in /sys/class/infiniband.
700 */
701int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
702{
703 struct ib_device *dev = &dd->verbs_dev.ibdev;
704 int i, ret;
705
706 for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
707 ret = device_create_file(&dev->dev, hfi1_attributes[i]);
708 if (ret)
709 goto bail;
710 }
711
712 return 0;
713bail:
714 for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
715 device_remove_file(&dev->dev, hfi1_attributes[i]);
716 return ret;
717}
718
719/*
720 * Unregister and remove our files in /sys/class/infiniband.
721 */
722void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
723{
724 struct hfi1_pportdata *ppd;
725 int i;
726
727 for (i = 0; i < dd->num_pports; i++) {
728 ppd = &dd->pport[i];
729
730 sysfs_remove_bin_file(&ppd->pport_cc_kobj,
731 &cc_setting_bin_attr);
732 sysfs_remove_bin_file(&ppd->pport_cc_kobj,
733 &cc_table_bin_attr);
734 kobject_put(&ppd->pport_cc_kobj);
735 kobject_put(&ppd->vl2mtu_kobj);
736 kobject_put(&ppd->sl2sc_kobj);
737 kobject_put(&ppd->sc2vl_kobj);
738 }
739}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
new file mode 100644
index 000000000000..70ad7b9fc1ce
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.c
@@ -0,0 +1,221 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#define CREATE_TRACE_POINTS
51#include "trace.h"
52
53u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
54{
55 struct hfi1_other_headers *ohdr;
56 u8 opcode;
57 u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
58
59 if (lnh == HFI1_LRH_BTH)
60 ohdr = &hdr->u.oth;
61 else
62 ohdr = &hdr->u.l.oth;
63 opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
64 return hdr_len_by_opcode[opcode] == 0 ?
65 0 : hdr_len_by_opcode[opcode] - (12 + 8);
66}
67
68#define IMM_PRN "imm %d"
69#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
70#define AETH_PRN "aeth syn 0x%.2x msn 0x%.8x"
71#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
72#define ATOMICACKETH_PRN "origdata %lld"
73#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
74
75#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
76
77static u64 ib_u64_get(__be32 *p)
78{
79 return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
80}
81
82const char *parse_everbs_hdrs(
83 struct trace_seq *p,
84 u8 opcode,
85 void *ehdrs)
86{
87 union ib_ehdrs *eh = ehdrs;
88 const char *ret = trace_seq_buffer_ptr(p);
89
90 switch (opcode) {
91 /* imm */
92 case OP(RC, SEND_LAST_WITH_IMMEDIATE):
93 case OP(UC, SEND_LAST_WITH_IMMEDIATE):
94 case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
95 case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
96 case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
97 case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
98 trace_seq_printf(p, IMM_PRN,
99 be32_to_cpu(eh->imm_data));
100 break;
101 /* reth + imm */
102 case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
103 case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
104 trace_seq_printf(p, RETH_PRN " " IMM_PRN,
105 (unsigned long long)ib_u64_get(
106 (__be32 *)&eh->rc.reth.vaddr),
107 be32_to_cpu(eh->rc.reth.rkey),
108 be32_to_cpu(eh->rc.reth.length),
109 be32_to_cpu(eh->rc.imm_data));
110 break;
111 /* reth */
112 case OP(RC, RDMA_READ_REQUEST):
113 case OP(RC, RDMA_WRITE_FIRST):
114 case OP(UC, RDMA_WRITE_FIRST):
115 case OP(RC, RDMA_WRITE_ONLY):
116 case OP(UC, RDMA_WRITE_ONLY):
117 trace_seq_printf(p, RETH_PRN,
118 (unsigned long long)ib_u64_get(
119 (__be32 *)&eh->rc.reth.vaddr),
120 be32_to_cpu(eh->rc.reth.rkey),
121 be32_to_cpu(eh->rc.reth.length));
122 break;
123 case OP(RC, RDMA_READ_RESPONSE_FIRST):
124 case OP(RC, RDMA_READ_RESPONSE_LAST):
125 case OP(RC, RDMA_READ_RESPONSE_ONLY):
126 case OP(RC, ACKNOWLEDGE):
127 trace_seq_printf(p, AETH_PRN,
128 be32_to_cpu(eh->aeth) >> 24,
129 be32_to_cpu(eh->aeth) & HFI1_QPN_MASK);
130 break;
131 /* aeth + atomicacketh */
132 case OP(RC, ATOMIC_ACKNOWLEDGE):
133 trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
134 (be32_to_cpu(eh->at.aeth) >> 24) & 0xff,
135 be32_to_cpu(eh->at.aeth) & HFI1_QPN_MASK,
136 (unsigned long long)ib_u64_get(eh->at.atomic_ack_eth));
137 break;
138 /* atomiceth */
139 case OP(RC, COMPARE_SWAP):
140 case OP(RC, FETCH_ADD):
141 trace_seq_printf(p, ATOMICETH_PRN,
142 (unsigned long long)ib_u64_get(eh->atomic_eth.vaddr),
143 eh->atomic_eth.rkey,
144 (unsigned long long)ib_u64_get(
145 (__be32 *)&eh->atomic_eth.swap_data),
146 (unsigned long long) ib_u64_get(
147 (__be32 *)&eh->atomic_eth.compare_data));
148 break;
149 /* deth */
150 case OP(UD, SEND_ONLY):
151 case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
152 trace_seq_printf(p, DETH_PRN,
153 be32_to_cpu(eh->ud.deth[0]),
154 be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK);
155 break;
156 }
157 trace_seq_putc(p, 0);
158 return ret;
159}
160
161const char *parse_sdma_flags(
162 struct trace_seq *p,
163 u64 desc0, u64 desc1)
164{
165 const char *ret = trace_seq_buffer_ptr(p);
166 char flags[5] = { 'x', 'x', 'x', 'x', 0 };
167
168 flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
169 flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-';
170 flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
171 flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
172 trace_seq_printf(p, "%s", flags);
173 if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
174 trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
175 (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT)
176 & SDMA_DESC1_HEADER_MODE_MASK),
177 (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT)
178 & SDMA_DESC1_HEADER_INDEX_MASK),
179 (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT)
180 & SDMA_DESC1_HEADER_DWS_MASK));
181 return ret;
182}
183
184const char *print_u32_array(
185 struct trace_seq *p,
186 u32 *arr, int len)
187{
188 int i;
189 const char *ret = trace_seq_buffer_ptr(p);
190
191 for (i = 0; i < len ; i++)
192 trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
193 trace_seq_putc(p, 0);
194 return ret;
195}
196
197const char *print_u64_array(
198 struct trace_seq *p,
199 u64 *arr, int len)
200{
201 int i;
202 const char *ret = trace_seq_buffer_ptr(p);
203
204 for (i = 0; i < len; i++)
205 trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
206 trace_seq_putc(p, 0);
207 return ret;
208}
209
210__hfi1_trace_fn(PKT);
211__hfi1_trace_fn(PROC);
212__hfi1_trace_fn(SDMA);
213__hfi1_trace_fn(LINKVERB);
214__hfi1_trace_fn(DEBUG);
215__hfi1_trace_fn(SNOOP);
216__hfi1_trace_fn(CNTR);
217__hfi1_trace_fn(PIO);
218__hfi1_trace_fn(DC8051);
219__hfi1_trace_fn(FIRMWARE);
220__hfi1_trace_fn(RCVCTRL);
221__hfi1_trace_fn(TID);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
new file mode 100644
index 000000000000..d7851c0a0171
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.h
@@ -0,0 +1,1409 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#undef TRACE_SYSTEM_VAR
51#define TRACE_SYSTEM_VAR hfi1
52
53#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
54#define __HFI1_TRACE_H
55
56#include <linux/tracepoint.h>
57#include <linux/trace_seq.h>
58
59#include "hfi.h"
60#include "mad.h"
61#include "sdma.h"
62
63#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev))
64#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev))
65
66#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
67#define show_packettype(etype) \
68__print_symbolic(etype, \
69 packettype_name(EXPECTED), \
70 packettype_name(EAGER), \
71 packettype_name(IB), \
72 packettype_name(ERROR), \
73 packettype_name(BYPASS))
74
75#undef TRACE_SYSTEM
76#define TRACE_SYSTEM hfi1_rx
77
78TRACE_EVENT(hfi1_rcvhdr,
79 TP_PROTO(struct hfi1_devdata *dd,
80 u64 eflags,
81 u32 ctxt,
82 u32 etype,
83 u32 hlen,
84 u32 tlen,
85 u32 updegr,
86 u32 etail),
87 TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
88 TP_STRUCT__entry(
89 DD_DEV_ENTRY(dd)
90 __field(u64, eflags)
91 __field(u32, ctxt)
92 __field(u32, etype)
93 __field(u32, hlen)
94 __field(u32, tlen)
95 __field(u32, updegr)
96 __field(u32, etail)
97 ),
98 TP_fast_assign(
99 DD_DEV_ASSIGN(dd);
100 __entry->eflags = eflags;
101 __entry->ctxt = ctxt;
102 __entry->etype = etype;
103 __entry->hlen = hlen;
104 __entry->tlen = tlen;
105 __entry->updegr = updegr;
106 __entry->etail = etail;
107 ),
108 TP_printk(
109"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
110 __get_str(dev),
111 __entry->ctxt,
112 __entry->eflags,
113 __entry->etype, show_packettype(__entry->etype),
114 __entry->hlen,
115 __entry->tlen,
116 __entry->updegr,
117 __entry->etail
118 )
119);
120
121TRACE_EVENT(hfi1_receive_interrupt,
122 TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
123 TP_ARGS(dd, ctxt),
124 TP_STRUCT__entry(
125 DD_DEV_ENTRY(dd)
126 __field(u32, ctxt)
127 __field(u8, slow_path)
128 __field(u8, dma_rtail)
129 ),
130 TP_fast_assign(
131 DD_DEV_ASSIGN(dd);
132 __entry->ctxt = ctxt;
133 if (dd->rcd[ctxt]->do_interrupt ==
134 &handle_receive_interrupt) {
135 __entry->slow_path = 1;
136 __entry->dma_rtail = 0xFF;
137 } else if (dd->rcd[ctxt]->do_interrupt ==
138 &handle_receive_interrupt_dma_rtail){
139 __entry->dma_rtail = 1;
140 __entry->slow_path = 0;
141 } else if (dd->rcd[ctxt]->do_interrupt ==
142 &handle_receive_interrupt_nodma_rtail) {
143 __entry->dma_rtail = 0;
144 __entry->slow_path = 0;
145 }
146 ),
147 TP_printk(
148 "[%s] ctxt %d SlowPath: %d DmaRtail: %d",
149 __get_str(dev),
150 __entry->ctxt,
151 __entry->slow_path,
152 __entry->dma_rtail
153 )
154);
155
156const char *print_u64_array(struct trace_seq *, u64 *, int);
157
158TRACE_EVENT(hfi1_exp_tid_map,
159 TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
160 unsigned long *maps, u16 count),
161 TP_ARGS(ctxt, subctxt, dir, maps, count),
162 TP_STRUCT__entry(
163 __field(unsigned, ctxt)
164 __field(u16, subctxt)
165 __field(int, dir)
166 __field(u16, count)
167 __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
168 ),
169 TP_fast_assign(
170 __entry->ctxt = ctxt;
171 __entry->subctxt = subctxt;
172 __entry->dir = dir;
173 __entry->count = count;
174 memcpy(__get_dynamic_array(maps), maps,
175 sizeof(*maps) * count);
176 ),
177 TP_printk("[%3u:%02u] %s tidmaps %s",
178 __entry->ctxt,
179 __entry->subctxt,
180 (__entry->dir ? ">" : "<"),
181 print_u64_array(p, __get_dynamic_array(maps),
182 __entry->count)
183 )
184 );
185
186TRACE_EVENT(hfi1_exp_rcv_set,
187 TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
188 unsigned long vaddr, u64 phys_addr, void *page),
189 TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
190 TP_STRUCT__entry(
191 __field(unsigned, ctxt)
192 __field(u16, subctxt)
193 __field(u32, tid)
194 __field(unsigned long, vaddr)
195 __field(u64, phys_addr)
196 __field(void *, page)
197 ),
198 TP_fast_assign(
199 __entry->ctxt = ctxt;
200 __entry->subctxt = subctxt;
201 __entry->tid = tid;
202 __entry->vaddr = vaddr;
203 __entry->phys_addr = phys_addr;
204 __entry->page = page;
205 ),
206 TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
207 __entry->ctxt,
208 __entry->subctxt,
209 __entry->tid,
210 __entry->vaddr,
211 __entry->phys_addr,
212 __entry->page
213 )
214 );
215
216TRACE_EVENT(hfi1_exp_rcv_free,
217 TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
218 unsigned long phys, void *page),
219 TP_ARGS(ctxt, subctxt, tid, phys, page),
220 TP_STRUCT__entry(
221 __field(unsigned, ctxt)
222 __field(u16, subctxt)
223 __field(u32, tid)
224 __field(unsigned long, phys)
225 __field(void *, page)
226 ),
227 TP_fast_assign(
228 __entry->ctxt = ctxt;
229 __entry->subctxt = subctxt;
230 __entry->tid = tid;
231 __entry->phys = phys;
232 __entry->page = page;
233 ),
234 TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
235 __entry->ctxt,
236 __entry->subctxt,
237 __entry->tid,
238 __entry->phys,
239 __entry->page
240 )
241 );
242#undef TRACE_SYSTEM
243#define TRACE_SYSTEM hfi1_tx
244
245TRACE_EVENT(hfi1_piofree,
246 TP_PROTO(struct send_context *sc, int extra),
247 TP_ARGS(sc, extra),
248 TP_STRUCT__entry(
249 DD_DEV_ENTRY(sc->dd)
250 __field(u32, sw_index)
251 __field(u32, hw_context)
252 __field(int, extra)
253 ),
254 TP_fast_assign(
255 DD_DEV_ASSIGN(sc->dd);
256 __entry->sw_index = sc->sw_index;
257 __entry->hw_context = sc->hw_context;
258 __entry->extra = extra;
259 ),
260 TP_printk(
261 "[%s] ctxt %u(%u) extra %d",
262 __get_str(dev),
263 __entry->sw_index,
264 __entry->hw_context,
265 __entry->extra
266 )
267);
268
269TRACE_EVENT(hfi1_wantpiointr,
270 TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
271 TP_ARGS(sc, needint, credit_ctrl),
272 TP_STRUCT__entry(
273 DD_DEV_ENTRY(sc->dd)
274 __field(u32, sw_index)
275 __field(u32, hw_context)
276 __field(u32, needint)
277 __field(u64, credit_ctrl)
278 ),
279 TP_fast_assign(
280 DD_DEV_ASSIGN(sc->dd);
281 __entry->sw_index = sc->sw_index;
282 __entry->hw_context = sc->hw_context;
283 __entry->needint = needint;
284 __entry->credit_ctrl = credit_ctrl;
285 ),
286 TP_printk(
287 "[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
288 __get_str(dev),
289 __entry->sw_index,
290 __entry->hw_context,
291 __entry->needint,
292 (unsigned long long)__entry->credit_ctrl
293 )
294);
295
296DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
297 TP_PROTO(struct hfi1_qp *qp, u32 flags),
298 TP_ARGS(qp, flags),
299 TP_STRUCT__entry(
300 DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
301 __field(u32, qpn)
302 __field(u32, flags)
303 __field(u32, s_flags)
304 ),
305 TP_fast_assign(
306 DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
307 __entry->flags = flags;
308 __entry->qpn = qp->ibqp.qp_num;
309 __entry->s_flags = qp->s_flags;
310 ),
311 TP_printk(
312 "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
313 __get_str(dev),
314 __entry->qpn,
315 __entry->flags,
316 __entry->s_flags
317 )
318);
319
320DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
321 TP_PROTO(struct hfi1_qp *qp, u32 flags),
322 TP_ARGS(qp, flags));
323
324DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
325 TP_PROTO(struct hfi1_qp *qp, u32 flags),
326 TP_ARGS(qp, flags));
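
As a usage note, each DEFINE_EVENT() above generates a trace_<name>()
stub, so a call site elsewhere in the driver emits the event roughly as
(the flag argument shown is hypothetical):

	trace_hfi1_qpwakeup(qp, flag);	/* 'flag' = the s_flags bit being acted on */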
327
328#undef TRACE_SYSTEM
329#define TRACE_SYSTEM hfi1_qphash
330DECLARE_EVENT_CLASS(hfi1_qphash_template,
331 TP_PROTO(struct hfi1_qp *qp, u32 bucket),
332 TP_ARGS(qp, bucket),
333 TP_STRUCT__entry(
334 DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
335 __field(u32, qpn)
336 __field(u32, bucket)
337 ),
338 TP_fast_assign(
339 DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
340 __entry->qpn = qp->ibqp.qp_num;
341 __entry->bucket = bucket;
342 ),
343 TP_printk(
344 "[%s] qpn 0x%x bucket %u",
345 __get_str(dev),
346 __entry->qpn,
347 __entry->bucket
348 )
349);
350
351DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert,
352 TP_PROTO(struct hfi1_qp *qp, u32 bucket),
353 TP_ARGS(qp, bucket));
354
355DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove,
356 TP_PROTO(struct hfi1_qp *qp, u32 bucket),
357 TP_ARGS(qp, bucket));
358
359#undef TRACE_SYSTEM
360#define TRACE_SYSTEM hfi1_ibhdrs
361
362u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
363const char *parse_everbs_hdrs(
364 struct trace_seq *p,
365 u8 opcode,
366 void *ehdrs);
367
368#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
369
370const char *parse_sdma_flags(
371 struct trace_seq *p,
372 u64 desc0, u64 desc1);
373
374#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
375
376
377#define lrh_name(lrh) { HFI1_##lrh, #lrh }
378#define show_lnh(lrh) \
379__print_symbolic(lrh, \
380 lrh_name(LRH_BTH), \
381 lrh_name(LRH_GRH))
382
383#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode }
384#define show_ib_opcode(opcode) \
385__print_symbolic(opcode, \
386 ib_opcode_name(RC_SEND_FIRST), \
387 ib_opcode_name(RC_SEND_MIDDLE), \
388 ib_opcode_name(RC_SEND_LAST), \
389 ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \
390 ib_opcode_name(RC_SEND_ONLY), \
391 ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \
392 ib_opcode_name(RC_RDMA_WRITE_FIRST), \
393 ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \
394 ib_opcode_name(RC_RDMA_WRITE_LAST), \
395 ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
396 ib_opcode_name(RC_RDMA_WRITE_ONLY), \
397 ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
398 ib_opcode_name(RC_RDMA_READ_REQUEST), \
399 ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \
400 ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \
401 ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \
402 ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \
403 ib_opcode_name(RC_ACKNOWLEDGE), \
404 ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
405 ib_opcode_name(RC_COMPARE_SWAP), \
406 ib_opcode_name(RC_FETCH_ADD), \
407 ib_opcode_name(UC_SEND_FIRST), \
408 ib_opcode_name(UC_SEND_MIDDLE), \
409 ib_opcode_name(UC_SEND_LAST), \
410 ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \
411 ib_opcode_name(UC_SEND_ONLY), \
412 ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \
413 ib_opcode_name(UC_RDMA_WRITE_FIRST), \
414 ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \
415 ib_opcode_name(UC_RDMA_WRITE_LAST), \
416 ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
417 ib_opcode_name(UC_RDMA_WRITE_ONLY), \
418 ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
419 ib_opcode_name(UD_SEND_ONLY), \
420 ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE))
421
422
423#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
424#define BTH_PRN \
425 "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
426 "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
427#define EHDR_PRN "%s"
428
429DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
430 TP_PROTO(struct hfi1_devdata *dd,
431 struct hfi1_ib_header *hdr),
432 TP_ARGS(dd, hdr),
433 TP_STRUCT__entry(
434 DD_DEV_ENTRY(dd)
435 /* LRH */
436 __field(u8, vl)
437 __field(u8, lver)
438 __field(u8, sl)
439 __field(u8, lnh)
440 __field(u16, dlid)
441 __field(u16, len)
442 __field(u16, slid)
443 /* BTH */
444 __field(u8, opcode)
445 __field(u8, se)
446 __field(u8, m)
447 __field(u8, pad)
448 __field(u8, tver)
449 __field(u16, pkey)
450 __field(u8, f)
451 __field(u8, b)
452 __field(u32, qpn)
453 __field(u8, a)
454 __field(u32, psn)
455 /* extended headers */
456 __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
457 ),
458 TP_fast_assign(
459 struct hfi1_other_headers *ohdr;
460
461 DD_DEV_ASSIGN(dd);
462 /* LRH */
463 __entry->vl =
464 (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
465 __entry->lver =
466 (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
467 __entry->sl =
468 (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
469 __entry->lnh =
470 (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
471 __entry->dlid =
472 be16_to_cpu(hdr->lrh[1]);
473 /* allow for larger len */
474 __entry->len =
475 be16_to_cpu(hdr->lrh[2]);
476 __entry->slid =
477 be16_to_cpu(hdr->lrh[3]);
478 /* BTH */
479 if (__entry->lnh == HFI1_LRH_BTH)
480 ohdr = &hdr->u.oth;
481 else
482 ohdr = &hdr->u.l.oth;
483 __entry->opcode =
484 (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
485 __entry->se =
486 (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
487 __entry->m =
488 (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
489 __entry->pad =
490 (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
491 __entry->tver =
492 (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
493 __entry->pkey =
494 be32_to_cpu(ohdr->bth[0]) & 0xffff;
495 __entry->f =
496 (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT)
497 & HFI1_FECN_MASK;
498 __entry->b =
499 (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT)
500 & HFI1_BECN_MASK;
501 __entry->qpn =
502 be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
503 __entry->a =
504 (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
505 /* allow for larger PSN */
506 __entry->psn =
507 be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
508 /* extended headers */
509 memcpy(
510 __get_dynamic_array(ehdrs),
511 &ohdr->u,
512 ibhdr_exhdr_len(hdr));
513 ),
514 TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
515 __get_str(dev),
516 /* LRH */
517 __entry->vl,
518 __entry->lver,
519 __entry->sl,
520 __entry->lnh, show_lnh(__entry->lnh),
521 __entry->dlid,
522 __entry->len,
523 __entry->slid,
524 /* BTH */
525 __entry->opcode, show_ib_opcode(__entry->opcode),
526 __entry->se,
527 __entry->m,
528 __entry->pad,
529 __entry->tver,
530 __entry->pkey,
531 __entry->f,
532 __entry->b,
533 __entry->qpn,
534 __entry->a,
535 __entry->psn,
536 /* extended headers */
537 __parse_ib_ehdrs(
538 __entry->opcode,
539 (void *)__get_dynamic_array(ehdrs))
540 )
541);
542
543DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
544 TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
545 TP_ARGS(dd, hdr));
546
547DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr,
548 TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
549 TP_ARGS(dd, hdr));
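/*
 * The two events above are emitted via the trace_*() hooks that
 * DEFINE_EVENT() generates from the hfi1_ibhdr_template class, e.g.
 * (illustrative only -- the actual call sites live elsewhere in the
 * driver's receive and send paths):
 *
 *	trace_input_ibhdr(dd, hdr);
 *	trace_output_ibhdr(dd, hdr);
 */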
550
551#define SNOOP_PRN \
552 "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
553 "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
554
555#undef TRACE_SYSTEM
556#define TRACE_SYSTEM hfi1_snoop
557
558
559TRACE_EVENT(snoop_capture,
560 TP_PROTO(struct hfi1_devdata *dd,
561 int hdr_len,
562 struct hfi1_ib_header *hdr,
563 int data_len,
564 void *data),
565 TP_ARGS(dd, hdr_len, hdr, data_len, data),
566 TP_STRUCT__entry(
567 DD_DEV_ENTRY(dd)
568 __field(u16, slid)
569 __field(u16, dlid)
570 __field(u32, qpn)
571 __field(u8, opcode)
572 __field(u8, sl)
573 __field(u16, pkey)
574 __field(u32, hdr_len)
575 __field(u32, data_len)
576 __field(u8, lnh)
577 __dynamic_array(u8, raw_hdr, hdr_len)
578 __dynamic_array(u8, raw_pkt, data_len)
579 ),
580 TP_fast_assign(
581 struct hfi1_other_headers *ohdr;
582
583 __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
584 if (__entry->lnh == HFI1_LRH_BTH)
585 ohdr = &hdr->u.oth;
586 else
587 ohdr = &hdr->u.l.oth;
588 DD_DEV_ASSIGN(dd);
589 __entry->slid = be16_to_cpu(hdr->lrh[3]);
590 __entry->dlid = be16_to_cpu(hdr->lrh[1]);
591 __entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
592 __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
593 __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
594 __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
595 __entry->hdr_len = hdr_len;
596 __entry->data_len = data_len;
597 memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
598 memcpy(__get_dynamic_array(raw_pkt), data, data_len);
599 ),
600 TP_printk("[%s] " SNOOP_PRN,
601 __get_str(dev),
602 __entry->slid,
603 __entry->dlid,
604 __entry->qpn,
605 __entry->opcode,
606 show_ib_opcode(__entry->opcode),
607 __entry->sl,
608 __entry->pkey,
609 __entry->hdr_len,
610 __entry->data_len
611 )
612);
613
614#undef TRACE_SYSTEM
615#define TRACE_SYSTEM hfi1_ctxts
616
617#define UCTXT_FMT \
618 "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \
619 "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
620TRACE_EVENT(hfi1_uctxtdata,
621 TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
622 TP_ARGS(dd, uctxt),
623 TP_STRUCT__entry(
624 DD_DEV_ENTRY(dd)
625 __field(unsigned, ctxt)
626 __field(u32, credits)
627 __field(u64, hw_free)
628 __field(u64, piobase)
629 __field(u16, rcvhdrq_cnt)
630 __field(u64, rcvhdrq_phys)
631 __field(u32, eager_cnt)
632 __field(u64, rcvegr_phys)
633 ),
634 TP_fast_assign(
635 DD_DEV_ASSIGN(dd);
636 __entry->ctxt = uctxt->ctxt;
637 __entry->credits = uctxt->sc->credits;
638 __entry->hw_free = (u64)uctxt->sc->hw_free;
639 __entry->piobase = (u64)uctxt->sc->base_addr;
640 __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
641 __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
642 __entry->eager_cnt = uctxt->egrbufs.alloced;
643 __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys;
644 ),
645 TP_printk(
646 "[%s] ctxt %u " UCTXT_FMT,
647 __get_str(dev),
648 __entry->ctxt,
649 __entry->credits,
650 __entry->hw_free,
651 __entry->piobase,
652 __entry->rcvhdrq_cnt,
653 __entry->rcvhdrq_phys,
654 __entry->eager_cnt,
655 __entry->rcvegr_phys
656 )
657 );
658
659#define CINFO_FMT \
660 "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
661TRACE_EVENT(hfi1_ctxt_info,
662 TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
663 struct hfi1_ctxt_info cinfo),
664 TP_ARGS(dd, ctxt, subctxt, cinfo),
665 TP_STRUCT__entry(
666 DD_DEV_ENTRY(dd)
667 __field(unsigned, ctxt)
668 __field(unsigned, subctxt)
669 __field(u16, egrtids)
670 __field(u16, rcvhdrq_cnt)
671 __field(u16, rcvhdrq_size)
672 __field(u16, sdma_ring_size)
673 __field(u32, rcvegr_size)
674 ),
675 TP_fast_assign(
676 DD_DEV_ASSIGN(dd);
677 __entry->ctxt = ctxt;
678 __entry->subctxt = subctxt;
679 __entry->egrtids = cinfo.egrtids;
680 __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
681 __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
682 __entry->sdma_ring_size = cinfo.sdma_ring_size;
683 __entry->rcvegr_size = cinfo.rcvegr_size;
684 ),
685 TP_printk(
686 "[%s] ctxt %u:%u " CINFO_FMT,
687 __get_str(dev),
688 __entry->ctxt,
689 __entry->subctxt,
690 __entry->egrtids,
691 __entry->rcvegr_size,
692 __entry->rcvhdrq_cnt,
693 __entry->rcvhdrq_size,
694 __entry->sdma_ring_size
695 )
696 );
697
698#undef TRACE_SYSTEM
699#define TRACE_SYSTEM hfi1_sma
700
701#define BCT_FORMAT \
702 "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
703
704#define BCT(field) \
705 be16_to_cpu( \
706 ((struct buffer_control *)__get_dynamic_array(bct))->field \
707 )
708
709DECLARE_EVENT_CLASS(hfi1_bct_template,
710 TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
711 TP_ARGS(dd, bc),
712 TP_STRUCT__entry(
713 DD_DEV_ENTRY(dd)
714 __dynamic_array(u8, bct, sizeof(*bc))
715 ),
716 TP_fast_assign(
717 DD_DEV_ASSIGN(dd);
718 memcpy(
719 __get_dynamic_array(bct),
720 bc,
721 sizeof(*bc));
722 ),
723 TP_printk(BCT_FORMAT,
724 BCT(overall_shared_limit),
725
726 BCT(vl[0].dedicated),
727 BCT(vl[0].shared),
728
729 BCT(vl[1].dedicated),
730 BCT(vl[1].shared),
731
732 BCT(vl[2].dedicated),
733 BCT(vl[2].shared),
734
735 BCT(vl[3].dedicated),
736 BCT(vl[3].shared),
737
738 BCT(vl[4].dedicated),
739 BCT(vl[4].shared),
740
741 BCT(vl[5].dedicated),
742 BCT(vl[5].shared),
743
744 BCT(vl[6].dedicated),
745 BCT(vl[6].shared),
746
747 BCT(vl[7].dedicated),
748 BCT(vl[7].shared),
749
750 BCT(vl[15].dedicated),
751 BCT(vl[15].shared)
752 )
753);
754
755
756DEFINE_EVENT(hfi1_bct_template, bct_set,
757 TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
758 TP_ARGS(dd, bc));
759
760DEFINE_EVENT(hfi1_bct_template, bct_get,
761 TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
762 TP_ARGS(dd, bc));
763
764#undef TRACE_SYSTEM
765#define TRACE_SYSTEM hfi1_sdma
766
767TRACE_EVENT(hfi1_sdma_descriptor,
768 TP_PROTO(
769 struct sdma_engine *sde,
770 u64 desc0,
771 u64 desc1,
772 u16 e,
773 void *descp),
774 TP_ARGS(sde, desc0, desc1, e, descp),
775 TP_STRUCT__entry(
776 DD_DEV_ENTRY(sde->dd)
777 __field(void *, descp)
778 __field(u64, desc0)
779 __field(u64, desc1)
780 __field(u16, e)
781 __field(u8, idx)
782 ),
783 TP_fast_assign(
784 DD_DEV_ASSIGN(sde->dd);
785 __entry->desc0 = desc0;
786 __entry->desc1 = desc1;
787 __entry->idx = sde->this_idx;
788 __entry->descp = descp;
789 __entry->e = e;
790 ),
791 TP_printk(
792 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
793 __get_str(dev),
794 __entry->idx,
795 __parse_sdma_flags(__entry->desc0, __entry->desc1),
796 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT)
797 & SDMA_DESC0_PHY_ADDR_MASK,
798 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT)
799 & SDMA_DESC1_GENERATION_MASK),
800 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT)
801 & SDMA_DESC0_BYTE_COUNT_MASK),
802 __entry->desc0,
803 __entry->desc1,
804 __entry->descp,
805 __entry->e
806 )
807);
808
809TRACE_EVENT(hfi1_sdma_engine_select,
810 TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
811 TP_ARGS(dd, sel, vl, idx),
812 TP_STRUCT__entry(
813 DD_DEV_ENTRY(dd)
814 __field(u32, sel)
815 __field(u8, vl)
816 __field(u8, idx)
817 ),
818 TP_fast_assign(
819 DD_DEV_ASSIGN(dd);
820 __entry->sel = sel;
821 __entry->vl = vl;
822 __entry->idx = idx;
823 ),
824 TP_printk(
825 "[%s] selecting SDE %u sel 0x%x vl %u",
826 __get_str(dev),
827 __entry->idx,
828 __entry->sel,
829 __entry->vl
830 )
831);
832
833DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
834 TP_PROTO(
835 struct sdma_engine *sde,
836 u64 status
837 ),
838 TP_ARGS(sde, status),
839 TP_STRUCT__entry(
840 DD_DEV_ENTRY(sde->dd)
841 __field(u64, status)
842 __field(u8, idx)
843 ),
844 TP_fast_assign(
845 DD_DEV_ASSIGN(sde->dd);
846 __entry->status = status;
847 __entry->idx = sde->this_idx;
848 ),
849 TP_printk(
850 "[%s] SDE(%u) status %llx",
851 __get_str(dev),
852 __entry->idx,
853 (unsigned long long)__entry->status
854 )
855);
856
857DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
858 TP_PROTO(
859 struct sdma_engine *sde,
860 u64 status
861 ),
862 TP_ARGS(sde, status)
863);
864
865DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
866 TP_PROTO(
867 struct sdma_engine *sde,
868 u64 status
869 ),
870 TP_ARGS(sde, status)
871);
872
873DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
874 TP_PROTO(
875 struct sdma_engine *sde,
876 int aidx
877 ),
878 TP_ARGS(sde, aidx),
879 TP_STRUCT__entry(
880 DD_DEV_ENTRY(sde->dd)
881 __field(int, aidx)
882 __field(u8, idx)
883 ),
884 TP_fast_assign(
885 DD_DEV_ASSIGN(sde->dd);
886 __entry->idx = sde->this_idx;
887 __entry->aidx = aidx;
888 ),
889 TP_printk(
890 "[%s] SDE(%u) aidx %d",
891 __get_str(dev),
892 __entry->idx,
893 __entry->aidx
894 )
895);
896
897DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
898 TP_PROTO(
899 struct sdma_engine *sde,
900 int aidx
901 ),
902 TP_ARGS(sde, aidx));
903
904DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
905 TP_PROTO(
906 struct sdma_engine *sde,
907 int aidx
908 ),
909 TP_ARGS(sde, aidx));
910
911#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
912TRACE_EVENT(hfi1_sdma_progress,
913 TP_PROTO(
914 struct sdma_engine *sde,
915 u16 hwhead,
916 u16 swhead,
917 struct sdma_txreq *txp
918 ),
919 TP_ARGS(sde, hwhead, swhead, txp),
920 TP_STRUCT__entry(
921 DD_DEV_ENTRY(sde->dd)
922 __field(u64, sn)
923 __field(u16, hwhead)
924 __field(u16, swhead)
925 __field(u16, txnext)
926 __field(u16, tx_tail)
927 __field(u16, tx_head)
928 __field(u8, idx)
929 ),
930 TP_fast_assign(
931 DD_DEV_ASSIGN(sde->dd);
932 __entry->hwhead = hwhead;
933 __entry->swhead = swhead;
934 __entry->tx_tail = sde->tx_tail;
935 __entry->tx_head = sde->tx_head;
936 __entry->txnext = txp ? txp->next_descq_idx : ~0;
937 __entry->idx = sde->this_idx;
938 __entry->sn = txp ? txp->sn : ~0;
939 ),
940 TP_printk(
941 "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
942 __get_str(dev),
943 __entry->idx,
944 __entry->sn,
945 __entry->hwhead,
946 __entry->swhead,
947 __entry->txnext,
948 __entry->tx_head,
949 __entry->tx_tail
950 )
951);
952#else
953TRACE_EVENT(hfi1_sdma_progress,
954 TP_PROTO(
955 struct sdma_engine *sde,
956 u16 hwhead,
957 u16 swhead,
958 struct sdma_txreq *txp
959 ),
960 TP_ARGS(sde, hwhead, swhead, txp),
961 TP_STRUCT__entry(
962 DD_DEV_ENTRY(sde->dd)
963 __field(u16, hwhead)
964 __field(u16, swhead)
965 __field(u16, txnext)
966 __field(u16, tx_tail)
967 __field(u16, tx_head)
968 __field(u8, idx)
969 ),
970 TP_fast_assign(
971 DD_DEV_ASSIGN(sde->dd);
972 __entry->hwhead = hwhead;
973 __entry->swhead = swhead;
974 __entry->tx_tail = sde->tx_tail;
975 __entry->tx_head = sde->tx_head;
976 __entry->txnext = txp ? txp->next_descq_idx : ~0;
977 __entry->idx = sde->this_idx;
978 ),
979 TP_printk(
980 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
981 __get_str(dev),
982 __entry->idx,
983 __entry->hwhead,
984 __entry->swhead,
985 __entry->txnext,
986 __entry->tx_head,
987 __entry->tx_tail
988 )
989);
990#endif
991
992DECLARE_EVENT_CLASS(hfi1_sdma_sn,
993 TP_PROTO(
994 struct sdma_engine *sde,
995 u64 sn
996 ),
997 TP_ARGS(sde, sn),
998 TP_STRUCT__entry(
999 DD_DEV_ENTRY(sde->dd)
1000 __field(u64, sn)
1001 __field(u8, idx)
1002 ),
1003 TP_fast_assign(
1004 DD_DEV_ASSIGN(sde->dd);
1005 __entry->sn = sn;
1006 __entry->idx = sde->this_idx;
1007 ),
1008 TP_printk(
1009 "[%s] SDE(%u) sn %llu",
1010 __get_str(dev),
1011 __entry->idx,
1012 __entry->sn
1013 )
1014);
1015
1016DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
1017 TP_PROTO(
1018 struct sdma_engine *sde,
1019 u64 sn
1020 ),
1021 TP_ARGS(sde, sn)
1022);
1023
1024DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
1025 TP_PROTO(
1026 struct sdma_engine *sde,
1027 u64 sn
1028 ),
1029 TP_ARGS(sde, sn)
1030);
1031
1032#define USDMA_HDR_FORMAT \
1033 "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
1034
1035TRACE_EVENT(hfi1_sdma_user_header,
1036 TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
1037 struct hfi1_pkt_header *hdr, u32 tidval),
1038 TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
1039 TP_STRUCT__entry(
1040 DD_DEV_ENTRY(dd)
1041 __field(u16, ctxt)
1042 __field(u8, subctxt)
1043 __field(u16, req)
1044 __field(__le32, pbc0)
1045 __field(__le32, pbc1)
1046 __field(__be32, lrh0)
1047 __field(__be32, lrh1)
1048 __field(__be32, bth0)
1049 __field(__be32, bth1)
1050 __field(__be32, bth2)
1051 __field(__le32, kdeth0)
1052 __field(__le32, kdeth1)
1053 __field(__le32, kdeth2)
1054 __field(__le32, kdeth3)
1055 __field(__le32, kdeth4)
1056 __field(__le32, kdeth5)
1057 __field(__le32, kdeth6)
1058 __field(__le32, kdeth7)
1059 __field(__le32, kdeth8)
1060 __field(u32, tidval)
1061 ),
1062 TP_fast_assign(
1063 __le32 *pbc = (__le32 *)hdr->pbc;
1064 __be32 *lrh = (__be32 *)hdr->lrh;
1065 __be32 *bth = (__be32 *)hdr->bth;
1066 __le32 *kdeth = (__le32 *)&hdr->kdeth;
1067
1068 DD_DEV_ASSIGN(dd);
1069 __entry->ctxt = ctxt;
1070 __entry->subctxt = subctxt;
1071 __entry->req = req;
1072 __entry->pbc0 = pbc[0];
1073 __entry->pbc1 = pbc[1];
1074 __entry->lrh0 = be32_to_cpu(lrh[0]);
1075 __entry->lrh1 = be32_to_cpu(lrh[1]);
1076 __entry->bth0 = be32_to_cpu(bth[0]);
1077 __entry->bth1 = be32_to_cpu(bth[1]);
1078 __entry->bth2 = be32_to_cpu(bth[2]);
1079 __entry->kdeth0 = kdeth[0];
1080 __entry->kdeth1 = kdeth[1];
1081 __entry->kdeth2 = kdeth[2];
1082 __entry->kdeth3 = kdeth[3];
1083 __entry->kdeth4 = kdeth[4];
1084 __entry->kdeth5 = kdeth[5];
1085 __entry->kdeth6 = kdeth[6];
1086 __entry->kdeth7 = kdeth[7];
1087 __entry->kdeth8 = kdeth[8];
1088 __entry->tidval = tidval;
1089 ),
1090 TP_printk(USDMA_HDR_FORMAT,
1091 __get_str(dev),
1092 __entry->ctxt,
1093 __entry->subctxt,
1094 __entry->req,
1095 __entry->pbc1,
1096 __entry->pbc0,
1097 __entry->lrh0,
1098 __entry->lrh1,
1099 __entry->bth0,
1100 __entry->bth1,
1101 __entry->bth2,
1102 __entry->kdeth0,
1103 __entry->kdeth1,
1104 __entry->kdeth2,
1105 __entry->kdeth3,
1106 __entry->kdeth4,
1107 __entry->kdeth5,
1108 __entry->kdeth6,
1109 __entry->kdeth7,
1110 __entry->kdeth8,
1111 __entry->tidval
1112 )
1113 );
1114
1115#define SDMA_UREQ_FMT \
1116 "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
1117TRACE_EVENT(hfi1_sdma_user_reqinfo,
1118 TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
1119 TP_ARGS(dd, ctxt, subctxt, i),
1120 TP_STRUCT__entry(
1121 DD_DEV_ENTRY(dd);
1122 __field(u16, ctxt)
1123 __field(u8, subctxt)
1124 __field(u8, ver_opcode)
1125 __field(u8, iovcnt)
1126 __field(u16, npkts)
1127 __field(u16, fragsize)
1128 __field(u16, comp_idx)
1129 ),
1130 TP_fast_assign(
1131 DD_DEV_ASSIGN(dd);
1132 __entry->ctxt = ctxt;
1133 __entry->subctxt = subctxt;
1134 __entry->ver_opcode = i[0] & 0xff;
1135 __entry->iovcnt = (i[0] >> 8) & 0xff;
1136 __entry->npkts = i[1];
1137 __entry->fragsize = i[2];
1138 __entry->comp_idx = i[3];
1139 ),
1140 TP_printk(SDMA_UREQ_FMT,
1141 __get_str(dev),
1142 __entry->ctxt,
1143 __entry->subctxt,
1144 __entry->ver_opcode,
1145 __entry->iovcnt,
1146 __entry->npkts,
1147 __entry->fragsize,
1148 __entry->comp_idx
1149 )
1150 );
1151
1152#define usdma_complete_name(st) { st, #st }
1153#define show_usdma_complete_state(st) \
1154 __print_symbolic(st, \
1155 usdma_complete_name(FREE), \
1156 usdma_complete_name(QUEUED), \
1157 usdma_complete_name(COMPLETE), \
1158 usdma_complete_name(ERROR))
1159
1160TRACE_EVENT(hfi1_sdma_user_completion,
1161 TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
1162 u8 state, int code),
1163 TP_ARGS(dd, ctxt, subctxt, idx, state, code),
1164 TP_STRUCT__entry(
1165 DD_DEV_ENTRY(dd)
1166 __field(u16, ctxt)
1167 __field(u8, subctxt)
1168 __field(u16, idx)
1169 __field(u8, state)
1170 __field(int, code)
1171 ),
1172 TP_fast_assign(
1173 DD_DEV_ASSIGN(dd);
1174 __entry->ctxt = ctxt;
1175 __entry->subctxt = subctxt;
1176 __entry->idx = idx;
1177 __entry->state = state;
1178 __entry->code = code;
1179 ),
1180 TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
1181 __get_str(dev), __entry->ctxt, __entry->subctxt,
1182 __entry->idx, show_usdma_complete_state(__entry->state),
1183 __entry->code)
1184 );
1185
1186const char *print_u32_array(struct trace_seq *, u32 *, int);
1187#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
1188
1189TRACE_EVENT(hfi1_sdma_user_header_ahg,
1190 TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
1191 u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
1192 TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
1193 TP_STRUCT__entry(
1194 DD_DEV_ENTRY(dd)
1195 __field(u16, ctxt)
1196 __field(u8, subctxt)
1197 __field(u16, req)
1198 __field(u8, sde)
1199 __field(u8, idx)
1200 __field(int, len)
1201 __field(u32, tidval)
1202 __array(u32, ahg, 10)
1203 ),
1204 TP_fast_assign(
1205 DD_DEV_ASSIGN(dd);
1206 __entry->ctxt = ctxt;
1207 __entry->subctxt = subctxt;
1208 __entry->req = req;
1209 __entry->sde = sde;
1210 __entry->idx = ahgidx;
1211 __entry->len = len;
1212 __entry->tidval = tidval;
1213 memcpy(__entry->ahg, ahg, len * sizeof(u32));
1214 ),
1215 TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
1216 __get_str(dev),
1217 __entry->ctxt,
1218 __entry->subctxt,
1219 __entry->req,
1220 __entry->sde,
1221 __entry->idx,
1222 __entry->len - 1,
1223 __print_u32_hex(__entry->ahg, __entry->len),
1224 __entry->tidval
1225 )
1226 );
1227
1228TRACE_EVENT(hfi1_sdma_state,
1229 TP_PROTO(
1230 struct sdma_engine *sde,
1231 const char *cstate,
1232 const char *nstate
1233 ),
1234 TP_ARGS(sde, cstate, nstate),
1235 TP_STRUCT__entry(
1236 DD_DEV_ENTRY(sde->dd)
1237 __string(curstate, cstate)
1238 __string(newstate, nstate)
1239 ),
1240 TP_fast_assign(
1241 DD_DEV_ASSIGN(sde->dd);
1242 __assign_str(curstate, cstate);
1243 __assign_str(newstate, nstate);
1244 ),
1245 TP_printk("[%s] current state %s new state %s",
1246 __get_str(dev),
1247 __get_str(curstate),
1248 __get_str(newstate)
1249 )
1250);
1251
1252#undef TRACE_SYSTEM
1253#define TRACE_SYSTEM hfi1_rc
1254
1255DECLARE_EVENT_CLASS(hfi1_sdma_rc,
1256 TP_PROTO(struct hfi1_qp *qp, u32 psn),
1257 TP_ARGS(qp, psn),
1258 TP_STRUCT__entry(
1259 DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
1260 __field(u32, qpn)
1261 __field(u32, flags)
1262 __field(u32, psn)
1263 __field(u32, sending_psn)
1264 __field(u32, sending_hpsn)
1265 ),
1266 TP_fast_assign(
1267 DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
1268 __entry->qpn = qp->ibqp.qp_num;
1269 __entry->flags = qp->s_flags;
1270 __entry->psn = psn;
1271 __entry->sending_psn = qp->s_sending_psn;
1272 __entry->sending_hpsn = qp->s_sending_hpsn;
1273 ),
1274 TP_printk(
1275 "[%s] qpn 0x%x flags 0x%x psn 0x%x sending_psn 0x%x sending_hpsn 0x%x",
1276 __get_str(dev),
1277 __entry->qpn,
1278 __entry->flags,
1279 __entry->psn,
1280 __entry->sending_psn,
1281                      __entry->sending_hpsn
1282 )
1283);
1284
1285DEFINE_EVENT(hfi1_sdma_rc, hfi1_rc_sendcomplete,
1286 TP_PROTO(struct hfi1_qp *qp, u32 psn),
1287 TP_ARGS(qp, psn)
1288);
1289
1290#undef TRACE_SYSTEM
1291#define TRACE_SYSTEM hfi1_misc
1292
1293TRACE_EVENT(hfi1_interrupt,
1294 TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
1295 int src),
1296 TP_ARGS(dd, is_entry, src),
1297 TP_STRUCT__entry(
1298 DD_DEV_ENTRY(dd)
1299 __array(char, buf, 64)
1300 __field(int, src)
1301 ),
1302 TP_fast_assign(
1303 DD_DEV_ASSIGN(dd)
1304 is_entry->is_name(__entry->buf, 64, src - is_entry->start);
1305 __entry->src = src;
1306 ),
1307 TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
1308 __entry->src)
1309);
1310
1311/*
1312 * Note:
1313 * This produces a REALLY ugly trace in the console output when the string is
1314 * too long.
1315 */
1316
1317#undef TRACE_SYSTEM
1318#define TRACE_SYSTEM hfi1_trace
1319
1320#define MAX_MSG_LEN 512
1321
1322DECLARE_EVENT_CLASS(hfi1_trace_template,
1323 TP_PROTO(const char *function, struct va_format *vaf),
1324 TP_ARGS(function, vaf),
1325 TP_STRUCT__entry(
1326 __string(function, function)
1327 __dynamic_array(char, msg, MAX_MSG_LEN)
1328 ),
1329 TP_fast_assign(
1330 __assign_str(function, function);
1331 WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
1332 MAX_MSG_LEN, vaf->fmt,
1333 *vaf->va) >= MAX_MSG_LEN);
1334 ),
1335 TP_printk("(%s) %s",
1336 __get_str(function),
1337 __get_str(msg))
1338);
1339
1340/*
1341 * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
1342 * actual function to work and cannot be in a macro.
1343 */
1344#define __hfi1_trace_def(lvl) \
1345void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
1346 \
1347DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \
1348 TP_PROTO(const char *function, struct va_format *vaf), \
1349 TP_ARGS(function, vaf))
1350
1351#define __hfi1_trace_fn(lvl) \
1352void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \
1353{ \
1354 struct va_format vaf = { \
1355 .fmt = fmt, \
1356 }; \
1357 va_list args; \
1358 \
1359 va_start(args, fmt); \
1360 vaf.va = &args; \
1361 trace_hfi1_ ##lvl(func, &vaf); \
1362 va_end(args); \
1363 return; \
1364}
1365
1366/*
1367 * To create a new trace level, simply define it below and as a __hfi1_trace_fn
1368 * in trace.c. This will create all the hooks for calling
1369 * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
1370 * the debugfs stuff.
1371 */
1372__hfi1_trace_def(PKT);
1373__hfi1_trace_def(PROC);
1374__hfi1_trace_def(SDMA);
1375__hfi1_trace_def(LINKVERB);
1376__hfi1_trace_def(DEBUG);
1377__hfi1_trace_def(SNOOP);
1378__hfi1_trace_def(CNTR);
1379__hfi1_trace_def(PIO);
1380__hfi1_trace_def(DC8051);
1381__hfi1_trace_def(FIRMWARE);
1382__hfi1_trace_def(RCVCTRL);
1383__hfi1_trace_def(TID);
1384
1385#define hfi1_cdbg(which, fmt, ...) \
1386 __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
1387
1388#define hfi1_dbg(fmt, ...) \
1389 hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
1390
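/*
 * Illustrative use of the leveled trace hooks above (the values shown
 * are hypothetical, not taken from this commit):
 *
 *	hfi1_cdbg(SDMA, "engine %u stalled at head %u", idx, head);
 *	hfi1_dbg("context %u allocated", ctxt);
 *
 * hfi1_cdbg(SDMA, ...) expands to __hfi1_trace_SDMA(__func__, ...),
 * which in turn feeds the hfi1_SDMA trace event declared through
 * __hfi1_trace_def(SDMA) above.
 */
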
1391/*
1392 * Define HFI1_EARLY_DBG at compile time or here to enable early trace
1393 * messages. Do not check in an enablement for this.
1394 */
1395
1396#ifdef HFI1_EARLY_DBG
1397#define hfi1_dbg_early(fmt, ...) \
1398 trace_printk(fmt, ##__VA_ARGS__)
1399#else
1400#define hfi1_dbg_early(fmt, ...)
1401#endif
1402
1403#endif /* __HFI1_TRACE_H */
1404
1405#undef TRACE_INCLUDE_PATH
1406#undef TRACE_INCLUDE_FILE
1407#define TRACE_INCLUDE_PATH .
1408#define TRACE_INCLUDE_FILE trace
1409#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
new file mode 100644
index 000000000000..ea54fd2700ad
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.c
@@ -0,0 +1,518 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/delay.h>
52#include <linux/pci.h>
53#include <linux/vmalloc.h>
54
55#include "hfi.h"
56#include "twsi.h"
57
58/*
59 * "Two Wire Serial Interface" support.
60 *
61 * Originally written for a not-quite-i2c serial eeprom, which is
62 * still used on some supported boards. Later boards have added a
63 * variety of other uses, most board-specific, so the bit-boffing
64 * part has been split off to this file, while the other parts
65 * have been moved to chip-specific files.
66 *
67 * We have also dropped all pretense of fully generic (e.g. pretend
68 * we don't know whether '1' is the higher voltage) interface, as
69 * the restrictions of the generic i2c interface (e.g. no access from
70 * driver itself) make it unsuitable for this use.
71 */
72
73#define READ_CMD 1
74#define WRITE_CMD 0
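/*
 * The low bit of the TWSI address byte selects the transfer direction:
 * a real I2C device code is used as-is ("dev | READ_CMD" or
 * "dev | WRITE_CMD"), while the legacy HFI1_TWSI_NO_DEV part takes the
 * offset shifted up by one ("(addr << 1) | READ_CMD"), as done in
 * hfi1_twsi_blk_rd()/hfi1_twsi_blk_wr() below.
 */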
75
76/**
77 * i2c_wait_for_writes - wait for a write
78 * @dd: the hfi1_ib device
79 *
80 * We use this instead of udelay directly, so we can make sure
81 * that previous register writes have been flushed all the way
82 * to the chip. Since we are delaying anyway, the cost doesn't
83 * hurt, and it makes the bit twiddling more regular.
84 */
85static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
86{
87 /*
88 * implicit read of EXTStatus is as good as explicit
89 * read of scratch, if all we want to do is flush
90 * writes.
91 */
92 hfi1_gpio_mod(dd, target, 0, 0, 0);
93 rmb(); /* inlined, so prevent compiler reordering */
94}
95
96/*
97 * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
98 * for "almost compliant" modules
99 */
100#define SCL_WAIT_USEC 1000
101
102/* BUF_WAIT is the time the bus must be free between a STOP or ACK and the
103 * next START. Should be 20, but some chips need more.
104 */
105#define TWSI_BUF_WAIT_USEC 60
106
107static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
108{
109 u32 mask;
110
111 udelay(1);
112
113 mask = QSFP_HFI0_I2CCLK;
114
115	/* SCL is meant to be open-drain, so never set "OUT", just DIR */
116 hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
117
118 /*
119	 * Allow for slow slaves with a simple delay on the
120	 * falling edge, sampling on the rise.
121 */
122 if (!bit)
123 udelay(2);
124 else {
125 int rise_usec;
126
127 for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
128 if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
129 break;
130 udelay(2);
131 }
132 if (rise_usec <= 0)
133 dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
134 SCL_WAIT_USEC);
135 }
136 i2c_wait_for_writes(dd, target);
137}
138
139static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
140{
141 u32 mask;
142
143 mask = QSFP_HFI0_I2CDAT;
144
145	/* SDA is meant to be open-drain, so never set "OUT", just DIR */
146 hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
147
148 i2c_wait_for_writes(dd, target);
149 udelay(2);
150}
151
152static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
153{
154 u32 read_val, mask;
155
156 mask = QSFP_HFI0_I2CDAT;
157	/* SDA is meant to be open-drain, so never set "OUT", just DIR */
158 hfi1_gpio_mod(dd, target, 0, 0, mask);
159 read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
160 if (wait)
161 i2c_wait_for_writes(dd, target);
162 return (read_val & mask) >> GPIO_SDA_NUM;
163}
164
165/**
166 * i2c_ackrcv - see if ack following write is true
167 * @dd: the hfi1_ib device
168 */
169static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
170{
171 u8 ack_received;
172
173 /* AT ENTRY SCL = LOW */
174 /* change direction, ignore data */
175 ack_received = sda_in(dd, target, 1);
176 scl_out(dd, target, 1);
177 ack_received = sda_in(dd, target, 1) == 0;
178 scl_out(dd, target, 0);
179 return ack_received;
180}
181
182static void stop_cmd(struct hfi1_devdata *dd, u32 target);
183
184/**
185 * rd_byte - read a byte, sending STOP on last, else ACK
186 * @dd: the hfi1_ib device
187 *
188 * Returns byte shifted out of device
189 */
190static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
191{
192 int bit_cntr, data;
193
194 data = 0;
195
196 for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
197 data <<= 1;
198 scl_out(dd, target, 1);
199 data |= sda_in(dd, target, 0);
200 scl_out(dd, target, 0);
201 }
202 if (last) {
203 scl_out(dd, target, 1);
204 stop_cmd(dd, target);
205 } else {
206 sda_out(dd, target, 0);
207 scl_out(dd, target, 1);
208 scl_out(dd, target, 0);
209 sda_out(dd, target, 1);
210 }
211 return data;
212}
213
214/**
215 * wr_byte - write a byte, one bit at a time
216 * @dd: the hfi1_ib device
217 * @data: the byte to write
218 *
219 * Returns 0 if we got the following ack, otherwise 1
220 */
221static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
222{
223 int bit_cntr;
224 u8 bit;
225
226 for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
227 bit = (data >> bit_cntr) & 1;
228 sda_out(dd, target, bit);
229 scl_out(dd, target, 1);
230 scl_out(dd, target, 0);
231 }
232 return (!i2c_ackrcv(dd, target)) ? 1 : 0;
233}
234
235/*
236 * issue TWSI start sequence:
237 * (both clock/data high, clock high, data low while clock is high)
238 */
239static void start_seq(struct hfi1_devdata *dd, u32 target)
240{
241 sda_out(dd, target, 1);
242 scl_out(dd, target, 1);
243 sda_out(dd, target, 0);
244 udelay(1);
245 scl_out(dd, target, 0);
246}
247
248/**
249 * stop_seq - transmit the stop sequence
250 * @dd: the hfi1_ib device
251 *
252 * (both clock/data low, clock high, data high while clock is high)
253 */
254static void stop_seq(struct hfi1_devdata *dd, u32 target)
255{
256 scl_out(dd, target, 0);
257 sda_out(dd, target, 0);
258 scl_out(dd, target, 1);
259 sda_out(dd, target, 1);
260}
261
262/**
263 * stop_cmd - transmit the stop condition
264 * @dd: the hfi1_ib device
265 *
266 * (both clock/data low, clock high, data high while clock is high)
267 */
268static void stop_cmd(struct hfi1_devdata *dd, u32 target)
269{
270 stop_seq(dd, target);
271 udelay(TWSI_BUF_WAIT_USEC);
272}
273
274/**
275 * hfi1_twsi_reset - reset I2C communication
276 * @dd: the hfi1_ib device
277 */
278
279int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
280{
281 int clock_cycles_left = 9;
282 int was_high = 0;
283 u32 pins, mask;
284
285 /* Both SCL and SDA should be high. If not, there
286 * is something wrong.
287 */
288 mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
289
290 /*
291	 * Force pins to the desired innocuous state.
292	 * This is the default power-on state with out=0 and dir=0,
293	 * so the pins are tri-stated and should float high (barring HW problems).
294 */
295 hfi1_gpio_mod(dd, target, 0, 0, mask);
296
297 /*
298 * Clock nine times to get all listeners into a sane state.
299 * If SDA does not go high at any point, we are wedged.
300 * One vendor recommends then issuing START followed by STOP.
301	 * We cannot use our "normal" functions to do that, because
302 * if SCL drops between them, another vendor's part will
303 * wedge, dropping SDA and keeping it low forever, at the end of
304 * the next transaction (even if it was not the device addressed).
305 * So our START and STOP take place with SCL held high.
306 */
307 while (clock_cycles_left--) {
308 scl_out(dd, target, 0);
309 scl_out(dd, target, 1);
310 /* Note if SDA is high, but keep clocking to sync slave */
311 was_high |= sda_in(dd, target, 0);
312 }
313
314 if (was_high) {
315 /*
316 * We saw a high, which we hope means the slave is sync'd.
317 * Issue START, STOP, pause for T_BUF.
318 */
319
320 pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
321 if ((pins & mask) != mask)
322 dd_dev_err(dd, "GPIO pins not at rest: %d\n",
323 pins & mask);
324 /* Drop SDA to issue START */
325 udelay(1); /* Guarantee .6 uSec setup */
326 sda_out(dd, target, 0);
327 udelay(1); /* Guarantee .6 uSec hold */
328 /* At this point, SCL is high, SDA low. Raise SDA for STOP */
329 sda_out(dd, target, 1);
330 udelay(TWSI_BUF_WAIT_USEC);
331 }
332
333 return !was_high;
334}
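
/*
 * Minimal caller sketch (an assumed usage pattern, not taken from this
 * file; per twsi.h the qsfp_lock must already be held):
 *
 *	if (hfi1_twsi_reset(dd, target))
 *		dd_dev_err(dd, "TWSI bus recovery failed, SDA stuck low\n");
 *
 * A non-zero return means SDA was never seen high during the nine
 * recovery clocks above, i.e. the bus is still wedged.
 */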
335
336#define HFI1_TWSI_START 0x100
337#define HFI1_TWSI_STOP 0x200
338
339/* Write byte to TWSI, optionally prefixed with START or suffixed with
340 * STOP.
341 * Returns 0 if OK (ACK received), else != 0.
342 */
343static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
344{
345 int ret = 1;
346
347 if (flags & HFI1_TWSI_START)
348 start_seq(dd, target);
349
350 /* Leaves SCL low (from i2c_ackrcv()) */
351 ret = wr_byte(dd, target, data);
352
353 if (flags & HFI1_TWSI_STOP)
354 stop_cmd(dd, target);
355 return ret;
356}
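
/*
 * twsi_wr() lets callers compose a transaction from flags: pass
 * HFI1_TWSI_START to begin with a START condition, HFI1_TWSI_STOP to
 * finish with a STOP, or both for a one-byte transaction, e.g.
 * (illustrative only):
 *
 *	ret = twsi_wr(dd, target, dev | WRITE_CMD,
 *		      HFI1_TWSI_START | HFI1_TWSI_STOP);
 *
 * The block helpers below pass START alone and issue STOP explicitly
 * through stop_cmd().
 */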
357
358/* Added functionality for IBA7220-based cards */
359#define HFI1_TEMP_DEV 0x98
360
361/*
362 * hfi1_twsi_blk_rd
363 * General interface for data transfer from twsi devices.
364 * One vestige of its former role is that it recognizes a device
365 * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
366 * which responded to all TWSI device codes, interpreting them as an
367 * address within the device. On all other devices found on boards handled
368 * by this driver, the device code is followed by a one-byte "address" which
369 * selects the "register" or "offset" within the device from which data
370 * should be read.
371 */
372int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
373 void *buffer, int len)
374{
375 int ret;
376 u8 *bp = buffer;
377
378 ret = 1;
379
380 if (dev == HFI1_TWSI_NO_DEV) {
381 /* legacy not-really-I2C */
382 addr = (addr << 1) | READ_CMD;
383 ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
384 } else {
385 /* Actual I2C */
386 ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START);
387 if (ret) {
388 stop_cmd(dd, target);
389 ret = 1;
390 goto bail;
391 }
392 /*
393 * SFF spec claims we do _not_ stop after the addr
394 * but simply issue a start with the "read" dev-addr.
395 * Since we are implicitly waiting for ACK here,
396 * we need t_buf (nominally 20uSec) before that start,
397		 * and cannot rely on the delay built into the STOP.
398 */
399 ret = twsi_wr(dd, target, addr, 0);
400 udelay(TWSI_BUF_WAIT_USEC);
401
402 if (ret) {
403 dd_dev_err(dd,
404 "Failed to write interface read addr %02X\n",
405 addr);
406 ret = 1;
407 goto bail;
408 }
409 ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
410 }
411 if (ret) {
412 stop_cmd(dd, target);
413 ret = 1;
414 goto bail;
415 }
416
417 /*
418	 * Block devices keep clocking data out as long as we ack,
419 * automatically incrementing the address. Some have "pages"
420 * whose boundaries will not be crossed, but the handling
421 * of these is left to the caller, who is in a better
422 * position to know.
423 */
424 while (len-- > 0) {
425 /*
426 * Get and store data, sending ACK if length remaining,
427 * else STOP
428 */
429 *bp++ = rd_byte(dd, target, !len);
430 }
431
432 ret = 0;
433
434bail:
435 return ret;
436}
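
/*
 * Minimal read sketch under assumed values (the device code, offset and
 * length below are hypothetical; callers must hold qsfp_lock per
 * twsi.h):
 *
 *	u8 buf[16];
 *
 *	if (hfi1_twsi_blk_rd(dd, target, 0xA0, 0, buf, sizeof(buf)))
 *		dd_dev_err(dd, "TWSI block read failed\n");
 *
 * On success the buffer holds sizeof(buf) bytes starting at the given
 * "register"/"offset" within the device, read with the auto-increment
 * behavior described above.
 */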
437
438/*
439 * hfi1_twsi_blk_wr
440 * General interface for data transfer to twsi devices.
441 * One vestige of its former role is that it recognizes a device
442 * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
443 * which responded to all TWSI device codes, interpreting them as an
444 * address within the device. On all other devices found on boards handled
445 * by this driver, the device code is followed by a one-byte "address" which
446 * selects the "register" or "offset" within the device to which data
447 * should be written.
448 */
449int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
450 const void *buffer, int len)
451{
452 int sub_len;
453 const u8 *bp = buffer;
454 int max_wait_time, i;
455 int ret = 1;
456
457 while (len > 0) {
458 if (dev == HFI1_TWSI_NO_DEV) {
459 if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
460 HFI1_TWSI_START)) {
461 goto failed_write;
462 }
463 } else {
464 /* Real I2C */
465 if (twsi_wr(dd, target,
466 dev | WRITE_CMD, HFI1_TWSI_START))
467 goto failed_write;
468 ret = twsi_wr(dd, target, addr, 0);
469 if (ret) {
470 dd_dev_err(dd,
471 "Failed to write interface write addr %02X\n",
472 addr);
473 goto failed_write;
474 }
475 }
476
477 sub_len = min(len, 4);
478 addr += sub_len;
479 len -= sub_len;
480
481 for (i = 0; i < sub_len; i++)
482 if (twsi_wr(dd, target, *bp++, 0))
483 goto failed_write;
484
485 stop_cmd(dd, target);
486
487 /*
488		 * Wait for the write to complete by waiting for a successful
489		 * read (the chip replies with a zero after the write
490		 * cmd completes, and before it writes to the eeprom).
491 * The startcmd for the read will fail the ack until
492 * the writes have completed. We do this inline to avoid
493 * the debug prints that are in the real read routine
494 * if the startcmd fails.
495 * We also use the proper device address, so it doesn't matter
496 * whether we have real eeprom_dev. Legacy likes any address.
497 */
498 max_wait_time = 100;
499 while (twsi_wr(dd, target,
500 dev | READ_CMD, HFI1_TWSI_START)) {
501 stop_cmd(dd, target);
502 if (!--max_wait_time)
503 goto failed_write;
504 }
505 /* now read (and ignore) the resulting byte */
506 rd_byte(dd, target, 1);
507 }
508
509 ret = 0;
510 goto bail;
511
512failed_write:
513 stop_cmd(dd, target);
514 ret = 1;
515
516bail:
517 return ret;
518}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
new file mode 100644
index 000000000000..5907e029613d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.h
@@ -0,0 +1,68 @@
1#ifndef _TWSI_H
2#define _TWSI_H
3/*
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2015 Intel Corporation.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * BSD LICENSE
22 *
23 * Copyright(c) 2015 Intel Corporation.
24 *
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
27 * are met:
28 *
29 * - Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * - Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in
33 * the documentation and/or other materials provided with the
34 * distribution.
35 * - Neither the name of Intel Corporation nor the names of its
36 * contributors may be used to endorse or promote products derived
37 * from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
43 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
47 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
48 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
49 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 *
51 */
52
53#define HFI1_TWSI_NO_DEV 0xFF
54
55struct hfi1_devdata;
56
57/* Bit position of SDA pin in ASIC_QSFP* registers */
58#define GPIO_SDA_NUM 1
59
60/* these functions must be called with qsfp_lock held */
61int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
62int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
63 void *buffer, int len);
64int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
65 const void *buffer, int len);
66
67
68#endif /* _TWSI_H */
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
new file mode 100644
index 000000000000..b536f397737c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/uc.c
@@ -0,0 +1,585 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include "hfi.h"
52#include "sdma.h"
53#include "qp.h"
54
55/* cut down ridiculously long IB macro names */
56#define OP(x) IB_OPCODE_UC_##x
57
58/**
59 * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
60 * @qp: a pointer to the QP
61 *
62 * Return 1 if constructed; otherwise, return 0.
63 */
64int hfi1_make_uc_req(struct hfi1_qp *qp)
65{
66 struct hfi1_other_headers *ohdr;
67 struct hfi1_swqe *wqe;
68 unsigned long flags;
69 u32 hwords = 5;
70 u32 bth0 = 0;
71 u32 len;
72 u32 pmtu = qp->pmtu;
73 int ret = 0;
74 int middle = 0;
75
76 spin_lock_irqsave(&qp->s_lock, flags);
77
78 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
79 if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
80 goto bail;
81 /* We are in the error state, flush the work request. */
82 if (qp->s_last == qp->s_head)
83 goto bail;
84 /* If DMAs are in progress, we can't flush immediately. */
85 if (atomic_read(&qp->s_iowait.sdma_busy)) {
86 qp->s_flags |= HFI1_S_WAIT_DMA;
87 goto bail;
88 }
89 clear_ahg(qp);
90 wqe = get_swqe_ptr(qp, qp->s_last);
91 hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
92 goto done;
93 }
94
95 ohdr = &qp->s_hdr->ibh.u.oth;
96 if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
97 ohdr = &qp->s_hdr->ibh.u.l.oth;
98
99 /* Get the next send request. */
100 wqe = get_swqe_ptr(qp, qp->s_cur);
101 qp->s_wqe = NULL;
102 switch (qp->s_state) {
103 default:
104 if (!(ib_hfi1_state_ops[qp->state] &
105 HFI1_PROCESS_NEXT_SEND_OK))
106 goto bail;
107 /* Check if send work queue is empty. */
108 if (qp->s_cur == qp->s_head) {
109 clear_ahg(qp);
110 goto bail;
111 }
112 /*
113 * Start a new request.
114 */
115 wqe->psn = qp->s_next_psn;
116 qp->s_psn = qp->s_next_psn;
117 qp->s_sge.sge = wqe->sg_list[0];
118 qp->s_sge.sg_list = wqe->sg_list + 1;
119 qp->s_sge.num_sge = wqe->wr.num_sge;
120 qp->s_sge.total_len = wqe->length;
121 len = wqe->length;
122 qp->s_len = len;
123 switch (wqe->wr.opcode) {
124 case IB_WR_SEND:
125 case IB_WR_SEND_WITH_IMM:
126 if (len > pmtu) {
127 qp->s_state = OP(SEND_FIRST);
128 len = pmtu;
129 break;
130 }
131 if (wqe->wr.opcode == IB_WR_SEND)
132 qp->s_state = OP(SEND_ONLY);
133 else {
134 qp->s_state =
135 OP(SEND_ONLY_WITH_IMMEDIATE);
136 /* Immediate data comes after the BTH */
137 ohdr->u.imm_data = wqe->wr.ex.imm_data;
138 hwords += 1;
139 }
140 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
141 bth0 |= IB_BTH_SOLICITED;
142 qp->s_wqe = wqe;
143 if (++qp->s_cur >= qp->s_size)
144 qp->s_cur = 0;
145 break;
146
147 case IB_WR_RDMA_WRITE:
148 case IB_WR_RDMA_WRITE_WITH_IMM:
149 ohdr->u.rc.reth.vaddr =
150 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
151 ohdr->u.rc.reth.rkey =
152 cpu_to_be32(wqe->wr.wr.rdma.rkey);
153 ohdr->u.rc.reth.length = cpu_to_be32(len);
154 hwords += sizeof(struct ib_reth) / 4;
155 if (len > pmtu) {
156 qp->s_state = OP(RDMA_WRITE_FIRST);
157 len = pmtu;
158 break;
159 }
160 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
161 qp->s_state = OP(RDMA_WRITE_ONLY);
162 else {
163 qp->s_state =
164 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
165 /* Immediate data comes after the RETH */
166 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
167 hwords += 1;
168 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
169 bth0 |= IB_BTH_SOLICITED;
170 }
171 qp->s_wqe = wqe;
172 if (++qp->s_cur >= qp->s_size)
173 qp->s_cur = 0;
174 break;
175
176 default:
177 goto bail;
178 }
179 break;
180
181 case OP(SEND_FIRST):
182 qp->s_state = OP(SEND_MIDDLE);
183 /* FALLTHROUGH */
184 case OP(SEND_MIDDLE):
185 len = qp->s_len;
186 if (len > pmtu) {
187 len = pmtu;
188 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
189 break;
190 }
191 if (wqe->wr.opcode == IB_WR_SEND)
192 qp->s_state = OP(SEND_LAST);
193 else {
194 qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
195 /* Immediate data comes after the BTH */
196 ohdr->u.imm_data = wqe->wr.ex.imm_data;
197 hwords += 1;
198 }
199 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
200 bth0 |= IB_BTH_SOLICITED;
201 qp->s_wqe = wqe;
202 if (++qp->s_cur >= qp->s_size)
203 qp->s_cur = 0;
204 break;
205
206 case OP(RDMA_WRITE_FIRST):
207 qp->s_state = OP(RDMA_WRITE_MIDDLE);
208 /* FALLTHROUGH */
209 case OP(RDMA_WRITE_MIDDLE):
210 len = qp->s_len;
211 if (len > pmtu) {
212 len = pmtu;
213 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
214 break;
215 }
216 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
217 qp->s_state = OP(RDMA_WRITE_LAST);
218 else {
219 qp->s_state =
220 OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
221 /* Immediate data comes after the BTH */
222 ohdr->u.imm_data = wqe->wr.ex.imm_data;
223 hwords += 1;
224 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
225 bth0 |= IB_BTH_SOLICITED;
226 }
227 qp->s_wqe = wqe;
228 if (++qp->s_cur >= qp->s_size)
229 qp->s_cur = 0;
230 break;
231 }
232 qp->s_len -= len;
233 qp->s_hdrwords = hwords;
234 qp->s_cur_sge = &qp->s_sge;
235 qp->s_cur_size = len;
236 hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
237 mask_psn(qp->s_next_psn++), middle);
238done:
239 ret = 1;
240 goto unlock;
241
242bail:
243 qp->s_flags &= ~HFI1_S_BUSY;
244unlock:
245 spin_unlock_irqrestore(&qp->s_lock, flags);
246 return ret;
247}
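
/*
 * Expected calling pattern (a sketch under assumptions, not taken from
 * this file; the real send engine lives elsewhere in the driver):
 *
 *	while (hfi1_make_uc_req(qp))
 *		send the packet staged in qp->s_hdr, with the payload
 *		described by qp->s_cur_sge and qp->s_cur_size;
 *
 * Each successful call stages exactly one packet and advances
 * qp->s_next_psn at the hfi1_make_ruc_header() call above.
 */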
248
249/**
250 * hfi1_uc_rcv - handle an incoming UC packet
251 * @packet: the incoming packet, which carries:
252 *          the port the packet came in on,
253 *          the header of the packet,
254 *          flags relevant to rcv processing,
255 *          the packet data and its length,
256 *          and the QP for this packet.
257 *
258 * This is called from qp_rcv() to process an incoming UC packet
259 * for the given QP.
260 * Called at interrupt level.
261 */
262void hfi1_uc_rcv(struct hfi1_packet *packet)
263{
264 struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
265 struct hfi1_ib_header *hdr = packet->hdr;
266 u32 rcv_flags = packet->rcv_flags;
267 void *data = packet->ebuf;
268 u32 tlen = packet->tlen;
269 struct hfi1_qp *qp = packet->qp;
270 struct hfi1_other_headers *ohdr = packet->ohdr;
271 u32 opcode;
272 u32 hdrsize = packet->hlen;
273 u32 psn;
274 u32 pad;
275 struct ib_wc wc;
276 u32 pmtu = qp->pmtu;
277 struct ib_reth *reth;
278 int has_grh = rcv_flags & HFI1_HAS_GRH;
279 int ret;
280 u32 bth1;
281 struct ib_grh *grh = NULL;
282
283 opcode = be32_to_cpu(ohdr->bth[0]);
284 if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
285 return;
286
287 bth1 = be32_to_cpu(ohdr->bth[1]);
288 if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
289 if (bth1 & HFI1_BECN_SMASK) {
290 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
291 u32 rqpn, lqpn;
292 u16 rlid = be16_to_cpu(hdr->lrh[3]);
293 u8 sl, sc5;
294
295 lqpn = bth1 & HFI1_QPN_MASK;
296 rqpn = qp->remote_qpn;
297
298 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
299 sl = ibp->sc_to_sl[sc5];
300
301 process_becn(ppd, sl, rlid, lqpn, rqpn,
302 IB_CC_SVCTYPE_UC);
303 }
304
305 if (bth1 & HFI1_FECN_SMASK) {
306 u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
307 u16 slid = be16_to_cpu(hdr->lrh[3]);
308 u16 dlid = be16_to_cpu(hdr->lrh[1]);
309 u32 src_qp = qp->remote_qpn;
310 u8 sc5;
311
312 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
313
314 return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
315 }
316 }
317
318 psn = be32_to_cpu(ohdr->bth[2]);
319 opcode >>= 24;
320
321	/* Compare the PSN versus the expected PSN. */
322 if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
323 /*
324 * Handle a sequence error.
325 * Silently drop any current message.
326 */
327 qp->r_psn = psn;
328inv:
329 if (qp->r_state == OP(SEND_FIRST) ||
330 qp->r_state == OP(SEND_MIDDLE)) {
331 set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
332 qp->r_sge.num_sge = 0;
333 } else
334 hfi1_put_ss(&qp->r_sge);
335 qp->r_state = OP(SEND_LAST);
336 switch (opcode) {
337 case OP(SEND_FIRST):
338 case OP(SEND_ONLY):
339 case OP(SEND_ONLY_WITH_IMMEDIATE):
340 goto send_first;
341
342 case OP(RDMA_WRITE_FIRST):
343 case OP(RDMA_WRITE_ONLY):
344 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
345 goto rdma_first;
346
347 default:
348 goto drop;
349 }
350 }
351
352 /* Check for opcode sequence errors. */
353 switch (qp->r_state) {
354 case OP(SEND_FIRST):
355 case OP(SEND_MIDDLE):
356 if (opcode == OP(SEND_MIDDLE) ||
357 opcode == OP(SEND_LAST) ||
358 opcode == OP(SEND_LAST_WITH_IMMEDIATE))
359 break;
360 goto inv;
361
362 case OP(RDMA_WRITE_FIRST):
363 case OP(RDMA_WRITE_MIDDLE):
364 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
365 opcode == OP(RDMA_WRITE_LAST) ||
366 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
367 break;
368 goto inv;
369
370 default:
371 if (opcode == OP(SEND_FIRST) ||
372 opcode == OP(SEND_ONLY) ||
373 opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
374 opcode == OP(RDMA_WRITE_FIRST) ||
375 opcode == OP(RDMA_WRITE_ONLY) ||
376 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
377 break;
378 goto inv;
379 }
380
381 if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
382 qp_comm_est(qp);
383
384 /* OK, process the packet. */
385 switch (opcode) {
386 case OP(SEND_FIRST):
387 case OP(SEND_ONLY):
388 case OP(SEND_ONLY_WITH_IMMEDIATE):
389send_first:
390 if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
391 qp->r_sge = qp->s_rdma_read_sge;
392 else {
393 ret = hfi1_get_rwqe(qp, 0);
394 if (ret < 0)
395 goto op_err;
396 if (!ret)
397 goto drop;
398 /*
399 * qp->s_rdma_read_sge will be the owner
400 * of the mr references.
401 */
402 qp->s_rdma_read_sge = qp->r_sge;
403 }
404 qp->r_rcv_len = 0;
405 if (opcode == OP(SEND_ONLY))
406 goto no_immediate_data;
407 else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
408 goto send_last_imm;
409 /* FALLTHROUGH */
410 case OP(SEND_MIDDLE):
411 /* Check for invalid length PMTU or posted rwqe len. */
412 if (unlikely(tlen != (hdrsize + pmtu + 4)))
413 goto rewind;
414 qp->r_rcv_len += pmtu;
415 if (unlikely(qp->r_rcv_len > qp->r_len))
416 goto rewind;
417 hfi1_copy_sge(&qp->r_sge, data, pmtu, 0);
418 break;
419
420 case OP(SEND_LAST_WITH_IMMEDIATE):
421send_last_imm:
422 wc.ex.imm_data = ohdr->u.imm_data;
423 wc.wc_flags = IB_WC_WITH_IMM;
424 goto send_last;
425 case OP(SEND_LAST):
426no_immediate_data:
427 wc.ex.imm_data = 0;
428 wc.wc_flags = 0;
429send_last:
430 /* Get the number of bytes the message was padded by. */
431 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
432 /* Check for invalid length. */
433 /* LAST len should be >= 1 */
434 if (unlikely(tlen < (hdrsize + pad + 4)))
435 goto rewind;
436 /* Don't count the CRC. */
437 tlen -= (hdrsize + pad + 4);
438 wc.byte_len = tlen + qp->r_rcv_len;
439 if (unlikely(wc.byte_len > qp->r_len))
440 goto rewind;
441 wc.opcode = IB_WC_RECV;
442 hfi1_copy_sge(&qp->r_sge, data, tlen, 0);
443 hfi1_put_ss(&qp->s_rdma_read_sge);
444last_imm:
445 wc.wr_id = qp->r_wr_id;
446 wc.status = IB_WC_SUCCESS;
447 wc.qp = &qp->ibqp;
448 wc.src_qp = qp->remote_qpn;
449 wc.slid = qp->remote_ah_attr.dlid;
450 /*
451 * It seems that IB mandates the presence of an SL in a
452 * work completion only for the UD transport (see section
453 * 11.4.2 of IBTA Vol. 1).
454 *
455 * However, the way the SL is chosen below is consistent
456		 * with the way that IB/qib works and tries to avoid
457 * introducing incompatibilities.
458 *
459 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
460 */
461 wc.sl = qp->remote_ah_attr.sl;
462 /* zero fields that are N/A */
463 wc.vendor_err = 0;
464 wc.pkey_index = 0;
465 wc.dlid_path_bits = 0;
466 wc.port_num = 0;
467 /* Signal completion event if the solicited bit is set. */
468 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
469 (ohdr->bth[0] &
470 cpu_to_be32(IB_BTH_SOLICITED)) != 0);
471 break;
472
473 case OP(RDMA_WRITE_FIRST):
474 case OP(RDMA_WRITE_ONLY):
475 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
476rdma_first:
477 if (unlikely(!(qp->qp_access_flags &
478 IB_ACCESS_REMOTE_WRITE))) {
479 goto drop;
480 }
481 reth = &ohdr->u.rc.reth;
482 qp->r_len = be32_to_cpu(reth->length);
483 qp->r_rcv_len = 0;
484 qp->r_sge.sg_list = NULL;
485 if (qp->r_len != 0) {
486 u32 rkey = be32_to_cpu(reth->rkey);
487 u64 vaddr = be64_to_cpu(reth->vaddr);
488 int ok;
489
490 /* Check rkey */
491 ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
492 vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
493 if (unlikely(!ok))
494 goto drop;
495 qp->r_sge.num_sge = 1;
496 } else {
497 qp->r_sge.num_sge = 0;
498 qp->r_sge.sge.mr = NULL;
499 qp->r_sge.sge.vaddr = NULL;
500 qp->r_sge.sge.length = 0;
501 qp->r_sge.sge.sge_length = 0;
502 }
503 if (opcode == OP(RDMA_WRITE_ONLY))
504 goto rdma_last;
505 else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
506 wc.ex.imm_data = ohdr->u.rc.imm_data;
507 goto rdma_last_imm;
508 }
509 /* FALLTHROUGH */
510 case OP(RDMA_WRITE_MIDDLE):
511 /* Check for invalid length PMTU or posted rwqe len. */
512 if (unlikely(tlen != (hdrsize + pmtu + 4)))
513 goto drop;
514 qp->r_rcv_len += pmtu;
515 if (unlikely(qp->r_rcv_len > qp->r_len))
516 goto drop;
517 hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
518 break;
519
520 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
521 wc.ex.imm_data = ohdr->u.imm_data;
522rdma_last_imm:
523 wc.wc_flags = IB_WC_WITH_IMM;
524
525 /* Get the number of bytes the message was padded by. */
526 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
527 /* Check for invalid length. */
528 /* LAST len should be >= 1 */
529 if (unlikely(tlen < (hdrsize + pad + 4)))
530 goto drop;
531 /* Don't count the CRC. */
532 tlen -= (hdrsize + pad + 4);
533 if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
534 goto drop;
535 if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
536 hfi1_put_ss(&qp->s_rdma_read_sge);
537 else {
538 ret = hfi1_get_rwqe(qp, 1);
539 if (ret < 0)
540 goto op_err;
541 if (!ret)
542 goto drop;
543 }
544 wc.byte_len = qp->r_len;
545 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
546 hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
547 hfi1_put_ss(&qp->r_sge);
548 goto last_imm;
549
550 case OP(RDMA_WRITE_LAST):
551rdma_last:
552 /* Get the number of bytes the message was padded by. */
553 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
554 /* Check for invalid length. */
555 /* LAST len should be >= 1 */
556 if (unlikely(tlen < (hdrsize + pad + 4)))
557 goto drop;
558 /* Don't count the CRC. */
559 tlen -= (hdrsize + pad + 4);
560 if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
561 goto drop;
562 hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
563 hfi1_put_ss(&qp->r_sge);
564 break;
565
566 default:
567 /* Drop packet for unknown opcodes. */
568 goto drop;
569 }
570 qp->r_psn++;
571 qp->r_state = opcode;
572 return;
573
574rewind:
575 set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
576 qp->r_sge.num_sge = 0;
577drop:
578 ibp->n_pkt_drops++;
579 return;
580
581op_err:
582 hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
583 return;
584
585}
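/*
 * A minimal standalone sketch (not part of this patch) of the length
 * arithmetic used in the SEND/RDMA_WRITE "LAST"/"ONLY" arms above: a
 * packet is header + payload + pad + 4-byte ICRC, so the payload length
 * is recovered by subtracting the rest from tlen. All names below are
 * illustrative only and do not exist in the driver.
 */
#include <assert.h>
#include <stdint.h>

static int ib_payload_len(uint32_t tlen, uint32_t hdrsize, uint32_t pad,
			  uint32_t *payload)
{
	/* LAST/ONLY packets must at least carry header, pad and ICRC. */
	if (tlen < hdrsize + pad + 4)
		return -1;
	*payload = tlen - (hdrsize + pad + 4);
	return 0;
}

int main(void)
{
	uint32_t payload;

	assert(!ib_payload_len(300, 40, 2, &payload) && payload == 254);
	assert(ib_payload_len(40, 40, 2, &payload) < 0); /* too short */
	return 0;
}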
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
new file mode 100644
index 000000000000..d40d1a1e10aa
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -0,0 +1,885 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/net.h>
52#include <rdma/ib_smi.h>
53
54#include "hfi.h"
55#include "mad.h"
56#include "qp.h"
57
58/**
59 * ud_loopback - handle send on loopback QPs
60 * @sqp: the sending QP
61 * @swqe: the send work request
62 *
63 * This is called from hfi1_make_ud_req() to forward a WQE addressed
64 * to the same HFI.
65 * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
66 * while this is being called.
67 */
68static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe)
69{
70 struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
71 struct hfi1_pportdata *ppd;
72 struct hfi1_qp *qp;
73 struct ib_ah_attr *ah_attr;
74 unsigned long flags;
75 struct hfi1_sge_state ssge;
76 struct hfi1_sge *sge;
77 struct ib_wc wc;
78 u32 length;
79 enum ib_qp_type sqptype, dqptype;
80
81 rcu_read_lock();
82
83 qp = hfi1_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
84 if (!qp) {
85 ibp->n_pkt_drops++;
86 rcu_read_unlock();
87 return;
88 }
89
90 sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
91 IB_QPT_UD : sqp->ibqp.qp_type;
92 dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
93 IB_QPT_UD : qp->ibqp.qp_type;
94
95 if (dqptype != sqptype ||
96 !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
97 ibp->n_pkt_drops++;
98 goto drop;
99 }
100
101 ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
102 ppd = ppd_from_ibp(ibp);
103
104 if (qp->ibqp.qp_num > 1) {
105 u16 pkey;
106 u16 slid;
107 u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
108
109 pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
110 slid = ppd->lid | (ah_attr->src_path_bits &
111 ((1 << ppd->lmc) - 1));
112 if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
113 qp->s_pkey_index, slid))) {
114 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey,
115 ah_attr->sl,
116 sqp->ibqp.qp_num, qp->ibqp.qp_num,
117 cpu_to_be16(slid),
118 cpu_to_be16(ah_attr->dlid));
119 goto drop;
120 }
121 }
122
123 /*
124 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
125 * Qkeys with the high order bit set mean use the
126 * qkey from the QP context instead of the WR (see 10.2.5).
127 */
128 if (qp->ibqp.qp_num) {
129 u32 qkey;
130
131 qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
132 sqp->qkey : swqe->wr.wr.ud.remote_qkey;
133 if (unlikely(qkey != qp->qkey)) {
134 u16 lid;
135
136 lid = ppd->lid | (ah_attr->src_path_bits &
137 ((1 << ppd->lmc) - 1));
138 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
139 ah_attr->sl,
140 sqp->ibqp.qp_num, qp->ibqp.qp_num,
141 cpu_to_be16(lid),
142 cpu_to_be16(ah_attr->dlid));
143 goto drop;
144 }
145 }
146
147 /*
148 * A GRH is expected to precede the data even if not
149 * present on the wire.
150 */
151 length = swqe->length;
152 memset(&wc, 0, sizeof(wc));
153 wc.byte_len = length + sizeof(struct ib_grh);
154
155 if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
156 wc.wc_flags = IB_WC_WITH_IMM;
157 wc.ex.imm_data = swqe->wr.ex.imm_data;
158 }
159
160 spin_lock_irqsave(&qp->r_lock, flags);
161
162 /*
163 * Get the next work request entry to find where to put the data.
164 */
165 if (qp->r_flags & HFI1_R_REUSE_SGE)
166 qp->r_flags &= ~HFI1_R_REUSE_SGE;
167 else {
168 int ret;
169
170 ret = hfi1_get_rwqe(qp, 0);
171 if (ret < 0) {
172 hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
173 goto bail_unlock;
174 }
175 if (!ret) {
176 if (qp->ibqp.qp_num == 0)
177 ibp->n_vl15_dropped++;
178 goto bail_unlock;
179 }
180 }
181 /* Silently drop packets which are too big. */
182 if (unlikely(wc.byte_len > qp->r_len)) {
183 qp->r_flags |= HFI1_R_REUSE_SGE;
184 ibp->n_pkt_drops++;
185 goto bail_unlock;
186 }
187
188 if (ah_attr->ah_flags & IB_AH_GRH) {
189 hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
190 sizeof(struct ib_grh), 1);
191 wc.wc_flags |= IB_WC_GRH;
192 } else
193 hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
194 ssge.sg_list = swqe->sg_list + 1;
195 ssge.sge = *swqe->sg_list;
196 ssge.num_sge = swqe->wr.num_sge;
197 sge = &ssge.sge;
198 while (length) {
199 u32 len = sge->length;
200
201 if (len > length)
202 len = length;
203 if (len > sge->sge_length)
204 len = sge->sge_length;
205 WARN_ON_ONCE(len == 0);
206 hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
207 sge->vaddr += len;
208 sge->length -= len;
209 sge->sge_length -= len;
210 if (sge->sge_length == 0) {
211 if (--ssge.num_sge)
212 *sge = *ssge.sg_list++;
213 } else if (sge->length == 0 && sge->mr->lkey) {
214 if (++sge->n >= HFI1_SEGSZ) {
215 if (++sge->m >= sge->mr->mapsz)
216 break;
217 sge->n = 0;
218 }
219 sge->vaddr =
220 sge->mr->map[sge->m]->segs[sge->n].vaddr;
221 sge->length =
222 sge->mr->map[sge->m]->segs[sge->n].length;
223 }
224 length -= len;
225 }
226 hfi1_put_ss(&qp->r_sge);
227 if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
228 goto bail_unlock;
229 wc.wr_id = qp->r_wr_id;
230 wc.status = IB_WC_SUCCESS;
231 wc.opcode = IB_WC_RECV;
232 wc.qp = &qp->ibqp;
233 wc.src_qp = sqp->ibqp.qp_num;
234 if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
235 if (sqp->ibqp.qp_type == IB_QPT_GSI ||
236 sqp->ibqp.qp_type == IB_QPT_SMI)
237 wc.pkey_index = swqe->wr.wr.ud.pkey_index;
238 else
239 wc.pkey_index = sqp->s_pkey_index;
240 } else {
241 wc.pkey_index = 0;
242 }
243 wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
244 /* Check for loopback when the port lid is not set */
245 if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
246 wc.slid = HFI1_PERMISSIVE_LID;
247 wc.sl = ah_attr->sl;
248 wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
249 wc.port_num = qp->port_num;
250 /* Signal completion event if the solicited bit is set. */
251 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
252 swqe->wr.send_flags & IB_SEND_SOLICITED);
253 ibp->n_loop_pkts++;
254bail_unlock:
255 spin_unlock_irqrestore(&qp->r_lock, flags);
256drop:
257 rcu_read_unlock();
258}
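/*
 * Standalone sketch (not part of this patch) of the qkey selection rule
 * noted in the comment inside ud_loopback() above (IBTA 10.2.5): a WR
 * qkey with the high-order bit set means "use the qkey from the QP
 * context". The driver tests the bit with a signed cast; the explicit
 * mask below is equivalent. Names are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t effective_qkey(uint32_t wr_qkey, uint32_t qp_qkey)
{
	return (wr_qkey & 0x80000000u) ? qp_qkey : wr_qkey;
}

int main(void)
{
	assert(effective_qkey(0x00001234u, 0xdeadbeefu) == 0x00001234u);
	assert(effective_qkey(0x80000000u, 0xdeadbeefu) == 0xdeadbeefu);
	return 0;
}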
259
260/**
261 * hfi1_make_ud_req - construct a UD request packet
262 * @qp: the QP
263 *
264 * Return 1 if constructed; otherwise, return 0.
265 */
266int hfi1_make_ud_req(struct hfi1_qp *qp)
267{
268 struct hfi1_other_headers *ohdr;
269 struct ib_ah_attr *ah_attr;
270 struct hfi1_pportdata *ppd;
271 struct hfi1_ibport *ibp;
272 struct hfi1_swqe *wqe;
273 unsigned long flags;
274 u32 nwords;
275 u32 extra_bytes;
276 u32 bth0;
277 u16 lrh0;
278 u16 lid;
279 int ret = 0;
280 int next_cur;
281 u8 sc5;
282
283 spin_lock_irqsave(&qp->s_lock, flags);
284
285 if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
286 if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
287 goto bail;
288 /* We are in the error state, flush the work request. */
289 if (qp->s_last == qp->s_head)
290 goto bail;
291 /* If DMAs are in progress, we can't flush immediately. */
292 if (atomic_read(&qp->s_iowait.sdma_busy)) {
293 qp->s_flags |= HFI1_S_WAIT_DMA;
294 goto bail;
295 }
296 wqe = get_swqe_ptr(qp, qp->s_last);
297 hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
298 goto done;
299 }
300
301 if (qp->s_cur == qp->s_head)
302 goto bail;
303
304 wqe = get_swqe_ptr(qp, qp->s_cur);
305 next_cur = qp->s_cur + 1;
306 if (next_cur >= qp->s_size)
307 next_cur = 0;
308
309 /* Construct the header. */
310 ibp = to_iport(qp->ibqp.device, qp->port_num);
311 ppd = ppd_from_ibp(ibp);
312 ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
313 if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE ||
314 ah_attr->dlid == HFI1_PERMISSIVE_LID) {
315 lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
316 if (unlikely(!loopback && (lid == ppd->lid ||
317 (lid == HFI1_PERMISSIVE_LID &&
318 qp->ibqp.qp_type == IB_QPT_GSI)))) {
319 /*
320 * If DMAs are in progress, we can't generate
321 * a completion for the loopback packet since
322 * it would be out of order.
323 * Instead of waiting, we could queue a
324 * zero length descriptor so we get a callback.
325 */
326 if (atomic_read(&qp->s_iowait.sdma_busy)) {
327 qp->s_flags |= HFI1_S_WAIT_DMA;
328 goto bail;
329 }
330 qp->s_cur = next_cur;
331 spin_unlock_irqrestore(&qp->s_lock, flags);
332 ud_loopback(qp, wqe);
333 spin_lock_irqsave(&qp->s_lock, flags);
334 hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
335 goto done;
336 }
337 }
338
339 qp->s_cur = next_cur;
340 extra_bytes = -wqe->length & 3;
341 nwords = (wqe->length + extra_bytes) >> 2;
342
343 /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
344 qp->s_hdrwords = 7;
345 qp->s_cur_size = wqe->length;
346 qp->s_cur_sge = &qp->s_sge;
347 qp->s_srate = ah_attr->static_rate;
348 qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
349 qp->s_wqe = wqe;
350 qp->s_sge.sge = wqe->sg_list[0];
351 qp->s_sge.sg_list = wqe->sg_list + 1;
352 qp->s_sge.num_sge = wqe->wr.num_sge;
353 qp->s_sge.total_len = wqe->length;
354
355 if (ah_attr->ah_flags & IB_AH_GRH) {
356 /* Header size in 32-bit words. */
357 qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
358 &ah_attr->grh,
359 qp->s_hdrwords, nwords);
360 lrh0 = HFI1_LRH_GRH;
361 ohdr = &qp->s_hdr->ibh.u.l.oth;
362 /*
363 * Don't worry about sending to locally attached multicast
364	 * QPs; what happens in that case is left unspecified by the spec.
365 */
366 } else {
367 /* Header size in 32-bit words. */
368 lrh0 = HFI1_LRH_BTH;
369 ohdr = &qp->s_hdr->ibh.u.oth;
370 }
371 if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
372 qp->s_hdrwords++;
373 ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
374 bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
375 } else
376 bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
377 sc5 = ibp->sl_to_sc[ah_attr->sl];
378 lrh0 |= (ah_attr->sl & 0xf) << 4;
379 if (qp->ibqp.qp_type == IB_QPT_SMI) {
380 lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
381 qp->s_sc = 0xf;
382 } else {
383 lrh0 |= (sc5 & 0xf) << 12;
384 qp->s_sc = sc5;
385 }
386 qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
387 qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
388 qp->s_hdr->ibh.lrh[2] =
389 cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
390 if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE))
391 qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
392 else {
393 lid = ppd->lid;
394 if (lid) {
395 lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
396 qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid);
397 } else
398 qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
399 }
400 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
401 bth0 |= IB_BTH_SOLICITED;
402 bth0 |= extra_bytes << 20;
403 if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
404 bth0 |= hfi1_get_pkey(ibp, wqe->wr.wr.ud.pkey_index);
405 else
406 bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
407 ohdr->bth[0] = cpu_to_be32(bth0);
408 ohdr->bth[1] = cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
409 ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++));
410 /*
411 * Qkeys with the high order bit set mean use the
412 * qkey from the QP context instead of the WR (see 10.2.5).
413 */
414 ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
415 qp->qkey : wqe->wr.wr.ud.remote_qkey);
416 ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
417 /* disarm any ahg */
418 qp->s_hdr->ahgcount = 0;
419 qp->s_hdr->ahgidx = 0;
420 qp->s_hdr->tx_flags = 0;
421 qp->s_hdr->sde = NULL;
422
423done:
424 ret = 1;
425 goto unlock;
426
427bail:
428 qp->s_flags &= ~HFI1_S_BUSY;
429unlock:
430 spin_unlock_irqrestore(&qp->s_lock, flags);
431 return ret;
432}
433
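/*
 * Standalone sketch (not part of this patch) of the padding arithmetic
 * used in hfi1_make_ud_req() above: "-len & 3" is the number of bytes
 * needed to round len up to a 4-byte boundary, and the padded length in
 * 32-bit words follows directly. Names are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

static void pad_to_dwords(uint32_t len, uint32_t *extra, uint32_t *nwords)
{
	*extra = (0u - len) & 3;	/* 0..3 pad bytes */
	*nwords = (len + *extra) >> 2;	/* padded length in dwords */
}

int main(void)
{
	uint32_t extra, nwords;

	pad_to_dwords(5, &extra, &nwords);
	assert(extra == 3 && nwords == 2);
	pad_to_dwords(8, &extra, &nwords);
	assert(extra == 0 && nwords == 2);
	return 0;
}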
434/*
435	 * Hardware can't check this, so we do it here.
436	 *
437	 * This is a slightly different algorithm from the standard pkey check. It
438	 * special-cases the management keys and allows for 0x7fff and 0xffff to be in
439 * the table at the same time.
440 *
441 * @returns the index found or -1 if not found
442 */
443int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
444{
445 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
446 unsigned i;
447
448 if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
449 unsigned lim_idx = -1;
450
451 for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
452 /* here we look for an exact match */
453 if (ppd->pkeys[i] == pkey)
454 return i;
455 if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
456 lim_idx = i;
457 }
458
459		/* did not find 0xffff; return the 0x7fff index if it was found */
460 if (pkey == FULL_MGMT_P_KEY)
461 return lim_idx;
462
463 /* no match... */
464 return -1;
465 }
466
467 pkey &= 0x7fff; /* remove limited/full membership bit */
468
469 for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
470 if ((ppd->pkeys[i] & 0x7fff) == pkey)
471 return i;
472
473 /*
474 * Should not get here, this means hardware failed to validate pkeys.
475 */
476 return -1;
477}
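/*
 * Standalone sketch (not part of this patch) of the pkey matching rule
 * used in hfi1_lookup_pkey_idx() above: ordinary pkeys compare on the low
 * 15 bits only, because bit 15 merely distinguishes limited (0) from full
 * (1) membership; the management keys 0x7fff/0xffff are the one case
 * where the full 16-bit value matters. Names are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

static int pkey_base_matches(uint16_t table_entry, uint16_t pkey)
{
	return (table_entry & 0x7fff) == (pkey & 0x7fff);
}

int main(void)
{
	assert(pkey_base_matches(0x8001, 0x0001));  /* full vs. limited */
	assert(!pkey_base_matches(0x8001, 0x0002)); /* different base key */
	return 0;
}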
478
479void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
480 u32 pkey, u32 slid, u32 dlid, u8 sc5,
481 const struct ib_grh *old_grh)
482{
483 u64 pbc, pbc_flags = 0;
484 u32 bth0, plen, vl, hwords = 5;
485 u16 lrh0;
486 u8 sl = ibp->sc_to_sl[sc5];
487 struct hfi1_ib_header hdr;
488 struct hfi1_other_headers *ohdr;
489 struct pio_buf *pbuf;
490 struct send_context *ctxt = qp_to_send_context(qp, sc5);
491 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
492
493 if (old_grh) {
494 struct ib_grh *grh = &hdr.u.l.grh;
495
496 grh->version_tclass_flow = old_grh->version_tclass_flow;
497 grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
498 grh->hop_limit = 0xff;
499 grh->sgid = old_grh->dgid;
500 grh->dgid = old_grh->sgid;
501 ohdr = &hdr.u.l.oth;
502 lrh0 = HFI1_LRH_GRH;
503 hwords += sizeof(struct ib_grh) / sizeof(u32);
504 } else {
505 ohdr = &hdr.u.oth;
506 lrh0 = HFI1_LRH_BTH;
507 }
508
509 lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
510
511 bth0 = pkey | (IB_OPCODE_CNP << 24);
512 ohdr->bth[0] = cpu_to_be32(bth0);
513
514 ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
515 ohdr->bth[2] = 0; /* PSN 0 */
516
517 hdr.lrh[0] = cpu_to_be16(lrh0);
518 hdr.lrh[1] = cpu_to_be16(dlid);
519 hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
520 hdr.lrh[3] = cpu_to_be16(slid);
521
522 plen = 2 /* PBC */ + hwords;
523 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
524 vl = sc_to_vlt(ppd->dd, sc5);
525 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
526 if (ctxt) {
527 pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
528 if (pbuf)
529 ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
530 &hdr, hwords);
531 }
532}
533
534/*
535 * opa_smp_check() - Do the regular pkey checking, and the additional
536 * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
537 * ("SMA Packet Checks").
538 *
539 * Note that:
540 * - Checks are done using the pkey directly from the packet's BTH,
541 * and specifically _not_ the pkey that we attach to the completion,
542 * which may be different.
543 * - These checks are specifically for "non-local" SMPs (i.e., SMPs
544 * which originated on another node). SMPs which are sent from, and
545 * destined to this node are checked in opa_local_smp_check().
546 *
547 * At the point where opa_smp_check() is called, we know:
548 * - destination QP is QP0
549 *
550 * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
551 */
552static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
553 struct hfi1_qp *qp, u16 slid, struct opa_smp *smp)
554{
555 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
556
557 /*
558 * I don't think it's possible for us to get here with sc != 0xf,
559 * but check it to be certain.
560 */
561 if (sc5 != 0xf)
562 return 1;
563
564 if (rcv_pkey_check(ppd, pkey, sc5, slid))
565 return 1;
566
567 /*
568 * At this point we know (and so don't need to check again) that
569 * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
570 * (see ingress_pkey_check).
571 */
572 if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
573 smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
574 ingress_pkey_table_fail(ppd, pkey, slid);
575 return 1;
576 }
577
578 /*
579 * SMPs fall into one of four (disjoint) categories:
580 * SMA request, SMA response, trap, or trap repress.
581 * Our response depends, in part, on which type of
582 * SMP we're processing.
583 *
584 * If this is not an SMA request, or trap repress:
585 * - accept MAD if the port is running an SM
586 * - pkey == FULL_MGMT_P_KEY =>
587 * reply with unsupported method (i.e., just mark
588 * the smp's status field here, and let it be
589 * processed normally)
590 * - pkey != LIM_MGMT_P_KEY =>
591 * increment port recv constraint errors, drop MAD
592 * If this is an SMA request or trap repress:
593 * - pkey != FULL_MGMT_P_KEY =>
594 * increment port recv constraint errors, drop MAD
595 */
596 switch (smp->method) {
597 case IB_MGMT_METHOD_GET:
598 case IB_MGMT_METHOD_SET:
599 case IB_MGMT_METHOD_REPORT:
600 case IB_MGMT_METHOD_TRAP_REPRESS:
601 if (pkey != FULL_MGMT_P_KEY) {
602 ingress_pkey_table_fail(ppd, pkey, slid);
603 return 1;
604 }
605 break;
606 case IB_MGMT_METHOD_SEND:
607 case IB_MGMT_METHOD_TRAP:
608 case IB_MGMT_METHOD_GET_RESP:
609 case IB_MGMT_METHOD_REPORT_RESP:
610 if (ibp->port_cap_flags & IB_PORT_SM)
611 return 0;
612 if (pkey == FULL_MGMT_P_KEY) {
613 smp->status |= IB_SMP_UNSUP_METHOD;
614 return 0;
615 }
616 if (pkey != LIM_MGMT_P_KEY) {
617 ingress_pkey_table_fail(ppd, pkey, slid);
618 return 1;
619 }
620 break;
621 default:
622 break;
623 }
624 return 0;
625}
626
627
628/**
629 * hfi1_ud_rcv - receive an incoming UD packet
630	 * @packet: the receive packet descriptor; it carries the receive
631	 *          context (and hence the port) the packet came in on, the
632	 *          packet header, the receive flags, a pointer to the packet
633	 *          data, the total packet length, and the QP the packet
634	 *          arrived on (see struct hfi1_packet for the individual
635	 *          fields)
636 *
637 * This is called from qp_rcv() to process an incoming UD packet
638 * for the given QP.
639 * Called at interrupt level.
640 */
641void hfi1_ud_rcv(struct hfi1_packet *packet)
642{
643 struct hfi1_other_headers *ohdr = packet->ohdr;
644 int opcode;
645 u32 hdrsize = packet->hlen;
646 u32 pad;
647 struct ib_wc wc;
648 u32 qkey;
649 u32 src_qp;
650 u16 dlid, pkey;
651 int mgmt_pkey_idx = -1;
652 struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
653 struct hfi1_ib_header *hdr = packet->hdr;
654 u32 rcv_flags = packet->rcv_flags;
655 void *data = packet->ebuf;
656 u32 tlen = packet->tlen;
657 struct hfi1_qp *qp = packet->qp;
658 bool has_grh = rcv_flags & HFI1_HAS_GRH;
659 bool sc4_bit = has_sc4_bit(packet);
660 u8 sc;
661 u32 bth1;
662 int is_mcast;
663 struct ib_grh *grh = NULL;
664
665 qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
666 src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
667 dlid = be16_to_cpu(hdr->lrh[1]);
668 is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) &&
669 (dlid != HFI1_PERMISSIVE_LID);
670 bth1 = be32_to_cpu(ohdr->bth[1]);
671 if (unlikely(bth1 & HFI1_BECN_SMASK)) {
672 /*
673 * In pre-B0 h/w the CNP_OPCODE is handled via an
674 * error path (errata 291394).
675 */
676 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
677 u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
678 u8 sl, sc5;
679
680 sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
681 sc5 |= sc4_bit;
682 sl = ibp->sc_to_sl[sc5];
683
684 process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
685 }
686
687 /*
688	 * The opcode is in the low byte when it's in network order
689 * (top byte when in host order).
690 */
691 opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
692 opcode &= 0xff;
693
694 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
695
696 if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
697 u16 slid = be16_to_cpu(hdr->lrh[3]);
698 u8 sc5;
699
700 sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
701 sc5 |= sc4_bit;
702
703 return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
704 }
705 /*
706 * Get the number of bytes the message was padded by
707 * and drop incomplete packets.
708 */
709 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
710 if (unlikely(tlen < (hdrsize + pad + 4)))
711 goto drop;
712
713 tlen -= hdrsize + pad + 4;
714
715 /*
716 * Check that the permissive LID is only used on QP0
717 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
718 */
719 if (qp->ibqp.qp_num) {
720 if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
721 hdr->lrh[3] == IB_LID_PERMISSIVE))
722 goto drop;
723 if (qp->ibqp.qp_num > 1) {
724 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
725 u16 slid;
726 u8 sc5;
727
728 sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
729 sc5 |= sc4_bit;
730
731 slid = be16_to_cpu(hdr->lrh[3]);
732 if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
733 /*
734 * Traps will not be sent for packets dropped
735				 * by the HW. This is fine, as sending a trap
736				 * for invalid pkeys is optional according to
737				 * the IB spec (release 1.3, section 10.9.4).
738 */
739 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
740 pkey,
741 (be16_to_cpu(hdr->lrh[0]) >> 4) &
742 0xF,
743 src_qp, qp->ibqp.qp_num,
744 hdr->lrh[3], hdr->lrh[1]);
745 return;
746 }
747 } else {
748 /* GSI packet */
749 mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
750 if (mgmt_pkey_idx < 0)
751 goto drop;
752
753 }
754 if (unlikely(qkey != qp->qkey)) {
755 hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
756 (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
757 src_qp, qp->ibqp.qp_num,
758 hdr->lrh[3], hdr->lrh[1]);
759 return;
760 }
761 /* Drop invalid MAD packets (see 13.5.3.1). */
762 if (unlikely(qp->ibqp.qp_num == 1 &&
763 (tlen > 2048 ||
764 (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
765 goto drop;
766 } else {
767 /* Received on QP0, and so by definition, this is an SMP */
768 struct opa_smp *smp = (struct opa_smp *)data;
769 u16 slid = be16_to_cpu(hdr->lrh[3]);
770 u8 sc5;
771
772 sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
773 sc5 |= sc4_bit;
774
775 if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
776 goto drop;
777
778 if (tlen > 2048)
779 goto drop;
780 if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
781 hdr->lrh[3] == IB_LID_PERMISSIVE) &&
782 smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
783 goto drop;
784
785 /* look up SMI pkey */
786 mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
787 if (mgmt_pkey_idx < 0)
788 goto drop;
789
790 }
791
792 if (qp->ibqp.qp_num > 1 &&
793 opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
794 wc.ex.imm_data = ohdr->u.ud.imm_data;
795 wc.wc_flags = IB_WC_WITH_IMM;
796 tlen -= sizeof(u32);
797 } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
798 wc.ex.imm_data = 0;
799 wc.wc_flags = 0;
800 } else
801 goto drop;
802
803 /*
804 * A GRH is expected to precede the data even if not
805 * present on the wire.
806 */
807 wc.byte_len = tlen + sizeof(struct ib_grh);
808
809 /*
810 * Get the next work request entry to find where to put the data.
811 */
812 if (qp->r_flags & HFI1_R_REUSE_SGE)
813 qp->r_flags &= ~HFI1_R_REUSE_SGE;
814 else {
815 int ret;
816
817 ret = hfi1_get_rwqe(qp, 0);
818 if (ret < 0) {
819 hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
820 return;
821 }
822 if (!ret) {
823 if (qp->ibqp.qp_num == 0)
824 ibp->n_vl15_dropped++;
825 return;
826 }
827 }
828 /* Silently drop packets which are too big. */
829 if (unlikely(wc.byte_len > qp->r_len)) {
830 qp->r_flags |= HFI1_R_REUSE_SGE;
831 goto drop;
832 }
833 if (has_grh) {
834 hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
835 sizeof(struct ib_grh), 1);
836 wc.wc_flags |= IB_WC_GRH;
837 } else
838 hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
839 hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
840 hfi1_put_ss(&qp->r_sge);
841 if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
842 return;
843 wc.wr_id = qp->r_wr_id;
844 wc.status = IB_WC_SUCCESS;
845 wc.opcode = IB_WC_RECV;
846 wc.vendor_err = 0;
847 wc.qp = &qp->ibqp;
848 wc.src_qp = src_qp;
849
850 if (qp->ibqp.qp_type == IB_QPT_GSI ||
851 qp->ibqp.qp_type == IB_QPT_SMI) {
852 if (mgmt_pkey_idx < 0) {
853 if (net_ratelimit()) {
854 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
855 struct hfi1_devdata *dd = ppd->dd;
856
857 dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
858 qp->ibqp.qp_type);
859 mgmt_pkey_idx = 0;
860 }
861 }
862 wc.pkey_index = (unsigned)mgmt_pkey_idx;
863 } else
864 wc.pkey_index = 0;
865
866 wc.slid = be16_to_cpu(hdr->lrh[3]);
867 sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
868 sc |= sc4_bit;
869 wc.sl = ibp->sc_to_sl[sc];
870
871 /*
872 * Save the LMC lower bits if the destination LID is a unicast LID.
873 */
874 wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 :
875 dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
876 wc.port_num = qp->port_num;
877 /* Signal completion event if the solicited bit is set. */
878 hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
879 (ohdr->bth[0] &
880 cpu_to_be32(IB_BTH_SOLICITED)) != 0);
881 return;
882
883drop:
884 ibp->n_pkt_drops++;
885}
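/*
 * Standalone sketch (not part of this patch) of the UD completion-length
 * convention used in ud_loopback() and hfi1_ud_rcv() above: the reported
 * byte_len always includes room for a 40-byte GRH, whether or not one was
 * on the wire, so consumers find the payload at a fixed offset. Names are
 * illustrative only.
 */
#include <assert.h>
#include <stdint.h>

#define GRH_BYTES 40	/* sizeof(struct ib_grh) */

static uint32_t ud_wc_byte_len(uint32_t payload_len)
{
	return payload_len + GRH_BYTES;
}

int main(void)
{
	assert(ud_wc_byte_len(0) == GRH_BYTES);
	assert(ud_wc_byte_len(256) == 296);
	return 0;
}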
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
new file mode 100644
index 000000000000..9071afbd7bf4
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_pages.c
@@ -0,0 +1,156 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/mm.h>
52#include <linux/device.h>
53
54#include "hfi.h"
55
56static void __hfi1_release_user_pages(struct page **p, size_t num_pages,
57 int dirty)
58{
59 size_t i;
60
61 for (i = 0; i < num_pages; i++) {
62 if (dirty)
63 set_page_dirty_lock(p[i]);
64 put_page(p[i]);
65 }
66}
67
68/*
69 * Call with current->mm->mmap_sem held.
70 */
71static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
72 struct page **p)
73{
74 unsigned long lock_limit;
75 size_t got;
76 int ret;
77
78 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
79
80 if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
81 ret = -ENOMEM;
82 goto bail;
83 }
84
85 for (got = 0; got < num_pages; got += ret) {
86 ret = get_user_pages(current, current->mm,
87 start_page + got * PAGE_SIZE,
88 num_pages - got, 1, 1,
89 p + got, NULL);
90 if (ret < 0)
91 goto bail_release;
92 }
93
94 current->mm->pinned_vm += num_pages;
95
96 ret = 0;
97 goto bail;
98
99bail_release:
100 __hfi1_release_user_pages(p, got, 0);
101bail:
102 return ret;
103}
104
105/**
106 * hfi1_map_page - a safety wrapper around pci_map_page()
107 *
108 */
109dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
110 unsigned long offset, size_t size, int direction)
111{
112 dma_addr_t phys;
113
114 phys = pci_map_page(hwdev, page, offset, size, direction);
115
116 return phys;
117}
118
119/**
120 * hfi1_get_user_pages - lock user pages into memory
121 * @start_page: the start page
122 * @num_pages: the number of pages
123 * @p: the output page structures
124 *
125 * This function takes a given start page (page aligned user virtual
126 * address) and pins it and the following specified number of pages. For
127 * now, num_pages is always 1, but that will probably change at some point
128 * (because the caller is doing expected sends on a single virtually contiguous
129 * buffer, so we can do all pages at once).
130 */
131int hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
132 struct page **p)
133{
134 int ret;
135
136 down_write(&current->mm->mmap_sem);
137
138 ret = __hfi1_get_user_pages(start_page, num_pages, p);
139
140 up_write(&current->mm->mmap_sem);
141
142 return ret;
143}
144
145void hfi1_release_user_pages(struct page **p, size_t num_pages)
146{
147 if (current->mm) /* during close after signal, mm can be NULL */
148 down_write(&current->mm->mmap_sem);
149
150 __hfi1_release_user_pages(p, num_pages, 1);
151
152 if (current->mm) {
153 current->mm->pinned_vm -= num_pages;
154 up_write(&current->mm->mmap_sem);
155 }
156}
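/*
 * Standalone, user-space approximation (not part of this patch) of the
 * RLIMIT_MEMLOCK check done in __hfi1_get_user_pages() above: the byte
 * limit is converted to pages and compared with the number of pages to
 * pin. The CAP_IPC_LOCK bypass is omitted. Names are illustrative only.
 */
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static int may_pin_pages(size_t num_pages)
{
	struct rlimit rl;
	unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);

	if (getrlimit(RLIMIT_MEMLOCK, &rl))
		return 0;
	return num_pages <= rl.rlim_cur / page_size;
}

int main(void)
{
	printf("may pin 1 page: %d\n", may_pin_pages(1));
	return 0;
}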
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
new file mode 100644
index 000000000000..55526613a522
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -0,0 +1,1444 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/mm.h>
51#include <linux/types.h>
52#include <linux/device.h>
53#include <linux/dmapool.h>
54#include <linux/slab.h>
55#include <linux/list.h>
56#include <linux/highmem.h>
57#include <linux/io.h>
58#include <linux/uio.h>
59#include <linux/rbtree.h>
60#include <linux/spinlock.h>
61#include <linux/delay.h>
62#include <linux/kthread.h>
63#include <linux/mmu_context.h>
64#include <linux/module.h>
65#include <linux/vmalloc.h>
66
67#include "hfi.h"
68#include "sdma.h"
69#include "user_sdma.h"
70#include "sdma.h"
71#include "verbs.h" /* for the headers */
72#include "common.h" /* for struct hfi1_tid_info */
73#include "trace.h"
74
75static uint hfi1_sdma_comp_ring_size = 128;
76module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
77MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
78
79/* The maximum number of data I/O vectors per message/request */
80#define MAX_VECTORS_PER_REQ 8
81/*
82 * Maximum number of packets to send from each message/request
83 * before moving to the next one.
84 */
85#define MAX_PKTS_PER_QUEUE 16
86
87#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
88
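/*
 * Standalone sketch (not part of this patch): num_pages() above is a
 * ceiling division by the page size written via PAGE_MASK; with 4 KiB
 * pages, 1 byte and 4096 bytes both need one page and 4097 needs two.
 * The SK_* constants stand in for the kernel's PAGE_* macros.
 */
#include <assert.h>
#include <stddef.h>

#define SK_PAGE_SHIFT	12
#define SK_PAGE_SIZE	(1UL << SK_PAGE_SHIFT)
#define SK_PAGE_MASK	(~(SK_PAGE_SIZE - 1))

static size_t sk_num_pages(size_t len)	/* assumes len >= 1 */
{
	return 1 + (((len - 1) & SK_PAGE_MASK) >> SK_PAGE_SHIFT);
}

int main(void)
{
	assert(sk_num_pages(1) == 1);
	assert(sk_num_pages(4096) == 1);
	assert(sk_num_pages(4097) == 2);
	return 0;
}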
89#define req_opcode(x) \
90 (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
91#define req_version(x) \
92 (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
93#define req_iovcnt(x) \
94 (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
95
96/* Number of BTH.PSN bits used for sequence number in expected rcvs */
97#define BTH_SEQ_MASK 0x7ffull
98
99/*
100 * Define fields in the KDETH header so we can update the header
101 * template.
102 */
103#define KDETH_OFFSET_SHIFT 0
104#define KDETH_OFFSET_MASK 0x7fff
105#define KDETH_OM_SHIFT 15
106#define KDETH_OM_MASK 0x1
107#define KDETH_TID_SHIFT 16
108#define KDETH_TID_MASK 0x3ff
109#define KDETH_TIDCTRL_SHIFT 26
110#define KDETH_TIDCTRL_MASK 0x3
111#define KDETH_INTR_SHIFT 28
112#define KDETH_INTR_MASK 0x1
113#define KDETH_SH_SHIFT 29
114#define KDETH_SH_MASK 0x1
115#define KDETH_HCRC_UPPER_SHIFT 16
116#define KDETH_HCRC_UPPER_MASK 0xff
117#define KDETH_HCRC_LOWER_SHIFT 24
118#define KDETH_HCRC_LOWER_MASK 0xff
119
120#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
121#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
122
123#define KDETH_GET(val, field) \
124 (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
125#define KDETH_SET(dw, field, val) do { \
126 u32 dwval = le32_to_cpu(dw); \
127 dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
128 dwval |= (((val) & KDETH_##field##_MASK) << \
129 KDETH_##field##_SHIFT); \
130 dw = cpu_to_le32(dwval); \
131 } while (0)
132
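/*
 * Standalone sketch (not part of this patch) of the KDETH field helpers
 * defined above: a field is read with (dword >> SHIFT) & MASK and written
 * with the inverse, leaving the other fields intact. KDETH.OFFSET
 * (shift 0, mask 0x7fff) and KDETH.OM (shift 15, mask 0x1) are used as
 * the worked example; the le32 conversions are omitted here.
 */
#include <assert.h>
#include <stdint.h>

#define FIELD_GET(val, shift, mask)	(((val) >> (shift)) & (mask))
#define FIELD_SET(val, shift, mask, v) \
	(((val) & ~((uint32_t)(mask) << (shift))) | \
	 (((uint32_t)(v) & (mask)) << (shift)))

int main(void)
{
	uint32_t dw = 0;

	dw = FIELD_SET(dw, 0, 0x7fff, 0x1234);	/* KDETH.OFFSET */
	dw = FIELD_SET(dw, 15, 0x1, 1);		/* KDETH.OM */
	assert(FIELD_GET(dw, 0, 0x7fff) == 0x1234);
	assert(FIELD_GET(dw, 15, 0x1) == 1);
	return 0;
}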
133#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \
134 do { \
135 if ((idx) < ARRAY_SIZE((arr))) \
136 (arr)[(idx++)] = sdma_build_ahg_descriptor( \
137 (__force u16)(value), (dw), (bit), \
138 (width)); \
139 else \
140 return -ERANGE; \
141 } while (0)
142
143/* KDETH OM multipliers and switch over point */
144#define KDETH_OM_SMALL 4
145#define KDETH_OM_LARGE 64
146#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
147
148/* Last packet in the request */
149#define USER_SDMA_TXREQ_FLAGS_LAST_PKT (1 << 0)
150
151#define SDMA_REQ_IN_USE 0
152#define SDMA_REQ_FOR_THREAD 1
153#define SDMA_REQ_SEND_DONE 2
154#define SDMA_REQ_HAVE_AHG 3
155#define SDMA_REQ_HAS_ERROR 4
156#define SDMA_REQ_DONE_ERROR 5
157
158#define SDMA_PKT_Q_INACTIVE (1 << 0)
159#define SDMA_PKT_Q_ACTIVE (1 << 1)
160#define SDMA_PKT_Q_DEFERRED (1 << 2)
161
162/*
163 * Maximum retry attempts to submit a TX request
164 * before putting the process to sleep.
165 */
166#define MAX_DEFER_RETRY_COUNT 1
167
168static unsigned initial_pkt_count = 8;
169
170#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
171
172struct user_sdma_iovec {
173 struct iovec iov;
174 /* number of pages in this vector */
175 unsigned npages;
176 /* array of pinned pages for this vector */
177 struct page **pages;
178 /* offset into the virtual address space of the vector at
179 * which we last left off. */
180 u64 offset;
181};
182
183struct user_sdma_request {
184 struct sdma_req_info info;
185 struct hfi1_user_sdma_pkt_q *pq;
186 struct hfi1_user_sdma_comp_q *cq;
187 /* This is the original header from user space */
188 struct hfi1_pkt_header hdr;
189 /*
190 * Pointer to the SDMA engine for this request.
191 * Since different request could be on different VLs,
192 * each request will need it's own engine pointer.
193 */
194 struct sdma_engine *sde;
195 u8 ahg_idx;
196 u32 ahg[9];
197 /*
198 * KDETH.Offset (Eager) field
199 * We need to remember the initial value so the headers
200 * can be updated properly.
201 */
202 u32 koffset;
203 /*
204 * KDETH.OFFSET (TID) field
205 * The offset can cover multiple packets, depending on the
206 * size of the TID entry.
207 */
208 u32 tidoffset;
209 /*
210 * KDETH.OM
211 * Remember this because the header template always sets it
212 * to 0.
213 */
214 u8 omfactor;
215 /*
216 * pointer to the user's task_struct. We are going to
217 * get a reference to it so we can process io vectors
218 * at a later time.
219 */
220 struct task_struct *user_proc;
221 /*
222 * pointer to the user's mm_struct. We are going to
223 * get a reference to it so it doesn't get freed
224 * since we might not be in process context when we
225 * are processing the iov's.
226 * Using this mm_struct, we can get vma based on the
227 * iov's address (find_vma()).
228 */
229 struct mm_struct *user_mm;
230 /*
231 * We copy the iovs for this request (based on
232 * info.iovcnt). These are only the data vectors
233 */
234 unsigned data_iovs;
235 /* total length of the data in the request */
236 u32 data_len;
237 /* progress index moving along the iovs array */
238 unsigned iov_idx;
239 struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
240 /* number of elements copied to the tids array */
241 u16 n_tids;
242 /* TID array values copied from the tid_iov vector */
243 u32 *tids;
244 u16 tididx;
245 u32 sent;
246 u64 seqnum;
247 spinlock_t list_lock;
248 struct list_head txps;
249 unsigned long flags;
250};
251
252struct user_sdma_txreq {
253 /* Packet header for the txreq */
254 struct hfi1_pkt_header hdr;
255 struct sdma_txreq txreq;
256 struct user_sdma_request *req;
257 struct user_sdma_iovec *iovec1;
258 struct user_sdma_iovec *iovec2;
259 u16 flags;
260 unsigned busycount;
261 u64 seqnum;
262};
263
264#define SDMA_DBG(req, fmt, ...) \
265 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
266 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
267 ##__VA_ARGS__)
268#define SDMA_Q_DBG(pq, fmt, ...) \
269 hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
270 (pq)->subctxt, ##__VA_ARGS__)
271
272static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
273static int num_user_pages(const struct iovec *);
274static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
275static void user_sdma_free_request(struct user_sdma_request *);
276static int pin_vector_pages(struct user_sdma_request *,
277 struct user_sdma_iovec *);
278static void unpin_vector_pages(struct user_sdma_iovec *);
279static int check_header_template(struct user_sdma_request *,
280 struct hfi1_pkt_header *, u32, u32);
281static int set_txreq_header(struct user_sdma_request *,
282 struct user_sdma_txreq *, u32);
283static int set_txreq_header_ahg(struct user_sdma_request *,
284 struct user_sdma_txreq *, u32);
285static inline void set_comp_state(struct user_sdma_request *,
286 enum hfi1_sdma_comp_state, int);
287static inline u32 set_pkt_bth_psn(__be32, u8, u32);
288static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
289
290static int defer_packet_queue(
291 struct sdma_engine *,
292 struct iowait *,
293 struct sdma_txreq *,
294 unsigned seq);
295static void activate_packet_queue(struct iowait *, int);
296
297static inline int iovec_may_free(struct user_sdma_iovec *iovec,
298 void (*free)(struct user_sdma_iovec *))
299{
300 if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
301 free(iovec);
302 return 1;
303 }
304 return 0;
305}
306
307static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
308{
309 iovec->offset = iovec->iov.iov_len;
310}
311
312static int defer_packet_queue(
313 struct sdma_engine *sde,
314 struct iowait *wait,
315 struct sdma_txreq *txreq,
316 unsigned seq)
317{
318 struct hfi1_user_sdma_pkt_q *pq =
319 container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
320 struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
321 struct user_sdma_txreq *tx =
322 container_of(txreq, struct user_sdma_txreq, txreq);
323
324 if (sdma_progress(sde, seq, txreq)) {
325 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
326 goto eagain;
327 }
328 /*
329 * We are assuming that if the list is enqueued somewhere, it
330 * is to the dmawait list since that is the only place where
331 * it is supposed to be enqueued.
332 */
333 xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
334 write_seqlock(&dev->iowait_lock);
335 if (list_empty(&pq->busy.list))
336 list_add_tail(&pq->busy.list, &sde->dmawait);
337 write_sequnlock(&dev->iowait_lock);
338 return -EBUSY;
339eagain:
340 return -EAGAIN;
341}
342
343static void activate_packet_queue(struct iowait *wait, int reason)
344{
345 struct hfi1_user_sdma_pkt_q *pq =
346 container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
347 xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
348 wake_up(&wait->wait_dma);
349};
350
351static void sdma_kmem_cache_ctor(void *obj)
352{
353 struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj;
354
355 memset(tx, 0, sizeof(*tx));
356}
357
358int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
359{
360 int ret = 0;
361 unsigned memsize;
362 char buf[64];
363 struct hfi1_devdata *dd;
364 struct hfi1_user_sdma_comp_q *cq;
365 struct hfi1_user_sdma_pkt_q *pq;
366 unsigned long flags;
367
368 if (!uctxt || !fp) {
369 ret = -EBADF;
370 goto done;
371 }
372
373 if (!hfi1_sdma_comp_ring_size) {
374 ret = -EINVAL;
375 goto done;
376 }
377
378 dd = uctxt->dd;
379
380 pq = kzalloc(sizeof(*pq), GFP_KERNEL);
381 if (!pq) {
382 dd_dev_err(dd,
383 "[%u:%u] Failed to allocate SDMA request struct\n",
384 uctxt->ctxt, subctxt_fp(fp));
385 goto pq_nomem;
386 }
387 memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
388 pq->reqs = kmalloc(memsize, GFP_KERNEL);
389 if (!pq->reqs) {
390 dd_dev_err(dd,
391 "[%u:%u] Failed to allocate SDMA request queue (%u)\n",
392 uctxt->ctxt, subctxt_fp(fp), memsize);
393 goto pq_reqs_nomem;
394 }
395 INIT_LIST_HEAD(&pq->list);
396 pq->dd = dd;
397 pq->ctxt = uctxt->ctxt;
398 pq->subctxt = subctxt_fp(fp);
399 pq->n_max_reqs = hfi1_sdma_comp_ring_size;
400 pq->state = SDMA_PKT_Q_INACTIVE;
401 atomic_set(&pq->n_reqs, 0);
402
403 iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
404 activate_packet_queue);
405 pq->reqidx = 0;
406 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
407 subctxt_fp(fp));
408 pq->txreq_cache = kmem_cache_create(buf,
409 sizeof(struct user_sdma_txreq),
410 L1_CACHE_BYTES,
411 SLAB_HWCACHE_ALIGN,
412 sdma_kmem_cache_ctor);
413 if (!pq->txreq_cache) {
414 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
415 uctxt->ctxt);
416 goto pq_txreq_nomem;
417 }
418 user_sdma_pkt_fp(fp) = pq;
419 cq = kzalloc(sizeof(*cq), GFP_KERNEL);
420 if (!cq) {
421 dd_dev_err(dd,
422 "[%u:%u] Failed to allocate SDMA completion queue\n",
423 uctxt->ctxt, subctxt_fp(fp));
424 goto cq_nomem;
425 }
426
427 memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size,
428 PAGE_SIZE);
429 cq->comps = vmalloc_user(memsize);
430 if (!cq->comps) {
431 dd_dev_err(dd,
432 "[%u:%u] Failed to allocate SDMA completion queue entries\n",
433 uctxt->ctxt, subctxt_fp(fp));
434 goto cq_comps_nomem;
435 }
436 cq->nentries = hfi1_sdma_comp_ring_size;
437 user_sdma_comp_fp(fp) = cq;
438
439 spin_lock_irqsave(&uctxt->sdma_qlock, flags);
440 list_add(&pq->list, &uctxt->sdma_queues);
441 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
442 goto done;
443
444cq_comps_nomem:
445 kfree(cq);
446cq_nomem:
447 kmem_cache_destroy(pq->txreq_cache);
448pq_txreq_nomem:
449 kfree(pq->reqs);
450pq_reqs_nomem:
451 kfree(pq);
452 user_sdma_pkt_fp(fp) = NULL;
453pq_nomem:
454 ret = -ENOMEM;
455done:
456 return ret;
457}
458
459int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
460{
461 struct hfi1_ctxtdata *uctxt = fd->uctxt;
462 struct hfi1_user_sdma_pkt_q *pq;
463 unsigned long flags;
464
465 hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
466 uctxt->ctxt, fd->subctxt);
467 pq = fd->pq;
468 if (pq) {
469 u16 i, j;
470
471 spin_lock_irqsave(&uctxt->sdma_qlock, flags);
472 if (!list_empty(&pq->list))
473 list_del_init(&pq->list);
474 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
475 iowait_sdma_drain(&pq->busy);
476 if (pq->reqs) {
477 for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) &&
478 j < pq->n_max_reqs; j++) {
479 struct user_sdma_request *req = &pq->reqs[j];
480
481 if (test_bit(SDMA_REQ_IN_USE, &req->flags)) {
482 set_comp_state(req, ERROR, -ECOMM);
483 user_sdma_free_request(req);
484 i++;
485 }
486 }
487 kfree(pq->reqs);
488 }
489 if (pq->txreq_cache)
490 kmem_cache_destroy(pq->txreq_cache);
491 kfree(pq);
492 fd->pq = NULL;
493 }
494 if (fd->cq) {
495 if (fd->cq->comps)
496 vfree(fd->cq->comps);
497 kfree(fd->cq);
498 fd->cq = NULL;
499 }
500 return 0;
501}
502
503int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
504 unsigned long dim, unsigned long *count)
505{
506 int ret = 0, i = 0, sent;
507 struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
508 struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp);
509 struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp);
510 struct hfi1_devdata *dd = pq->dd;
511 unsigned long idx = 0;
512 u8 pcount = initial_pkt_count;
513 struct sdma_req_info info;
514 struct user_sdma_request *req;
515 u8 opcode, sc, vl;
516
517 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
518 hfi1_cdbg(
519 SDMA,
520 "[%u:%u:%u] First vector not big enough for header %lu/%lu",
521 dd->unit, uctxt->ctxt, subctxt_fp(fp),
522 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
523 ret = -EINVAL;
524 goto done;
525 }
526 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
527 if (ret) {
528 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
529 dd->unit, uctxt->ctxt, subctxt_fp(fp), ret);
530 ret = -EFAULT;
531 goto done;
532 }
533 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp),
534 (u16 *)&info);
535 if (cq->comps[info.comp_idx].status == QUEUED) {
536 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
537 dd->unit, uctxt->ctxt, subctxt_fp(fp),
538 info.comp_idx);
539 ret = -EBADSLT;
540 goto done;
541 }
542 if (!info.fragsize) {
543 hfi1_cdbg(SDMA,
544 "[%u:%u:%u:%u] Request does not specify fragsize",
545 dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
546 ret = -EINVAL;
547 goto done;
548 }
549 /*
550 * We've done all the safety checks that we can up to this point,
551	 * so "allocate" the request entry.
552 */
553 hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
554 uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
555 req = pq->reqs + info.comp_idx;
556 memset(req, 0, sizeof(*req));
557 /* Mark the request as IN_USE before we start filling it in. */
558 set_bit(SDMA_REQ_IN_USE, &req->flags);
559 req->data_iovs = req_iovcnt(info.ctrl) - 1;
560 req->pq = pq;
561 req->cq = cq;
562 INIT_LIST_HEAD(&req->txps);
563 spin_lock_init(&req->list_lock);
564 memcpy(&req->info, &info, sizeof(info));
565
566 if (req_opcode(info.ctrl) == EXPECTED)
567 req->data_iovs--;
568
569 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
570 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
571 MAX_VECTORS_PER_REQ);
572 ret = -EINVAL;
573 goto done;
574 }
575 /* Copy the header from the user buffer */
576 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
577 sizeof(req->hdr));
578 if (ret) {
579 SDMA_DBG(req, "Failed to copy header template (%d)", ret);
580 ret = -EFAULT;
581 goto free_req;
582 }
583
584 /* If Static rate control is not enabled, sanitize the header. */
585 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
586 req->hdr.pbc[2] = 0;
587
588 /* Validate the opcode. Do not trust packets from user space blindly. */
589 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
590 if ((opcode & USER_OPCODE_CHECK_MASK) !=
591 USER_OPCODE_CHECK_VAL) {
592 SDMA_DBG(req, "Invalid opcode (%d)", opcode);
593 ret = -EINVAL;
594 goto free_req;
595 }
596 /*
597 * Validate the vl. Do not trust packets from user space blindly.
598 * VL comes from PBC, SC comes from LRH, and the VL needs to
599 * match the SC look up.
600	 * match the SC lookup.
601 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
602 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
603 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
604 if (vl >= dd->pport->vls_operational ||
605 vl != sc_to_vlt(dd, sc)) {
606 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
607 ret = -EINVAL;
608 goto free_req;
609 }
610
611 /*
612	 * We should also check the BTH.lnh; if it says the next header is GRH, then
613 * the RXE parsing will be off and will land in the middle of the KDETH
614 * or miss it entirely.
615 */
616 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
617 SDMA_DBG(req, "User tried to pass in a GRH");
618 ret = -EINVAL;
619 goto free_req;
620 }
621
622 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
623 /* Calculate the initial TID offset based on the values of
624 KDETH.OFFSET and KDETH.OM that are passed in. */
625 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
626 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
627 KDETH_OM_LARGE : KDETH_OM_SMALL);
628 SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
629 idx++;
630
631 /* Save all the IO vector structures */
632 while (i < req->data_iovs) {
633 memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
634 req->iovs[i].offset = 0;
635 req->data_len += req->iovs[i++].iov.iov_len;
636 }
637 SDMA_DBG(req, "total data length %u", req->data_len);
638
639 if (pcount > req->info.npkts)
640 pcount = req->info.npkts;
641 /*
642 * Copy any TID info
643 * User space will provide the TID info only when the
644 * request type is EXPECTED. This is true even if there is
645 * only one packet in the request and the header is already
646	 * set up. The reason for the singular TID case is that the
647 * driver needs to perform safety checks.
648 */
649 if (req_opcode(req->info.ctrl) == EXPECTED) {
650 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
651
652 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
653 ret = -EINVAL;
654 goto free_req;
655 }
656 req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
657 if (!req->tids) {
658 ret = -ENOMEM;
659 goto free_req;
660 }
661 /*
662 * We have to copy all of the tids because they may vary
663 * in size and, therefore, the TID count might not be
664 * equal to the pkt count. However, there is no way to
665 * tell at this point.
666 */
667 ret = copy_from_user(req->tids, iovec[idx].iov_base,
668 ntids * sizeof(*req->tids));
669 if (ret) {
670 SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
671 ntids, ret);
672 ret = -EFAULT;
673 goto free_req;
674 }
675 req->n_tids = ntids;
676 idx++;
677 }
678
679 /* Have to select the engine */
680 req->sde = sdma_select_engine_vl(dd,
681 (u32)(uctxt->ctxt + subctxt_fp(fp)),
682 vl);
683 if (!req->sde || !sdma_running(req->sde)) {
684 ret = -ECOMM;
685 goto free_req;
686 }
687
688 /* We don't need an AHG entry if the request contains only one packet */
689 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
690 int ahg = sdma_ahg_alloc(req->sde);
691
692 if (likely(ahg >= 0)) {
693 req->ahg_idx = (u8)ahg;
694 set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
695 }
696 }
697
698 set_comp_state(req, QUEUED, 0);
699 /* Send the first N packets in the request to buy us some time */
700 sent = user_sdma_send_pkts(req, pcount);
701 if (unlikely(sent < 0)) {
702 if (sent != -EBUSY) {
703 ret = sent;
704 goto send_err;
705 } else
706 sent = 0;
707 }
708 atomic_inc(&pq->n_reqs);
709
710 if (sent < req->info.npkts) {
711 /* Take the references to the user's task and mm_struct */
712 get_task_struct(current);
713 req->user_proc = current;
714
715 /*
716 * This is a somewhat blocking send implementation.
717 * The driver will block the caller until all packets of the
718 * request have been submitted to the SDMA engine. However, it
719 * will not wait for send completions.
720 */
721 while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
722 ret = user_sdma_send_pkts(req, pcount);
723 if (ret < 0) {
724 if (ret != -EBUSY)
725 goto send_err;
726 wait_event_interruptible_timeout(
727 pq->busy.wait_dma,
728 (pq->state == SDMA_PKT_Q_ACTIVE),
729 msecs_to_jiffies(
730 SDMA_IOWAIT_TIMEOUT));
731 }
732 }
733
734 }
735 ret = 0;
736 *count += idx;
737 goto done;
738send_err:
739 set_comp_state(req, ERROR, ret);
740free_req:
741 user_sdma_free_request(req);
742done:
743 return ret;
744}
745
746static inline u32 compute_data_length(struct user_sdma_request *req,
747 struct user_sdma_txreq *tx)
748{
749 /*
750 * Determine the proper size of the packet data.
751 * The size of the data of the first packet is in the header
752 * template. However, it includes the header and ICRC, which need
753 * to be subtracted.
754 * The size of the remaining packets is the minimum of the frag
755 * size (MTU) or remaining data in the request.
756 */
757 u32 len;
758
759 if (!req->seqnum) {
760 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
761 (sizeof(tx->hdr) - 4));
762 } else if (req_opcode(req->info.ctrl) == EXPECTED) {
763 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
764 PAGE_SIZE;
765 /* Get the data length based on the remaining space in the
766 * TID pair. */
767 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
768 /* If we've filled up the TID pair, move to the next one. */
769 if (unlikely(!len) && ++req->tididx < req->n_tids &&
770 req->tids[req->tididx]) {
771 tidlen = EXP_TID_GET(req->tids[req->tididx],
772 LEN) * PAGE_SIZE;
773 req->tidoffset = 0;
774 len = min_t(u32, tidlen, req->info.fragsize);
775 }
776 /* Since the TID pairs map entire pages, make sure that we
777		 * are not going to try to send more data than we have
778 * remaining. */
779 len = min(len, req->data_len - req->sent);
780 } else
781 len = min(req->data_len - req->sent, (u32)req->info.fragsize);
782 SDMA_DBG(req, "Data Length = %u", len);
783 return len;
784}
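Illustration (not part of the patch): the eager-packet branch of compute_data_length() above reduces to min(remaining request bytes, fragment size). A minimal userspace sketch of that arithmetic, using a hypothetical req_sketch struct in place of the driver's request fields:

#include <stdio.h>

/* Hypothetical stand-ins for the request fields the eager path uses. */
struct req_sketch {
	unsigned data_len;	/* total payload bytes in the request */
	unsigned sent;		/* payload bytes already queued */
	unsigned fragsize;	/* MTU-sized fragment limit */
};

static unsigned eager_pkt_len(const struct req_sketch *r)
{
	unsigned remaining = r->data_len - r->sent;

	return remaining < r->fragsize ? remaining : r->fragsize;
}

int main(void)
{
	struct req_sketch r = { .data_len = 10000, .sent = 8192, .fragsize = 4096 };

	/* The last packet carries only the 1808 remaining bytes. */
	printf("%u\n", eager_pkt_len(&r));
	return 0;
}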
785
786static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
787{
788 /* (Size of complete header - size of PBC) + 4B ICRC + data length */
789 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
790}
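Illustration (not part of the patch): get_lrh_len() adds the on-wire header (the full software header minus the PBC, which the send engine consumes), the 4-byte ICRC, and the payload. A sketch with made-up header sizes; the real code uses sizeof() on the driver's header structs:

#include <stdio.h>

/* Illustrative sizes only; the driver derives these from its header structs. */
#define HDR_BYTES	72u	/* full software header, including the PBC */
#define PBC_BYTES	8u	/* the PBC is consumed by the send engine, not sent */
#define ICRC_BYTES	4u

static unsigned lrh_len(unsigned payload)
{
	return (HDR_BYTES - PBC_BYTES) + ICRC_BYTES + payload;
}

int main(void)
{
	/* 4 KiB of payload: 64B of wire header + 4B ICRC + 4096B of data. */
	printf("%u bytes on the wire\n", lrh_len(4096));
	return 0;
}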
791
792static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
793{
794 int ret = 0;
795 unsigned npkts = 0;
796 struct user_sdma_txreq *tx = NULL;
797 struct hfi1_user_sdma_pkt_q *pq = NULL;
798 struct user_sdma_iovec *iovec = NULL;
799
800 if (!req->pq) {
801 ret = -EINVAL;
802 goto done;
803 }
804
805 pq = req->pq;
806
807 /*
808 * Check if we might have sent the entire request already
809 */
810 if (unlikely(req->seqnum == req->info.npkts)) {
811 if (!list_empty(&req->txps))
812 goto dosend;
813 goto done;
814 }
815
816 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
817 maxpkts = req->info.npkts - req->seqnum;
818
819 while (npkts < maxpkts) {
820 u32 datalen = 0, queued = 0, data_sent = 0;
821 u64 iov_offset = 0;
822
823 /*
824 * Check whether any of the completions have come back
825 * with errors. If so, we are not going to process any
826 * more packets from this request.
827 */
828 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
829 set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
830 ret = -EFAULT;
831 goto done;
832 }
833
834 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
835 if (!tx) {
836 ret = -ENOMEM;
837 goto done;
838 }
839 tx->flags = 0;
840 tx->req = req;
841 tx->busycount = 0;
842 tx->iovec1 = NULL;
843 tx->iovec2 = NULL;
844
845 if (req->seqnum == req->info.npkts - 1)
846 tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
847
848 /*
849 * Calculate the payload size - this is min of the fragment
850 * (MTU) size or the remaining bytes in the request but only
851 * if we have payload data.
852 */
853 if (req->data_len) {
854 iovec = &req->iovs[req->iov_idx];
855 if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
856 if (++req->iov_idx == req->data_iovs) {
857 ret = -EFAULT;
858 goto free_txreq;
859 }
860 iovec = &req->iovs[req->iov_idx];
861 WARN_ON(iovec->offset);
862 }
863
864 /*
865 * This request might include only a header and no user
866			 * data, so pin pages only if there is data and the
867 * pages have not been pinned already.
868 */
869 if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
870 ret = pin_vector_pages(req, iovec);
871 if (ret)
872 goto free_tx;
873 }
874
875 tx->iovec1 = iovec;
876 datalen = compute_data_length(req, tx);
877 if (!datalen) {
878 SDMA_DBG(req,
879 "Request has data but pkt len is 0");
880 ret = -EFAULT;
881 goto free_tx;
882 }
883 }
884
885 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
886 if (!req->seqnum) {
887 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
888 u32 lrhlen = get_lrh_len(req->hdr, datalen);
889 /*
890 * Copy the request header into the tx header
891 * because the HW needs a cacheline-aligned
892 * address.
893 * This copy can be optimized out if the hdr
894 * member of user_sdma_request were also
895 * cacheline aligned.
896 */
897 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
898 if (PBC2LRH(pbclen) != lrhlen) {
899 pbclen = (pbclen & 0xf000) |
900 LRH2PBC(lrhlen);
901 tx->hdr.pbc[0] = cpu_to_le16(pbclen);
902 }
903 ret = sdma_txinit_ahg(&tx->txreq,
904 SDMA_TXREQ_F_AHG_COPY,
905 sizeof(tx->hdr) + datalen,
906 req->ahg_idx, 0, NULL, 0,
907 user_sdma_txreq_cb);
908 if (ret)
909 goto free_tx;
910 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
911 &tx->hdr,
912 sizeof(tx->hdr));
913 if (ret)
914 goto free_txreq;
915 } else {
916 int changes;
917
918 changes = set_txreq_header_ahg(req, tx,
919 datalen);
920 if (changes < 0)
921 goto free_tx;
922 sdma_txinit_ahg(&tx->txreq,
923 SDMA_TXREQ_F_USE_AHG,
924 datalen, req->ahg_idx, changes,
925 req->ahg, sizeof(req->hdr),
926 user_sdma_txreq_cb);
927 }
928 } else {
929 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
930 datalen, user_sdma_txreq_cb);
931 if (ret)
932 goto free_tx;
933 /*
934 * Modify the header for this packet. This only needs
935 * to be done if we are not going to use AHG. Otherwise,
936 * the HW will do it based on the changes we gave it
937 * during sdma_txinit_ahg().
938 */
939 ret = set_txreq_header(req, tx, datalen);
940 if (ret)
941 goto free_txreq;
942 }
943
944 /*
945 * If the request contains any data vectors, add up to
946 * fragsize bytes to the descriptor.
947 */
948 while (queued < datalen &&
949 (req->sent + data_sent) < req->data_len) {
950 unsigned long base, offset;
951 unsigned pageidx, len;
952
953 base = (unsigned long)iovec->iov.iov_base;
954 offset = ((base + iovec->offset + iov_offset) &
955 ~PAGE_MASK);
956 pageidx = (((iovec->offset + iov_offset +
957 base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
958 len = offset + req->info.fragsize > PAGE_SIZE ?
959 PAGE_SIZE - offset : req->info.fragsize;
960 len = min((datalen - queued), len);
961 ret = sdma_txadd_page(pq->dd, &tx->txreq,
962 iovec->pages[pageidx],
963 offset, len);
964 if (ret) {
965 dd_dev_err(pq->dd,
966 "SDMA txreq add page failed %d\n",
967 ret);
968 iovec_set_complete(iovec);
969 goto free_txreq;
970 }
971 iov_offset += len;
972 queued += len;
973 data_sent += len;
974 if (unlikely(queued < datalen &&
975 pageidx == iovec->npages &&
976 req->iov_idx < req->data_iovs - 1)) {
977 iovec->offset += iov_offset;
978 iovec = &req->iovs[++req->iov_idx];
979 if (!iovec->pages) {
980 ret = pin_vector_pages(req, iovec);
981 if (ret)
982 goto free_txreq;
983 }
984 iov_offset = 0;
985 tx->iovec2 = iovec;
986
987 }
988 }
989 /*
990 * The txreq was submitted successfully so we can update
991 * the counters.
992 */
993 req->koffset += datalen;
994 if (req_opcode(req->info.ctrl) == EXPECTED)
995 req->tidoffset += datalen;
996 req->sent += data_sent;
997 if (req->data_len) {
998 if (tx->iovec1 && !tx->iovec2)
999 tx->iovec1->offset += iov_offset;
1000 else if (tx->iovec2)
1001 tx->iovec2->offset += iov_offset;
1002 }
1003 /*
1004 * It is important to increment this here as it is used to
1005 * generate the BTH.PSN and, therefore, can't be bulk-updated
1006 * outside of the loop.
1007 */
1008 tx->seqnum = req->seqnum++;
1009 list_add_tail(&tx->txreq.list, &req->txps);
1010 npkts++;
1011 }
1012dosend:
1013 ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
1014 if (list_empty(&req->txps))
1015 if (req->seqnum == req->info.npkts) {
1016 set_bit(SDMA_REQ_SEND_DONE, &req->flags);
1017 /*
1018 * The txreq has already been submitted to the HW queue
1019 * so we can free the AHG entry now. Corruption will not
1020 * happen due to the sequential manner in which
1021 * descriptors are processed.
1022 */
1023 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
1024 sdma_ahg_free(req->sde, req->ahg_idx);
1025 }
1026 goto done;
1027free_txreq:
1028 sdma_txclean(pq->dd, &tx->txreq);
1029free_tx:
1030 kmem_cache_free(pq->txreq_cache, tx);
1031done:
1032 return ret;
1033}
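Illustration (not part of the patch): the descriptor loop in user_sdma_send_pkts() slices the user buffer so each sdma_txadd_page() call stops at a page boundary, at the fragment (MTU) limit, or at the end of the packet payload, whichever comes first. A standalone sketch of that chunking, assuming 4 KiB pages and made-up buffer values:

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned chunk_len(unsigned long off_in_page, unsigned fragsize,
			  unsigned remaining)
{
	unsigned len = off_in_page + fragsize > PAGE_SIZE ?
		       PAGE_SIZE - (unsigned)off_in_page : fragsize;

	return len < remaining ? len : remaining;
}

int main(void)
{
	unsigned long base = 0x1F00;	/* unaligned start of a user buffer */
	unsigned datalen = 8192, queued = 0, fragsize = 8192;

	while (queued < datalen) {
		unsigned long off = (base + queued) & (PAGE_SIZE - 1);
		unsigned len = chunk_len(off, fragsize, datalen - queued);

		/* Prints chunks of 256, 4096, and 3840 bytes. */
		printf("chunk at page offset %#lx, len %u\n", off, len);
		queued += len;
	}
	return 0;
}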
1034
1035/*
1036 * How many pages in this iovec element?
1037 */
1038static inline int num_user_pages(const struct iovec *iov)
1039{
1040 const unsigned long addr = (unsigned long) iov->iov_base;
1041 const unsigned long len = iov->iov_len;
1042 const unsigned long spage = addr & PAGE_MASK;
1043 const unsigned long epage = (addr + len - 1) & PAGE_MASK;
1044
1045 return 1 + ((epage - spage) >> PAGE_SHIFT);
1046}
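Illustration (not part of the patch): num_user_pages() rounds the start and end addresses down to page boundaries and counts the pages spanned. The same arithmetic as a standalone userspace check (PAGE_SIZE assumed to be 4 KiB here):

#include <assert.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Same arithmetic as num_user_pages(), on raw addr/len values. */
static unsigned long pages_spanned(unsigned long addr, unsigned long len)
{
	unsigned long spage = addr & PAGE_MASK;
	unsigned long epage = (addr + len - 1) & PAGE_MASK;

	return 1 + ((epage - spage) >> PAGE_SHIFT);
}

int main(void)
{
	/* A 100-byte buffer straddling a page boundary spans two pages. */
	assert(pages_spanned(0x1FC0, 100) == 2);
	/* A full page starting on a page boundary spans exactly one. */
	assert(pages_spanned(0x2000, PAGE_SIZE) == 1);
	return 0;
}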
1047
1048static int pin_vector_pages(struct user_sdma_request *req,
1049 struct user_sdma_iovec *iovec) {
1050 int ret = 0;
1051 unsigned pinned;
1052
1053 iovec->npages = num_user_pages(&iovec->iov);
1054 iovec->pages = kzalloc(sizeof(*iovec->pages) *
1055 iovec->npages, GFP_KERNEL);
1056 if (!iovec->pages) {
1057 SDMA_DBG(req, "Failed page array alloc");
1058 ret = -ENOMEM;
1059 goto done;
1060 }
1061 /* If called by the kernel thread, use the user's mm */
1062 if (current->flags & PF_KTHREAD)
1063 use_mm(req->user_proc->mm);
1064 pinned = get_user_pages_fast(
1065 (unsigned long)iovec->iov.iov_base,
1066 iovec->npages, 0, iovec->pages);
1067 /* If called by the kernel thread, unuse the user's mm */
1068 if (current->flags & PF_KTHREAD)
1069 unuse_mm(req->user_proc->mm);
1070 if (pinned != iovec->npages) {
1071 SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned,
1072 iovec->npages);
1073 ret = -EFAULT;
1074 goto pfree;
1075 }
1076 goto done;
1077pfree:
1078 unpin_vector_pages(iovec);
1079done:
1080 return ret;
1081}
1082
1083static void unpin_vector_pages(struct user_sdma_iovec *iovec)
1084{
1085 unsigned i;
1086
1087 if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) {
1088 hfi1_cdbg(SDMA,
1089 "the complete vector has not been sent yet %llu %zu",
1090 iovec->offset, iovec->iov.iov_len);
1091 return;
1092 }
1093 for (i = 0; i < iovec->npages; i++)
1094 if (iovec->pages[i])
1095 put_page(iovec->pages[i]);
1096 kfree(iovec->pages);
1097 iovec->pages = NULL;
1098 iovec->npages = 0;
1099 iovec->offset = 0;
1100}
1101
1102static int check_header_template(struct user_sdma_request *req,
1103 struct hfi1_pkt_header *hdr, u32 lrhlen,
1104 u32 datalen)
1105{
1106 /*
1107 * Perform safety checks for any type of packet:
1117	 *	- transfer size is a multiple of 64 bytes
1118	 *	- packet length is a multiple of 4 bytes
1119	 *	- entire request length is a multiple of 4 bytes
1111 * - packet length is not larger than MTU size
1112 *
1113 * These checks are only done for the first packet of the
1114 * transfer since the header is "given" to us by user space.
1115 * For the remainder of the packets we compute the values.
1116 */
1117 if (req->info.fragsize % PIO_BLOCK_SIZE ||
1118 lrhlen & 0x3 || req->data_len & 0x3 ||
1119 lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1120 return -EINVAL;
1121
1122 if (req_opcode(req->info.ctrl) == EXPECTED) {
1123 /*
1124 * The header is checked only on the first packet. Furthermore,
1125 * we ensure that at least one TID entry is copied when the
1126 * request is submitted. Therefore, we don't have to verify that
1127 * tididx points to something sane.
1128 */
1129 u32 tidval = req->tids[req->tididx],
1130 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1131 tididx = EXP_TID_GET(tidval, IDX),
1132 tidctrl = EXP_TID_GET(tidval, CTRL),
1133 tidoff;
1134 __le32 kval = hdr->kdeth.ver_tid_offset;
1135
1136 tidoff = KDETH_GET(kval, OFFSET) *
1137 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1138 KDETH_OM_LARGE : KDETH_OM_SMALL);
1139 /*
1140 * Expected receive packets have the following
1141 * additional checks:
1142 * - offset is not larger than the TID size
1143 * - TIDCtrl values match between header and TID array
1144 * - TID indexes match between header and TID array
1145 */
1146 if ((tidoff + datalen > tidlen) ||
1147 KDETH_GET(kval, TIDCTRL) != tidctrl ||
1148 KDETH_GET(kval, TID) != tididx)
1149 return -EINVAL;
1150 }
1151 return 0;
1152}
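Illustration (not part of the patch): the first-packet checks above are plain alignment tests: the fragment size must be a whole number of 64-byte blocks, and the packet/request lengths must be 4-byte aligned and no larger than the MTU-derived maximum. A small standalone version of those tests (the 64-byte block size follows the comment above; the values in main() are made up):

#include <stdbool.h>
#include <stdio.h>

#define BLOCK_SIZE 64u	/* per the comment above: transfers are 64-byte blocks */

static bool lengths_ok(unsigned fragsize, unsigned lrhlen,
		       unsigned data_len, unsigned max_lrhlen)
{
	if (fragsize % BLOCK_SIZE)	/* transfer size: whole blocks */
		return false;
	if (lrhlen & 0x3)		/* packet length: 4-byte aligned */
		return false;
	if (data_len & 0x3)		/* request length: 4-byte aligned */
		return false;
	if (lrhlen > max_lrhlen)	/* packet no larger than MTU allows */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", lengths_ok(8192, 128, 65536, 8252));	/* 1 */
	printf("%d\n", lengths_ok(8200, 128, 65536, 8252));	/* 0: bad fragsize */
	return 0;
}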
1153
1154/*
1155 * Correctly set the BTH.PSN field based on type of
1156 * transfer - eager packets can just increment the PSN but
1157 * expected packets encode generation and sequence in the
1158 * BTH.PSN field so just incrementing will result in errors.
1159 */
1160static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1161{
1162 u32 val = be32_to_cpu(bthpsn),
1163 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1164 0xffffffull),
1165 psn = val & mask;
1166 if (expct)
1167 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
1168 else
1169 psn = psn + frags;
1170 return psn & mask;
1171}
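Illustration (not part of the patch): for eager traffic the PSN simply advances by the packet count, while for expected traffic only the low-order sequence bits advance so the generation bits are preserved. A userspace sketch of that split; the 24-bit PSN mask matches the non-extended case above, and the sequence-field width used here is an assumption for illustration:

#include <stdio.h>

#define PSN_MASK	0xffffffu	/* non-extended 24-bit PSN */
#define SEQ_MASK	0x7ffu		/* assumed sequence-field width */

static unsigned next_psn(unsigned psn, int expected, unsigned frags)
{
	psn &= PSN_MASK;
	if (expected)
		/* advance only the sequence bits, keep the generation bits */
		psn = (psn & ~SEQ_MASK) | ((psn + frags) & SEQ_MASK);
	else
		psn += frags;
	return psn & PSN_MASK;
}

int main(void)
{
	printf("eager:    0x%06x\n", next_psn(0xffe, 0, 4));	/* 0x001002 */
	printf("expected: 0x%06x\n", next_psn(0xffe, 1, 4));	/* 0x000a02 */
	return 0;
}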
1172
1173static int set_txreq_header(struct user_sdma_request *req,
1174 struct user_sdma_txreq *tx, u32 datalen)
1175{
1176 struct hfi1_user_sdma_pkt_q *pq = req->pq;
1177 struct hfi1_pkt_header *hdr = &tx->hdr;
1178 u16 pbclen;
1179 int ret;
1180 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
1181
1182 /* Copy the header template to the request before modification */
1183 memcpy(hdr, &req->hdr, sizeof(*hdr));
1184
1185 /*
1186 * Check if the PBC and LRH length are mismatched. If so
1187 * adjust both in the header.
1188 */
1189 pbclen = le16_to_cpu(hdr->pbc[0]);
1190 if (PBC2LRH(pbclen) != lrhlen) {
1191 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1192 hdr->pbc[0] = cpu_to_le16(pbclen);
1193 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1194 /*
1195 * Third packet
1196 * This is the first packet in the sequence that has
1197 * a "static" size that can be used for the rest of
1198 * the packets (besides the last one).
1199 */
1200 if (unlikely(req->seqnum == 2)) {
1201 /*
1202 * From this point on the lengths in both the
1203 * PBC and LRH are the same until the last
1204 * packet.
1205 * Adjust the template so we don't have to update
1206 * every packet
1207 */
1208 req->hdr.pbc[0] = hdr->pbc[0];
1209 req->hdr.lrh[2] = hdr->lrh[2];
1210 }
1211 }
1212 /*
1213 * We only have to modify the header if this is not the
1214 * first packet in the request. Otherwise, we use the
1215 * header given to us.
1216 */
1217 if (unlikely(!req->seqnum)) {
1218 ret = check_header_template(req, hdr, lrhlen, datalen);
1219 if (ret)
1220 return ret;
1221 goto done;
1222
1223 }
1224
1225 hdr->bth[2] = cpu_to_be32(
1226 set_pkt_bth_psn(hdr->bth[2],
1227 (req_opcode(req->info.ctrl) == EXPECTED),
1228 req->seqnum));
1229
1230 /* Set ACK request on last packet */
1231 if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
1232 hdr->bth[2] |= cpu_to_be32(1UL<<31);
1233
1234 /* Set the new offset */
1235 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1236 /* Expected packets have to fill in the new TID information */
1237 if (req_opcode(req->info.ctrl) == EXPECTED) {
1238 tidval = req->tids[req->tididx];
1239 /*
1240 * If the offset puts us at the end of the current TID,
1241 * advance everything.
1242 */
1243 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1244 PAGE_SIZE)) {
1245 req->tidoffset = 0;
1246			/* Since we don't copy all the TIDs all at once,
1247 * we have to check again. */
1248 if (++req->tididx > req->n_tids - 1 ||
1249 !req->tids[req->tididx]) {
1250 return -EINVAL;
1251 }
1252 tidval = req->tids[req->tididx];
1253 }
1254 req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1255 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
1256 /* Set KDETH.TIDCtrl based on value for this TID. */
1257 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1258 EXP_TID_GET(tidval, CTRL));
1259 /* Set KDETH.TID based on value for this TID */
1260 KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1261 EXP_TID_GET(tidval, IDX));
1262 /* Clear KDETH.SH only on the last packet */
1263 if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
1264 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1265 /*
1266 * Set the KDETH.OFFSET and KDETH.OM based on size of
1267 * transfer.
1268 */
1269 SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
1270 req->tidoffset, req->tidoffset / req->omfactor,
1271 !!(req->omfactor - KDETH_OM_SMALL));
1272 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1273 req->tidoffset / req->omfactor);
1274 KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1275 !!(req->omfactor - KDETH_OM_SMALL));
1276 }
1277done:
1278 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1279 req->info.comp_idx, hdr, tidval);
1280 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1281}
1282
1283static int set_txreq_header_ahg(struct user_sdma_request *req,
1284 struct user_sdma_txreq *tx, u32 len)
1285{
1286 int diff = 0;
1287 struct hfi1_user_sdma_pkt_q *pq = req->pq;
1288 struct hfi1_pkt_header *hdr = &req->hdr;
1289 u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1290 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
1291
1292 if (PBC2LRH(pbclen) != lrhlen) {
1293 /* PBC.PbcLengthDWs */
1294 AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
1295 cpu_to_le16(LRH2PBC(lrhlen)));
1296 /* LRH.PktLen (we need the full 16 bits due to byte swap) */
1297 AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
1298 cpu_to_be16(lrhlen >> 2));
1299 }
1300
1301 /*
1302 * Do the common updates
1303 */
1304 /* BTH.PSN and BTH.A */
1305 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1306 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1307 if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
1308 val32 |= 1UL << 31;
1309 AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
1310 AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
1311 /* KDETH.Offset */
1312 AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
1313 cpu_to_le16(req->koffset & 0xffff));
1314 AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
1315 cpu_to_le16(req->koffset >> 16));
1316 if (req_opcode(req->info.ctrl) == EXPECTED) {
1317 __le16 val;
1318
1319 tidval = req->tids[req->tididx];
1320
1321 /*
1322 * If the offset puts us at the end of the current TID,
1323 * advance everything.
1324 */
1325 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1326 PAGE_SIZE)) {
1327 req->tidoffset = 0;
1328		/* Since we don't copy all the TIDs all at once,
1329 * we have to check again. */
1330 if (++req->tididx > req->n_tids - 1 ||
1331 !req->tids[req->tididx]) {
1332 return -EINVAL;
1333 }
1334 tidval = req->tids[req->tididx];
1335 }
1336 req->omfactor = ((EXP_TID_GET(tidval, LEN) *
1337 PAGE_SIZE) >=
1338 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
1339 KDETH_OM_SMALL;
1340 /* KDETH.OM and KDETH.OFFSET (TID) */
1341 AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
1342 ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
1343 ((req->tidoffset / req->omfactor) & 0x7fff)));
1344 /* KDETH.TIDCtrl, KDETH.TID */
1345 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1346 (EXP_TID_GET(tidval, IDX) & 0x3ff));
1347 /* Clear KDETH.SH on last packet */
1348 if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
1349 val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
1350 INTR) >> 16);
1351 val &= cpu_to_le16(~(1U << 13));
1352 AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
1353 } else
1354 AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
1355 }
1356
1357 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1358 req->info.comp_idx, req->sde->this_idx,
1359 req->ahg_idx, req->ahg, diff, tidval);
1360 return diff;
1361}
1362
1363static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
1364 int drain)
1365{
1366 struct user_sdma_txreq *tx =
1367 container_of(txreq, struct user_sdma_txreq, txreq);
1368 struct user_sdma_request *req = tx->req;
1369 struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL;
1370 u64 tx_seqnum;
1371
1372 if (unlikely(!req || !pq))
1373 return;
1374
1375 if (tx->iovec1)
1376 iovec_may_free(tx->iovec1, unpin_vector_pages);
1377 if (tx->iovec2)
1378 iovec_may_free(tx->iovec2, unpin_vector_pages);
1379
1380 tx_seqnum = tx->seqnum;
1381 kmem_cache_free(pq->txreq_cache, tx);
1382
1383 if (status != SDMA_TXREQ_S_OK) {
1384 dd_dev_err(pq->dd, "SDMA completion with error %d", status);
1385 set_comp_state(req, ERROR, status);
1386 set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
1387 /* Do not free the request until the sender loop has ack'ed
1388 * the error and we've seen all txreqs. */
1389 if (tx_seqnum == ACCESS_ONCE(req->seqnum) &&
1390 test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
1391 atomic_dec(&pq->n_reqs);
1392 user_sdma_free_request(req);
1393 }
1394 } else {
1395 if (tx_seqnum == req->info.npkts - 1) {
1396 /* We've sent and completed all packets in this
1397 * request. Signal completion to the user */
1398 atomic_dec(&pq->n_reqs);
1399 set_comp_state(req, COMPLETE, 0);
1400 user_sdma_free_request(req);
1401 }
1402 }
1403 if (!atomic_read(&pq->n_reqs))
1404 xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
1405}
1406
1407static void user_sdma_free_request(struct user_sdma_request *req)
1408{
1409 if (!list_empty(&req->txps)) {
1410 struct sdma_txreq *t, *p;
1411
1412 list_for_each_entry_safe(t, p, &req->txps, list) {
1413 struct user_sdma_txreq *tx =
1414 container_of(t, struct user_sdma_txreq, txreq);
1415 list_del_init(&t->list);
1416 sdma_txclean(req->pq->dd, t);
1417 kmem_cache_free(req->pq->txreq_cache, tx);
1418 }
1419 }
1420 if (req->data_iovs) {
1421 int i;
1422
1423 for (i = 0; i < req->data_iovs; i++)
1424 if (req->iovs[i].npages && req->iovs[i].pages)
1425 unpin_vector_pages(&req->iovs[i]);
1426 }
1427 if (req->user_proc)
1428 put_task_struct(req->user_proc);
1429 kfree(req->tids);
1430 clear_bit(SDMA_REQ_IN_USE, &req->flags);
1431}
1432
1433static inline void set_comp_state(struct user_sdma_request *req,
1434 enum hfi1_sdma_comp_state state,
1435 int ret)
1436{
1437 SDMA_DBG(req, "Setting completion status %u %d", state, ret);
1438 req->cq->comps[req->info.comp_idx].status = state;
1439 if (state == ERROR)
1440 req->cq->comps[req->info.comp_idx].errcode = -ret;
1441 trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
1442 req->pq->subctxt, req->info.comp_idx,
1443 state, ret);
1444}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
new file mode 100644
index 000000000000..fa4422553e23
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.h
@@ -0,0 +1,89 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50#include <linux/device.h>
51#include <linux/wait.h>
52
53#include "common.h"
54#include "iowait.h"
55
56#define EXP_TID_TIDLEN_MASK 0x7FFULL
57#define EXP_TID_TIDLEN_SHIFT 0
58#define EXP_TID_TIDCTRL_MASK 0x3ULL
59#define EXP_TID_TIDCTRL_SHIFT 20
60#define EXP_TID_TIDIDX_MASK 0x7FFULL
61#define EXP_TID_TIDIDX_SHIFT 22
62#define EXP_TID_GET(tid, field) \
63 (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
64
65extern uint extended_psn;
66
67struct hfi1_user_sdma_pkt_q {
68 struct list_head list;
69 unsigned ctxt;
70 unsigned subctxt;
71 u16 n_max_reqs;
72 atomic_t n_reqs;
73 u16 reqidx;
74 struct hfi1_devdata *dd;
75 struct kmem_cache *txreq_cache;
76 struct user_sdma_request *reqs;
77 struct iowait busy;
78 unsigned state;
79};
80
81struct hfi1_user_sdma_comp_q {
82 u16 nentries;
83 struct hfi1_sdma_comp_entry *comps;
84};
85
86int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
87int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
88int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
89 unsigned long *);
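Illustration (not part of the patch): the EXP_TID_GET() macro above unpacks the length, control, and index fields from a packed TID entry. The same macro exercised in a standalone program; the sample TID value is made up:

#include <stdio.h>

#define EXP_TID_TIDLEN_MASK	0x7FFULL
#define EXP_TID_TIDLEN_SHIFT	0
#define EXP_TID_TIDCTRL_MASK	0x3ULL
#define EXP_TID_TIDCTRL_SHIFT	20
#define EXP_TID_TIDIDX_MASK	0x7FFULL
#define EXP_TID_TIDIDX_SHIFT	22
#define EXP_TID_GET(tid, field) \
	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)

int main(void)
{
	/* Made-up entry: index 5, ctrl 2, length 16 (pages). */
	unsigned tid = (5u << EXP_TID_TIDIDX_SHIFT) |
		       (2u << EXP_TID_TIDCTRL_SHIFT) |
		       (16u << EXP_TID_TIDLEN_SHIFT);

	printf("len=%llu ctrl=%llu idx=%llu\n",
	       EXP_TID_GET(tid, LEN),
	       EXP_TID_GET(tid, CTRL),
	       EXP_TID_GET(tid, IDX));
	return 0;
}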
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
new file mode 100644
index 000000000000..53ac21431542
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -0,0 +1,2143 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <rdma/ib_mad.h>
52#include <rdma/ib_user_verbs.h>
53#include <linux/io.h>
54#include <linux/module.h>
55#include <linux/utsname.h>
56#include <linux/rculist.h>
57#include <linux/mm.h>
58#include <linux/random.h>
59#include <linux/vmalloc.h>
60
61#include "hfi.h"
62#include "common.h"
63#include "device.h"
64#include "trace.h"
65#include "qp.h"
66#include "sdma.h"
67
68unsigned int hfi1_lkey_table_size = 16;
69module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
70 S_IRUGO);
71MODULE_PARM_DESC(lkey_table_size,
72 "LKEY table size in bits (2^n, 1 <= n <= 23)");
73
74static unsigned int hfi1_max_pds = 0xFFFF;
75module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
76MODULE_PARM_DESC(max_pds,
77 "Maximum number of protection domains to support");
78
79static unsigned int hfi1_max_ahs = 0xFFFF;
80module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
81MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
82
83unsigned int hfi1_max_cqes = 0x2FFFF;
84module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
85MODULE_PARM_DESC(max_cqes,
86 "Maximum number of completion queue entries to support");
87
88unsigned int hfi1_max_cqs = 0x1FFFF;
89module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
90MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
91
92unsigned int hfi1_max_qp_wrs = 0x3FFF;
93module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
94MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
95
96unsigned int hfi1_max_qps = 16384;
97module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
98MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
99
100unsigned int hfi1_max_sges = 0x60;
101module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
102MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
103
104unsigned int hfi1_max_mcast_grps = 16384;
105module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
106MODULE_PARM_DESC(max_mcast_grps,
107 "Maximum number of multicast groups to support");
108
109unsigned int hfi1_max_mcast_qp_attached = 16;
110module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
111 uint, S_IRUGO);
112MODULE_PARM_DESC(max_mcast_qp_attached,
113 "Maximum number of attached QPs to support");
114
115unsigned int hfi1_max_srqs = 1024;
116module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
117MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
118
119unsigned int hfi1_max_srq_sges = 128;
120module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
121MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
122
123unsigned int hfi1_max_srq_wrs = 0x1FFFF;
124module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
125MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
126
127static void verbs_sdma_complete(
128 struct sdma_txreq *cookie,
129 int status,
130 int drained);
131
132/*
133 * Note that it is OK to post send work requests in the SQE and ERR
134 * states; hfi1_do_send() will process them and generate error
135 * completions as per IB 1.2 C10-96.
136 */
137const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = {
138 [IB_QPS_RESET] = 0,
139 [IB_QPS_INIT] = HFI1_POST_RECV_OK,
140 [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK,
141 [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
142 HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK |
143 HFI1_PROCESS_NEXT_SEND_OK,
144 [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
145 HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK,
146 [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
147 HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
148 [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV |
149 HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
150};
151
152struct hfi1_ucontext {
153 struct ib_ucontext ibucontext;
154};
155
156static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext
157 *ibucontext)
158{
159 return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
160}
161
162/*
163 * Translate ib_wr_opcode into ib_wc_opcode.
164 */
165const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
166 [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
167 [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
168 [IB_WR_SEND] = IB_WC_SEND,
169 [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
170 [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
171 [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
172 [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
173};
174
175/*
176 * Length of header by opcode, 0 --> not supported
177 */
178const u8 hdr_len_by_opcode[256] = {
179 /* RC */
180 [IB_OPCODE_RC_SEND_FIRST] = 12 + 8,
181 [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8,
182 [IB_OPCODE_RC_SEND_LAST] = 12 + 8,
183 [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
184 [IB_OPCODE_RC_SEND_ONLY] = 12 + 8,
185 [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
186 [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
187 [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8,
188 [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8,
189 [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
190 [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
191 [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
192 [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16,
193 [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4,
194 [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8,
195 [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4,
196 [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4,
197 [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4,
198 [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4,
199 [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28,
200 [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
201 /* UC */
202 [IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
203 [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
204 [IB_OPCODE_UC_SEND_LAST] = 12 + 8,
205 [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
206 [IB_OPCODE_UC_SEND_ONLY] = 12 + 8,
207 [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
208 [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
209 [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8,
210 [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8,
211 [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
212 [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
213 [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
214 /* UD */
215 [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8,
216 [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12
217};
218
219static const opcode_handler opcode_handler_tbl[256] = {
220 /* RC */
221 [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv,
222 [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv,
223 [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv,
224 [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
225 [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv,
226 [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
227 [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv,
228 [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv,
229 [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv,
230 [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
231 [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv,
232 [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
233 [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv,
234 [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv,
235 [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv,
236 [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv,
237 [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv,
238 [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv,
239 [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv,
240 [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv,
241 [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
242 /* UC */
243 [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
244 [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
245 [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv,
246 [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
247 [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv,
248 [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
249 [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv,
250 [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv,
251 [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv,
252 [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
253 [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv,
254 [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
255 /* UD */
256 [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv,
257 [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv,
258 /* CNP */
259 [IB_OPCODE_CNP] = &hfi1_cnp_rcv
260};
261
262/*
263 * System image GUID.
264 */
265__be64 ib_hfi1_sys_image_guid;
266
267/**
268 * hfi1_copy_sge - copy data to SGE memory
269 * @ss: the SGE state
270 * @data: the data to copy
271 * @length: the length of the data
272 */
273void hfi1_copy_sge(
274 struct hfi1_sge_state *ss,
275 void *data, u32 length,
276 int release)
277{
278 struct hfi1_sge *sge = &ss->sge;
279
280 while (length) {
281 u32 len = sge->length;
282
283 if (len > length)
284 len = length;
285 if (len > sge->sge_length)
286 len = sge->sge_length;
287 WARN_ON_ONCE(len == 0);
288 memcpy(sge->vaddr, data, len);
289 sge->vaddr += len;
290 sge->length -= len;
291 sge->sge_length -= len;
292 if (sge->sge_length == 0) {
293 if (release)
294 hfi1_put_mr(sge->mr);
295 if (--ss->num_sge)
296 *sge = *ss->sg_list++;
297 } else if (sge->length == 0 && sge->mr->lkey) {
298 if (++sge->n >= HFI1_SEGSZ) {
299 if (++sge->m >= sge->mr->mapsz)
300 break;
301 sge->n = 0;
302 }
303 sge->vaddr =
304 sge->mr->map[sge->m]->segs[sge->n].vaddr;
305 sge->length =
306 sge->mr->map[sge->m]->segs[sge->n].length;
307 }
308 data += len;
309 length -= len;
310 }
311}
312
313/**
314 * hfi1_skip_sge - skip over SGE memory
315 * @ss: the SGE state
316 * @length: the number of bytes to skip
317 */
318void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release)
319{
320 struct hfi1_sge *sge = &ss->sge;
321
322 while (length) {
323 u32 len = sge->length;
324
325 if (len > length)
326 len = length;
327 if (len > sge->sge_length)
328 len = sge->sge_length;
329 WARN_ON_ONCE(len == 0);
330 sge->vaddr += len;
331 sge->length -= len;
332 sge->sge_length -= len;
333 if (sge->sge_length == 0) {
334 if (release)
335 hfi1_put_mr(sge->mr);
336 if (--ss->num_sge)
337 *sge = *ss->sg_list++;
338 } else if (sge->length == 0 && sge->mr->lkey) {
339 if (++sge->n >= HFI1_SEGSZ) {
340 if (++sge->m >= sge->mr->mapsz)
341 break;
342 sge->n = 0;
343 }
344 sge->vaddr =
345 sge->mr->map[sge->m]->segs[sge->n].vaddr;
346 sge->length =
347 sge->mr->map[sge->m]->segs[sge->n].length;
348 }
349 length -= len;
350 }
351}
352
353/**
354 * post_one_send - post one RC, UC, or UD send work request
355 * @qp: the QP to post on
356 * @wr: the work request to send
357 */
358static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr)
359{
360 struct hfi1_swqe *wqe;
361 u32 next;
362 int i;
363 int j;
364 int acc;
365 struct hfi1_lkey_table *rkt;
366 struct hfi1_pd *pd;
367 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
368 struct hfi1_pportdata *ppd;
369 struct hfi1_ibport *ibp;
370
371 /* IB spec says that num_sge == 0 is OK. */
372 if (unlikely(wr->num_sge > qp->s_max_sge))
373 return -EINVAL;
374
375 ppd = &dd->pport[qp->port_num - 1];
376 ibp = &ppd->ibport_data;
377
378 /*
379 * Don't allow RDMA reads or atomic operations on UC or
380 * undefined operations.
381 * Make sure buffer is large enough to hold the result for atomics.
382 */
383 if (wr->opcode == IB_WR_FAST_REG_MR) {
384 return -EINVAL;
385 } else if (qp->ibqp.qp_type == IB_QPT_UC) {
386 if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
387 return -EINVAL;
388 } else if (qp->ibqp.qp_type != IB_QPT_RC) {
389 /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
390 if (wr->opcode != IB_WR_SEND &&
391 wr->opcode != IB_WR_SEND_WITH_IMM)
392 return -EINVAL;
393 /* Check UD destination address PD */
394 if (qp->ibqp.pd != wr->wr.ud.ah->pd)
395 return -EINVAL;
396 } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
397 return -EINVAL;
398 else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
399 (wr->num_sge == 0 ||
400 wr->sg_list[0].length < sizeof(u64) ||
401 wr->sg_list[0].addr & (sizeof(u64) - 1)))
402 return -EINVAL;
403 else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
404 return -EINVAL;
405
406 next = qp->s_head + 1;
407 if (next >= qp->s_size)
408 next = 0;
409 if (next == qp->s_last)
410 return -ENOMEM;
411
412 rkt = &to_idev(qp->ibqp.device)->lk_table;
413 pd = to_ipd(qp->ibqp.pd);
414 wqe = get_swqe_ptr(qp, qp->s_head);
415 wqe->wr = *wr;
416 wqe->length = 0;
417 j = 0;
418 if (wr->num_sge) {
419 acc = wr->opcode >= IB_WR_RDMA_READ ?
420 IB_ACCESS_LOCAL_WRITE : 0;
421 for (i = 0; i < wr->num_sge; i++) {
422 u32 length = wr->sg_list[i].length;
423 int ok;
424
425 if (length == 0)
426 continue;
427 ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j],
428 &wr->sg_list[i], acc);
429 if (!ok)
430 goto bail_inval_free;
431 wqe->length += length;
432 j++;
433 }
434 wqe->wr.num_sge = j;
435 }
436 if (qp->ibqp.qp_type == IB_QPT_UC ||
437 qp->ibqp.qp_type == IB_QPT_RC) {
438 if (wqe->length > 0x80000000U)
439 goto bail_inval_free;
440 } else {
441 struct hfi1_ah *ah = to_iah(wr->wr.ud.ah);
442
443 atomic_inc(&ah->refcount);
444 }
445 wqe->ssn = qp->s_ssn++;
446 qp->s_head = next;
447
448 return 0;
449
450bail_inval_free:
451 /* release mr holds */
452 while (j) {
453 struct hfi1_sge *sge = &wqe->sg_list[--j];
454
455 hfi1_put_mr(sge->mr);
456 }
457 return -EINVAL;
458}
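Illustration (not part of the patch): the head/next/last bookkeeping in post_one_send() is a standard one-slot-open ring test: the queue is full when advancing the producer index would land on the consumer index. A generic sketch of that check with illustrative names:

#include <stdbool.h>
#include <stdio.h>

struct ring {
	unsigned head;	/* producer index (next free slot) */
	unsigned last;	/* consumer index (oldest in-flight entry) */
	unsigned size;	/* number of slots */
};

/* Returns true and advances head if a slot was available. */
static bool ring_post(struct ring *r)
{
	unsigned next = r->head + 1;

	if (next >= r->size)
		next = 0;
	if (next == r->last)
		return false;	/* full: one slot is always left open */
	r->head = next;
	return true;
}

int main(void)
{
	struct ring r = { .head = 0, .last = 0, .size = 4 };
	unsigned posted = 0;

	while (ring_post(&r))
		posted++;
	printf("posted %u of %u slots\n", posted, r.size);	/* 3 of 4 */
	return 0;
}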
459
460/**
461 * post_send - post a send on a QP
462 * @ibqp: the QP to post the send on
463 * @wr: the list of work requests to post
464 * @bad_wr: the first bad WR is put here
465 *
466 * This may be called from interrupt context.
467 */
468static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
469 struct ib_send_wr **bad_wr)
470{
471 struct hfi1_qp *qp = to_iqp(ibqp);
472 int err = 0;
473 int call_send;
474 unsigned long flags;
475 unsigned nreq = 0;
476
477 spin_lock_irqsave(&qp->s_lock, flags);
478
479 /* Check that state is OK to post send. */
480 if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) {
481 spin_unlock_irqrestore(&qp->s_lock, flags);
482 return -EINVAL;
483 }
484
485	/* send queue empty and only a single WR -> call send directly */
486 call_send = qp->s_head == qp->s_last && !wr->next;
487
488 for (; wr; wr = wr->next) {
489 err = post_one_send(qp, wr);
490 if (unlikely(err)) {
491 *bad_wr = wr;
492 goto bail;
493 }
494 nreq++;
495 }
496bail:
497 if (nreq && !call_send)
498 hfi1_schedule_send(qp);
499 spin_unlock_irqrestore(&qp->s_lock, flags);
500 if (nreq && call_send)
501 hfi1_do_send(&qp->s_iowait.iowork);
502 return err;
503}
504
505/**
506 * post_receive - post a receive on a QP
507 * @ibqp: the QP to post the receive on
508 * @wr: the WR to post
509 * @bad_wr: the first bad WR is put here
510 *
511 * This may be called from interrupt context.
512 */
513static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
514 struct ib_recv_wr **bad_wr)
515{
516 struct hfi1_qp *qp = to_iqp(ibqp);
517 struct hfi1_rwq *wq = qp->r_rq.wq;
518 unsigned long flags;
519 int ret;
520
521 /* Check that state is OK to post receive. */
522 if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) {
523 *bad_wr = wr;
524 ret = -EINVAL;
525 goto bail;
526 }
527
528 for (; wr; wr = wr->next) {
529 struct hfi1_rwqe *wqe;
530 u32 next;
531 int i;
532
533 if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
534 *bad_wr = wr;
535 ret = -EINVAL;
536 goto bail;
537 }
538
539 spin_lock_irqsave(&qp->r_rq.lock, flags);
540 next = wq->head + 1;
541 if (next >= qp->r_rq.size)
542 next = 0;
543 if (next == wq->tail) {
544 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
545 *bad_wr = wr;
546 ret = -ENOMEM;
547 goto bail;
548 }
549
550 wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
551 wqe->wr_id = wr->wr_id;
552 wqe->num_sge = wr->num_sge;
553 for (i = 0; i < wr->num_sge; i++)
554 wqe->sg_list[i] = wr->sg_list[i];
555 /* Make sure queue entry is written before the head index. */
556 smp_wmb();
557 wq->head = next;
558 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
559 }
560 ret = 0;
561
562bail:
563 return ret;
564}
565
566/*
567 * Make sure the QP is ready and able to accept the given opcode.
568 */
569static inline int qp_ok(int opcode, struct hfi1_packet *packet)
570{
571 struct hfi1_ibport *ibp;
572
573 if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK))
574 goto dropit;
575 if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
576 (opcode == IB_OPCODE_CNP))
577 return 1;
578dropit:
579 ibp = &packet->rcd->ppd->ibport_data;
580 ibp->n_pkt_drops++;
581 return 0;
582}
583
584
585/**
586 * hfi1_ib_rcv - process an incoming packet
587 * @packet: data packet information
588 *
589 * This is called to process an incoming packet at interrupt level.
590 *
591 * Tlen is the length of the header + data + CRC in bytes.
592 */
593void hfi1_ib_rcv(struct hfi1_packet *packet)
594{
595 struct hfi1_ctxtdata *rcd = packet->rcd;
596 struct hfi1_ib_header *hdr = packet->hdr;
597 u32 tlen = packet->tlen;
598 struct hfi1_pportdata *ppd = rcd->ppd;
599 struct hfi1_ibport *ibp = &ppd->ibport_data;
600 u32 qp_num;
601 int lnh;
602 u8 opcode;
603 u16 lid;
604
605 /* Check for GRH */
606 lnh = be16_to_cpu(hdr->lrh[0]) & 3;
607 if (lnh == HFI1_LRH_BTH)
608 packet->ohdr = &hdr->u.oth;
609 else if (lnh == HFI1_LRH_GRH) {
610 u32 vtf;
611
612 packet->ohdr = &hdr->u.l.oth;
613 if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
614 goto drop;
615 vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
616 if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
617 goto drop;
618 packet->rcv_flags |= HFI1_HAS_GRH;
619 } else
620 goto drop;
621
622 trace_input_ibhdr(rcd->dd, hdr);
623
624 opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
625 inc_opstats(tlen, &rcd->opstats->stats[opcode]);
626
627 /* Get the destination QP number. */
628 qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK;
629 lid = be16_to_cpu(hdr->lrh[1]);
630 if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) &&
631 (lid != HFI1_PERMISSIVE_LID))) {
632 struct hfi1_mcast *mcast;
633 struct hfi1_mcast_qp *p;
634
635 if (lnh != HFI1_LRH_GRH)
636 goto drop;
637 mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid);
638 if (mcast == NULL)
639 goto drop;
640 list_for_each_entry_rcu(p, &mcast->qp_list, list) {
641 packet->qp = p->qp;
642 spin_lock(&packet->qp->r_lock);
643 if (likely((qp_ok(opcode, packet))))
644 opcode_handler_tbl[opcode](packet);
645 spin_unlock(&packet->qp->r_lock);
646 }
647 /*
648 * Notify hfi1_multicast_detach() if it is waiting for us
649 * to finish.
650 */
651 if (atomic_dec_return(&mcast->refcount) <= 1)
652 wake_up(&mcast->wait);
653 } else {
654 rcu_read_lock();
655 packet->qp = hfi1_lookup_qpn(ibp, qp_num);
656 if (!packet->qp) {
657 rcu_read_unlock();
658 goto drop;
659 }
660 spin_lock(&packet->qp->r_lock);
661 if (likely((qp_ok(opcode, packet))))
662 opcode_handler_tbl[opcode](packet);
663 spin_unlock(&packet->qp->r_lock);
664 rcu_read_unlock();
665 }
666 return;
667
668drop:
669 ibp->n_pkt_drops++;
670}
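Illustration (not part of the patch): hfi1_ib_rcv() dispatches on the BTH opcode through the 256-entry opcode_handler_tbl, dropping packets whose opcode has no handler or whose QP state disallows them. A stripped-down sketch of the table-dispatch pattern; the handlers and opcode values below are made up:

#include <stdio.h>

typedef void (*pkt_handler)(unsigned opcode);

static void handle_send(unsigned opcode) { printf("send 0x%02x\n", opcode); }
static void handle_rdma(unsigned opcode) { printf("rdma 0x%02x\n", opcode); }

/* Sparse 256-entry table indexed by opcode; NULL means unsupported. */
static const pkt_handler handlers[256] = {
	[0x04] = handle_send,	/* made-up opcode values */
	[0x0a] = handle_rdma,
};

static void dispatch(unsigned opcode)
{
	opcode &= 0xff;
	if (handlers[opcode])
		handlers[opcode](opcode);
	else
		printf("drop 0x%02x\n", opcode);	/* counts as a packet drop */
}

int main(void)
{
	dispatch(0x04);
	dispatch(0x0a);
	dispatch(0x20);
	return 0;
}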
671
672/*
673 * This is called from a timer to check for QPs
674 * which need kernel memory in order to send a packet.
675 */
676static void mem_timer(unsigned long data)
677{
678 struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
679 struct list_head *list = &dev->memwait;
680 struct hfi1_qp *qp = NULL;
681 struct iowait *wait;
682 unsigned long flags;
683
684 write_seqlock_irqsave(&dev->iowait_lock, flags);
685 if (!list_empty(list)) {
686 wait = list_first_entry(list, struct iowait, list);
687 qp = container_of(wait, struct hfi1_qp, s_iowait);
688 list_del_init(&qp->s_iowait.list);
689 /* refcount held until actual wake up */
690 if (!list_empty(list))
691 mod_timer(&dev->mem_timer, jiffies + 1);
692 }
693 write_sequnlock_irqrestore(&dev->iowait_lock, flags);
694
695 if (qp)
696 hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM);
697}
698
699void update_sge(struct hfi1_sge_state *ss, u32 length)
700{
701 struct hfi1_sge *sge = &ss->sge;
702
703 sge->vaddr += length;
704 sge->length -= length;
705 sge->sge_length -= length;
706 if (sge->sge_length == 0) {
707 if (--ss->num_sge)
708 *sge = *ss->sg_list++;
709 } else if (sge->length == 0 && sge->mr->lkey) {
710 if (++sge->n >= HFI1_SEGSZ) {
711 if (++sge->m >= sge->mr->mapsz)
712 return;
713 sge->n = 0;
714 }
715 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
716 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
717 }
718}
719
720static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
721 struct hfi1_qp *qp)
722{
723 struct verbs_txreq *tx;
724 unsigned long flags;
725
726 tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
727 if (!tx) {
728 spin_lock_irqsave(&qp->s_lock, flags);
729 write_seqlock(&dev->iowait_lock);
730 if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK &&
731 list_empty(&qp->s_iowait.list)) {
732 dev->n_txwait++;
733 qp->s_flags |= HFI1_S_WAIT_TX;
734 list_add_tail(&qp->s_iowait.list, &dev->txwait);
735 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX);
736 atomic_inc(&qp->refcount);
737 }
738 qp->s_flags &= ~HFI1_S_BUSY;
739 write_sequnlock(&dev->iowait_lock);
740 spin_unlock_irqrestore(&qp->s_lock, flags);
741 tx = ERR_PTR(-EBUSY);
742 }
743 return tx;
744}
745
746static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
747 struct hfi1_qp *qp)
748{
749 struct verbs_txreq *tx;
750
751 tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
752 if (!tx)
753 /* call slow path to get the lock */
754 tx = __get_txreq(dev, qp);
755 if (tx)
756 tx->qp = qp;
757 return tx;
758}
759
760void hfi1_put_txreq(struct verbs_txreq *tx)
761{
762 struct hfi1_ibdev *dev;
763 struct hfi1_qp *qp;
764 unsigned long flags;
765 unsigned int seq;
766
767 qp = tx->qp;
768 dev = to_idev(qp->ibqp.device);
769
770 if (tx->mr) {
771 hfi1_put_mr(tx->mr);
772 tx->mr = NULL;
773 }
774 sdma_txclean(dd_from_dev(dev), &tx->txreq);
775
776 /* Free verbs_txreq and return to slab cache */
777 kmem_cache_free(dev->verbs_txreq_cache, tx);
778
779 do {
780 seq = read_seqbegin(&dev->iowait_lock);
781 if (!list_empty(&dev->txwait)) {
782 struct iowait *wait;
783
784 write_seqlock_irqsave(&dev->iowait_lock, flags);
785 /* Wake up first QP wanting a free struct */
786 wait = list_first_entry(&dev->txwait, struct iowait,
787 list);
788 qp = container_of(wait, struct hfi1_qp, s_iowait);
789 list_del_init(&qp->s_iowait.list);
790 /* refcount held until actual wake up */
791 write_sequnlock_irqrestore(&dev->iowait_lock, flags);
792 hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX);
793 break;
794 }
795 } while (read_seqretry(&dev->iowait_lock, seq));
796}
797
798/*
799 * This is called with progress side lock held.
800 */
801/* New API */
802static void verbs_sdma_complete(
803 struct sdma_txreq *cookie,
804 int status,
805 int drained)
806{
807 struct verbs_txreq *tx =
808 container_of(cookie, struct verbs_txreq, txreq);
809 struct hfi1_qp *qp = tx->qp;
810
811 spin_lock(&qp->s_lock);
812 if (tx->wqe)
813 hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
814 else if (qp->ibqp.qp_type == IB_QPT_RC) {
815 struct hfi1_ib_header *hdr;
816
817 hdr = &tx->phdr.hdr;
818 hfi1_rc_send_complete(qp, hdr);
819 }
820 if (drained) {
821 /*
822 * This happens when the send engine notes
823 * a QP in the error state and cannot
824 * do the flush work until that QP's
825 * sdma work has finished.
826 */
827 if (qp->s_flags & HFI1_S_WAIT_DMA) {
828 qp->s_flags &= ~HFI1_S_WAIT_DMA;
829 hfi1_schedule_send(qp);
830 }
831 }
832 spin_unlock(&qp->s_lock);
833
834 hfi1_put_txreq(tx);
835}
836
837static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
838{
839 unsigned long flags;
840 int ret = 0;
841
842 spin_lock_irqsave(&qp->s_lock, flags);
843 if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
844 write_seqlock(&dev->iowait_lock);
845 if (list_empty(&qp->s_iowait.list)) {
846 if (list_empty(&dev->memwait))
847 mod_timer(&dev->mem_timer, jiffies + 1);
848 qp->s_flags |= HFI1_S_WAIT_KMEM;
849 list_add_tail(&qp->s_iowait.list, &dev->memwait);
850 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM);
851 atomic_inc(&qp->refcount);
852 }
853 write_sequnlock(&dev->iowait_lock);
854 qp->s_flags &= ~HFI1_S_BUSY;
855 ret = -EBUSY;
856 }
857 spin_unlock_irqrestore(&qp->s_lock, flags);
858
859 return ret;
860}
861
862/*
863 * This routine calls txadds for each sg entry.
864 *
865 * Add failures will revert the sge cursor
866 */
867static int build_verbs_ulp_payload(
868 struct sdma_engine *sde,
869 struct hfi1_sge_state *ss,
870 u32 length,
871 struct verbs_txreq *tx)
872{
873 struct hfi1_sge *sg_list = ss->sg_list;
874 struct hfi1_sge sge = ss->sge;
875 u8 num_sge = ss->num_sge;
876 u32 len;
877 int ret = 0;
878
879 while (length) {
880 len = ss->sge.length;
881 if (len > length)
882 len = length;
883 if (len > ss->sge.sge_length)
884 len = ss->sge.sge_length;
885 WARN_ON_ONCE(len == 0);
886 ret = sdma_txadd_kvaddr(
887 sde->dd,
888 &tx->txreq,
889 ss->sge.vaddr,
890 len);
891 if (ret)
892 goto bail_txadd;
893 update_sge(ss, len);
894 length -= len;
895 }
896 return ret;
897bail_txadd:
898 /* unwind cursor */
899 ss->sge = sge;
900 ss->num_sge = num_sge;
901 ss->sg_list = sg_list;
902 return ret;
903}
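Illustration (not part of the patch): build_verbs_ulp_payload() snapshots the SGE cursor before adding descriptors and restores it if any add fails, so the caller can retry cleanly. A minimal sketch of that save-and-restore-on-failure pattern; the cursor and capacity model are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct cursor {
	unsigned index;		/* position in some notional SG list */
};

/* Pretend descriptor add that fails once capacity is exhausted. */
static bool add_desc(unsigned *used, unsigned capacity)
{
	if (*used >= capacity)
		return false;
	(*used)++;
	return true;
}

static bool build_payload(struct cursor *cur, unsigned ndesc, unsigned capacity)
{
	struct cursor saved = *cur;	/* snapshot before any mutation */
	unsigned used = 0, i;

	for (i = 0; i < ndesc; i++) {
		if (!add_desc(&used, capacity)) {
			*cur = saved;	/* unwind the cursor on failure */
			return false;
		}
		cur->index++;
	}
	return true;
}

int main(void)
{
	struct cursor cur = { .index = 3 };

	if (!build_payload(&cur, 5, 4))
		printf("failed, cursor restored to index %u\n", cur.index);	/* 3 */
	return 0;
}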
904
905/*
906 * Build the number of DMA descriptors needed to send length bytes of data.
907 *
908 * NOTE: DMA mapping is held in the tx until completed in the ring or
909 * the tx desc is freed without having been submitted to the ring
910 *
911 * This routine ensures that all of the helper routine
912 * calls succeed.
913 */
914/* New API */
915static int build_verbs_tx_desc(
916 struct sdma_engine *sde,
917 struct hfi1_sge_state *ss,
918 u32 length,
919 struct verbs_txreq *tx,
920 struct ahg_ib_header *ahdr,
921 u64 pbc)
922{
923 int ret = 0;
924 struct hfi1_pio_header *phdr;
925 u16 hdrbytes = tx->hdr_dwords << 2;
926
927 phdr = &tx->phdr;
928 if (!ahdr->ahgcount) {
929 ret = sdma_txinit_ahg(
930 &tx->txreq,
931 ahdr->tx_flags,
932 hdrbytes + length,
933 ahdr->ahgidx,
934 0,
935 NULL,
936 0,
937 verbs_sdma_complete);
938 if (ret)
939 goto bail_txadd;
940 phdr->pbc = cpu_to_le64(pbc);
941 memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc));
942 /* add the header */
943 ret = sdma_txadd_kvaddr(
944 sde->dd,
945 &tx->txreq,
946 &tx->phdr,
947 tx->hdr_dwords << 2);
948 if (ret)
949 goto bail_txadd;
950 } else {
951 struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth;
952 struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth;
953
954 /* needed in rc_send_complete() */
955 phdr->hdr.lrh[0] = ahdr->ibh.lrh[0];
956 if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) {
957 sohdr = &ahdr->ibh.u.l.oth;
958 dohdr = &phdr->hdr.u.l.oth;
959 }
960 /* opcode */
961 dohdr->bth[0] = sohdr->bth[0];
962 /* PSN/ACK */
963 dohdr->bth[2] = sohdr->bth[2];
964 ret = sdma_txinit_ahg(
965 &tx->txreq,
966 ahdr->tx_flags,
967 length,
968 ahdr->ahgidx,
969 ahdr->ahgcount,
970 ahdr->ahgdesc,
971 hdrbytes,
972 verbs_sdma_complete);
973 if (ret)
974 goto bail_txadd;
975 }
976
977 /* add the ulp payload - if any. ss can be NULL for acks */
978 if (ss)
979 ret = build_verbs_ulp_payload(sde, ss, length, tx);
980bail_txadd:
981 return ret;
982}
983
984int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
985 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
986 u32 plen, u32 dwords, u64 pbc)
987{
988 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
989 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
990 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
991 struct verbs_txreq *tx;
992 struct sdma_txreq *stx;
993 u64 pbc_flags = 0;
994 struct sdma_engine *sde;
995 u8 sc5 = qp->s_sc;
996 int ret;
997
998 if (!list_empty(&qp->s_iowait.tx_head)) {
999 stx = list_first_entry(
1000 &qp->s_iowait.tx_head,
1001 struct sdma_txreq,
1002 list);
1003 list_del_init(&stx->list);
1004 tx = container_of(stx, struct verbs_txreq, txreq);
1005 ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx);
1006 if (unlikely(ret == -ECOMM))
1007 goto bail_ecomm;
1008 return ret;
1009 }
1010
1011 tx = get_txreq(dev, qp);
1012 if (IS_ERR(tx))
1013 goto bail_tx;
1014
1015 if (!qp->s_hdr->sde) {
1016 tx->sde = sde = qp_to_sdma_engine(qp, sc5);
1017 if (!sde)
1018 goto bail_no_sde;
1019 } else
1020 tx->sde = sde = qp->s_hdr->sde;
1021
1022 if (likely(pbc == 0)) {
1023 u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1024 /* No vl15 here */
1025 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
1026 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
1027
1028 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
1029 }
1030 tx->wqe = qp->s_wqe;
1031 tx->mr = qp->s_rdma_mr;
1032 if (qp->s_rdma_mr)
1033 qp->s_rdma_mr = NULL;
1034 tx->hdr_dwords = hdrwords + 2;
1035 ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc);
1036 if (unlikely(ret))
1037 goto bail_build;
1038 trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
1039 ret = sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq);
1040 if (unlikely(ret == -ECOMM))
1041 goto bail_ecomm;
1042 return ret;
1043
1044bail_no_sde:
1045 hfi1_put_txreq(tx);
1046bail_ecomm:
1047 /* The current one got "sent" */
1048 return 0;
1049bail_build:
1050 /* kmalloc or mapping fail */
1051 hfi1_put_txreq(tx);
1052 return wait_kmem(dev, qp);
1053bail_tx:
1054 return PTR_ERR(tx);
1055}
1056
1057/*
1058 * If we are now in the error state, return zero to flush the
1059 * send work request.
1060 */
1061static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc)
1062{
1063 struct hfi1_devdata *dd = sc->dd;
1064 struct hfi1_ibdev *dev = &dd->verbs_dev;
1065 unsigned long flags;
1066 int ret = 0;
1067
1068 /*
1069 * Note that as soon as want_buffer() is called and
1070 * possibly before it returns, sc_piobufavail()
1071 * could be called. Therefore, put QP on the I/O wait list before
1072 * enabling the PIO avail interrupt.
1073 */
1074 spin_lock_irqsave(&qp->s_lock, flags);
1075 if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
1076 write_seqlock(&dev->iowait_lock);
1077 if (list_empty(&qp->s_iowait.list)) {
1078 struct hfi1_ibdev *dev = &dd->verbs_dev;
1079 int was_empty;
1080
1081 dev->n_piowait++;
1082 qp->s_flags |= HFI1_S_WAIT_PIO;
1083 was_empty = list_empty(&sc->piowait);
1084 list_add_tail(&qp->s_iowait.list, &sc->piowait);
1085 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO);
1086 atomic_inc(&qp->refcount);
1087 /* counting: only call wantpiobuf_intr if first user */
1088 if (was_empty)
1089 hfi1_sc_wantpiobuf_intr(sc, 1);
1090 }
1091 write_sequnlock(&dev->iowait_lock);
1092 qp->s_flags &= ~HFI1_S_BUSY;
1093 ret = -EBUSY;
1094 }
1095 spin_unlock_irqrestore(&qp->s_lock, flags);
1096 return ret;
1097}
1098
1099struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5)
1100{
1101 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1102 struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
1103 u8 vl;
1104
1105 vl = sc_to_vlt(dd, sc5);
1106 if (vl >= ppd->vls_supported && vl != 15)
1107 return NULL;
1108 return dd->vld[vl].sc;
1109}
1110
1111int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
1112 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1113 u32 plen, u32 dwords, u64 pbc)
1114{
1115 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1116 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1117 u32 *hdr = (u32 *)&ahdr->ibh;
1118 u64 pbc_flags = 0;
1119 u32 sc5;
1120 unsigned long flags = 0;
1121 struct send_context *sc;
1122 struct pio_buf *pbuf;
1123 int wc_status = IB_WC_SUCCESS;
1124
1125 /* vl15 special case taken care of in ud.c */
1126 sc5 = qp->s_sc;
1127 sc = qp_to_send_context(qp, sc5);
1128
1129 if (!sc)
1130 return -EINVAL;
1131 if (likely(pbc == 0)) {
1132 u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1133 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
1134 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
1135 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
1136 }
1137 pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
1138 if (unlikely(pbuf == NULL)) {
1139 if (ppd->host_link_state != HLS_UP_ACTIVE) {
1140 /*
1141 * If we have filled the PIO buffers to capacity and are
1142 * not in an active state, this request is not going to
1143 * go out, so just complete it with an error; otherwise a
1144 * ULP or the core may be stuck waiting.
1145 */
1146 hfi1_cdbg(
1147 PIO,
1148 "alloc failed. state not active, completing");
1149 wc_status = IB_WC_GENERAL_ERR;
1150 goto pio_bail;
1151 } else {
1152 /*
1153 * This is a normal occurrence. The PIO buffers are
1154 * full, but we are still sending, so continue to
1155 * queue the request.
1156 */
1157 hfi1_cdbg(PIO, "alloc failed. state active, queuing");
1158 return no_bufs_available(qp, sc);
1159 }
1160 }
1161
1162 if (len == 0) {
1163 pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
1164 } else {
1165 if (ss) {
1166 seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4);
1167 while (len) {
1168 void *addr = ss->sge.vaddr;
1169 u32 slen = ss->sge.length;
1170
1171 if (slen > len)
1172 slen = len;
1173 update_sge(ss, slen);
1174 seg_pio_copy_mid(pbuf, addr, slen);
1175 len -= slen;
1176 }
1177 seg_pio_copy_end(pbuf);
1178 }
1179 }
1180
1181 trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
1182
1183 if (qp->s_rdma_mr) {
1184 hfi1_put_mr(qp->s_rdma_mr);
1185 qp->s_rdma_mr = NULL;
1186 }
1187
1188pio_bail:
1189 if (qp->s_wqe) {
1190 spin_lock_irqsave(&qp->s_lock, flags);
1191 hfi1_send_complete(qp, qp->s_wqe, wc_status);
1192 spin_unlock_irqrestore(&qp->s_lock, flags);
1193 } else if (qp->ibqp.qp_type == IB_QPT_RC) {
1194 spin_lock_irqsave(&qp->s_lock, flags);
1195 hfi1_rc_send_complete(qp, &ahdr->ibh);
1196 spin_unlock_irqrestore(&qp->s_lock, flags);
1197 }
1198 return 0;
1199}
1200/*
1201 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
1202 * being an entry from the ingress partition key table), return 0
1203 * otherwise. Use the matching criteria for egress partition keys
1204 * specified in the OPAv1 spec., section 9.11.7.
1205 */
1206static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
1207{
1208 u16 mkey = pkey & PKEY_LOW_15_MASK;
1209 u16 ment = ent & PKEY_LOW_15_MASK;
1210
1211 if (mkey == ment) {
1212 /*
1213 * If pkey[15] is set (full partition member),
1214 * is bit 15 in the corresponding table element
1215 * clear (limited member)?
1216 */
1217 if (pkey & PKEY_MEMBER_MASK)
1218 return !!(ent & PKEY_MEMBER_MASK);
1219 return 1;
1220 }
1221 return 0;
1222}
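/*
 * Illustrative outcomes of the check above (examples added for clarity,
 * not taken from the original source): with PKEY_MEMBER_MASK as bit 15,
 * a full-member pkey 0x8001 matches the full-member table entry 0x8001
 * but not the limited-member entry 0x0001, while a limited-member pkey
 * 0x0001 matches either 0x0001 or 0x8001, since only the low 15 bits
 * must be equal and the extra restriction applies to full-member pkeys.
 */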
1223
1224/*
1225 * egress_pkey_check - return 0 if hdr's pkey matches according to the
1226 * criteria in the OPAv1 spec., section 9.11.7.
1227 */
1228static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
1229 struct hfi1_ib_header *hdr,
1230 struct hfi1_qp *qp)
1231{
1232 struct hfi1_other_headers *ohdr;
1233 struct hfi1_devdata *dd;
1234 int i = 0;
1235 u16 pkey;
1236 u8 lnh, sc5 = qp->s_sc;
1237
1238 if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
1239 return 0;
1240
1241 /* locate the pkey within the headers */
1242 lnh = be16_to_cpu(hdr->lrh[0]) & 3;
1243 if (lnh == HFI1_LRH_GRH)
1244 ohdr = &hdr->u.l.oth;
1245 else
1246 ohdr = &hdr->u.oth;
1247
1248 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
1249
1250 /* If SC15, pkey[0:14] must be 0x7fff */
1251 if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1252 goto bad;
1253
1254
1255 /* Is the pkey = 0x0, or 0x8000? */
1256 if ((pkey & PKEY_LOW_15_MASK) == 0)
1257 goto bad;
1258
1259 /* The most likely matching pkey has index qp->s_pkey_index */
1260 if (unlikely(!egress_pkey_matches_entry(pkey,
1261 ppd->pkeys[qp->s_pkey_index]))) {
1262 /* no match - try the entire table */
1263 for (; i < MAX_PKEY_VALUES; i++) {
1264 if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
1265 break;
1266 }
1267 }
1268
1269 if (i < MAX_PKEY_VALUES)
1270 return 0;
1271bad:
1272 incr_cntr64(&ppd->port_xmit_constraint_errors);
1273 dd = ppd->dd;
1274 if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
1275 u16 slid = be16_to_cpu(hdr->lrh[3]);
1276
1277 dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
1278 dd->err_info_xmit_constraint.slid = slid;
1279 dd->err_info_xmit_constraint.pkey = pkey;
1280 }
1281 return 1;
1282}
1283
1284/**
1285 * hfi1_verbs_send - send a packet
1286 * @qp: the QP to send on
1287 * @ahdr: the packet header
1288 * @hdrwords: the number of 32-bit words in the header
1289 * @ss: the SGE to send
1290 * @len: the length of the packet in bytes
1291 *
1292 * Return zero if packet is sent or queued OK.
1293 * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise.
1294 */
1295int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
1296 u32 hdrwords, struct hfi1_sge_state *ss, u32 len)
1297{
1298 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1299 u32 plen;
1300 int ret;
1301 int pio = 0;
1302 unsigned long flags = 0;
1303 u32 dwords = (len + 3) >> 2;
1304
1305 /*
1306 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
1307 * can defer SDMA restart until link goes ACTIVE without
1308 * worrying about just how we got there.
1309 */
1310 if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
1311 !(dd->flags & HFI1_HAS_SEND_DMA))
1312 pio = 1;
1313
1314 ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp);
1315 if (unlikely(ret)) {
1316 /*
1317 * The value we are returning here does not get propagated to
1318 * the verbs caller. Thus we need to complete the request with an
1319 * error, otherwise the caller could be sitting waiting on the
1320 * completion event. Only do this for PIO. SDMA has its own
1321 * mechanism for handling the errors. So for SDMA we can just
1322 * return.
1323 */
1324 if (pio) {
1325 hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1326 __func__);
1327 spin_lock_irqsave(&qp->s_lock, flags);
1328 hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1329 spin_unlock_irqrestore(&qp->s_lock, flags);
1330 }
1331 return -EINVAL;
1332 }
1333
1334 /*
1335 * Calculate the send buffer trigger address.
1336 * The +2 accounts for the PBC control qword (2 dwords).
1337 */
1338 plen = hdrwords + dwords + 2;
1339
1340 if (pio) {
1341 ret = dd->process_pio_send(
1342 qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
1343 } else {
1344#ifdef CONFIG_SDMA_VERBOSITY
1345 dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
1346 slashstrip(__FILE__), __LINE__, __func__);
1347 dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", hdrwords, len);
1348#endif
1349 ret = dd->process_dma_send(
1350 qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
1351 }
1352
1353 return ret;
1354}
1355
1356static int query_device(struct ib_device *ibdev,
1357 struct ib_device_attr *props,
1358 struct ib_udata *uhw)
1359{
1360 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1361 struct hfi1_ibdev *dev = to_idev(ibdev);
1362
1363 if (uhw->inlen || uhw->outlen)
1364 return -EINVAL;
1365 memset(props, 0, sizeof(*props));
1366
1367 props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1368 IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1369 IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1370 IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
1371
1372 props->page_size_cap = PAGE_SIZE;
1373 props->vendor_id =
1374 dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1375 props->vendor_part_id = dd->pcidev->device;
1376 props->hw_ver = dd->minrev;
1377 props->sys_image_guid = ib_hfi1_sys_image_guid;
1378 props->max_mr_size = ~0ULL;
1379 props->max_qp = hfi1_max_qps;
1380 props->max_qp_wr = hfi1_max_qp_wrs;
1381 props->max_sge = hfi1_max_sges;
1382 props->max_sge_rd = hfi1_max_sges;
1383 props->max_cq = hfi1_max_cqs;
1384 props->max_ah = hfi1_max_ahs;
1385 props->max_cqe = hfi1_max_cqes;
1386 props->max_mr = dev->lk_table.max;
1387 props->max_fmr = dev->lk_table.max;
1388 props->max_map_per_fmr = 32767;
1389 props->max_pd = hfi1_max_pds;
1390 props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1391 props->max_qp_init_rd_atom = 255;
1392 /* props->max_res_rd_atom */
1393 props->max_srq = hfi1_max_srqs;
1394 props->max_srq_wr = hfi1_max_srq_wrs;
1395 props->max_srq_sge = hfi1_max_srq_sges;
1396 /* props->local_ca_ack_delay */
1397 props->atomic_cap = IB_ATOMIC_GLOB;
1398 props->max_pkeys = hfi1_get_npkeys(dd);
1399 props->max_mcast_grp = hfi1_max_mcast_grps;
1400 props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1401 props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1402 props->max_mcast_grp;
1403
1404 return 0;
1405}
1406
1407static inline u16 opa_speed_to_ib(u16 in)
1408{
1409 u16 out = 0;
1410
1411 if (in & OPA_LINK_SPEED_25G)
1412 out |= IB_SPEED_EDR;
1413 if (in & OPA_LINK_SPEED_12_5G)
1414 out |= IB_SPEED_FDR;
1415
1416 return out;
1417}
1418
1419/*
1420 * Convert a single OPA link width (no multiple flags) to an IB value.
1421 * A zero OPA link width means link down, which means the IB width value
1422 * is a don't care.
1423 */
1424static inline u16 opa_width_to_ib(u16 in)
1425{
1426 switch (in) {
1427 case OPA_LINK_WIDTH_1X:
1428 /* map 2x and 3x to 1x as they don't exist in IB */
1429 case OPA_LINK_WIDTH_2X:
1430 case OPA_LINK_WIDTH_3X:
1431 return IB_WIDTH_1X;
1432 default: /* link down or unknown, return our largest width */
1433 case OPA_LINK_WIDTH_4X:
1434 return IB_WIDTH_4X;
1435 }
1436}
1437
1438static int query_port(struct ib_device *ibdev, u8 port,
1439 struct ib_port_attr *props)
1440{
1441 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1442 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1443 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1444 u16 lid = ppd->lid;
1445
1446 memset(props, 0, sizeof(*props));
1447 props->lid = lid ? lid : 0;
1448 props->lmc = ppd->lmc;
1449 props->sm_lid = ibp->sm_lid;
1450 props->sm_sl = ibp->sm_sl;
1451 /* OPA logical states match IB logical states */
1452 props->state = driver_lstate(ppd);
1453 props->phys_state = hfi1_ibphys_portstate(ppd);
1454 props->port_cap_flags = ibp->port_cap_flags;
1455 props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1456 props->max_msg_sz = 0x80000000;
1457 props->pkey_tbl_len = hfi1_get_npkeys(dd);
1458 props->bad_pkey_cntr = ibp->pkey_violations;
1459 props->qkey_viol_cntr = ibp->qkey_violations;
1460 props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1461 /* see rate_show() in ib core/sysfs.c */
1462 props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
1463 props->max_vl_num = ppd->vls_supported;
1464 props->init_type_reply = 0;
1465
1466 /* Once we are a "first class" citizen and have added the OPA MTUs to
1467 * the core, we can advertise the larger MTU enum to the ULPs; for now,
1468 * advertise only 4K.
1469 *
1470 * Those applications which are either OPA aware or pass the MTU enum
1471 * from the Path Records to us will get the new 8k MTU. Those that
1472 * attempt to process the MTU enum may fail in various ways.
1473 */
1474 props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1475 4096 : hfi1_max_mtu), IB_MTU_4096);
1476 props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1477 mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
1478 props->subnet_timeout = ibp->subnet_timeout;
1479
1480 return 0;
1481}
1482
1483static int port_immutable(struct ib_device *ibdev, u8 port_num,
1484 struct ib_port_immutable *immutable)
1485{
1486 struct ib_port_attr attr;
1487 int err;
1488
1489 err = query_port(ibdev, port_num, &attr);
1490 if (err)
1491 return err;
1492
1493 memset(immutable, 0, sizeof(*immutable));
1494
1495 immutable->pkey_tbl_len = attr.pkey_tbl_len;
1496 immutable->gid_tbl_len = attr.gid_tbl_len;
1497 immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
1498 immutable->max_mad_size = OPA_MGMT_MAD_SIZE;
1499
1500 return 0;
1501}
1502
1503static int modify_device(struct ib_device *device,
1504 int device_modify_mask,
1505 struct ib_device_modify *device_modify)
1506{
1507 struct hfi1_devdata *dd = dd_from_ibdev(device);
1508 unsigned i;
1509 int ret;
1510
1511 if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1512 IB_DEVICE_MODIFY_NODE_DESC)) {
1513 ret = -EOPNOTSUPP;
1514 goto bail;
1515 }
1516
1517 if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1518 memcpy(device->node_desc, device_modify->node_desc, 64);
1519 for (i = 0; i < dd->num_pports; i++) {
1520 struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1521
1522 hfi1_node_desc_chg(ibp);
1523 }
1524 }
1525
1526 if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1527 ib_hfi1_sys_image_guid =
1528 cpu_to_be64(device_modify->sys_image_guid);
1529 for (i = 0; i < dd->num_pports; i++) {
1530 struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1531
1532 hfi1_sys_guid_chg(ibp);
1533 }
1534 }
1535
1536 ret = 0;
1537
1538bail:
1539 return ret;
1540}
1541
1542static int modify_port(struct ib_device *ibdev, u8 port,
1543 int port_modify_mask, struct ib_port_modify *props)
1544{
1545 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1546 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1547 int ret = 0;
1548
1549 ibp->port_cap_flags |= props->set_port_cap_mask;
1550 ibp->port_cap_flags &= ~props->clr_port_cap_mask;
1551 if (props->set_port_cap_mask || props->clr_port_cap_mask)
1552 hfi1_cap_mask_chg(ibp);
1553 if (port_modify_mask & IB_PORT_SHUTDOWN) {
1554 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1555 OPA_LINKDOWN_REASON_UNKNOWN);
1556 ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1557 }
1558 if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
1559 ibp->qkey_violations = 0;
1560 return ret;
1561}
1562
1563static int query_gid(struct ib_device *ibdev, u8 port,
1564 int index, union ib_gid *gid)
1565{
1566 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1567 int ret = 0;
1568
1569 if (!port || port > dd->num_pports)
1570 ret = -EINVAL;
1571 else {
1572 struct hfi1_ibport *ibp = to_iport(ibdev, port);
1573 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1574
1575 gid->global.subnet_prefix = ibp->gid_prefix;
1576 if (index == 0)
1577 gid->global.interface_id = cpu_to_be64(ppd->guid);
1578 else if (index < HFI1_GUIDS_PER_PORT)
1579 gid->global.interface_id = ibp->guids[index - 1];
1580 else
1581 ret = -EINVAL;
1582 }
1583
1584 return ret;
1585}
1586
1587static struct ib_pd *alloc_pd(struct ib_device *ibdev,
1588 struct ib_ucontext *context,
1589 struct ib_udata *udata)
1590{
1591 struct hfi1_ibdev *dev = to_idev(ibdev);
1592 struct hfi1_pd *pd;
1593 struct ib_pd *ret;
1594
1595 /*
1596 * This is actually totally arbitrary. Some correctness tests
1597 * assume there's a maximum number of PDs that can be allocated.
1598 * We don't actually have this limit, but we fail the test if
1599 * we allow allocations of more than we report for this value.
1600 */
1601
1602 pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1603 if (!pd) {
1604 ret = ERR_PTR(-ENOMEM);
1605 goto bail;
1606 }
1607
1608 spin_lock(&dev->n_pds_lock);
1609 if (dev->n_pds_allocated == hfi1_max_pds) {
1610 spin_unlock(&dev->n_pds_lock);
1611 kfree(pd);
1612 ret = ERR_PTR(-ENOMEM);
1613 goto bail;
1614 }
1615
1616 dev->n_pds_allocated++;
1617 spin_unlock(&dev->n_pds_lock);
1618
1619 /* ib_alloc_pd() will initialize pd->ibpd. */
1620 pd->user = udata != NULL;
1621
1622 ret = &pd->ibpd;
1623
1624bail:
1625 return ret;
1626}
1627
1628static int dealloc_pd(struct ib_pd *ibpd)
1629{
1630 struct hfi1_pd *pd = to_ipd(ibpd);
1631 struct hfi1_ibdev *dev = to_idev(ibpd->device);
1632
1633 spin_lock(&dev->n_pds_lock);
1634 dev->n_pds_allocated--;
1635 spin_unlock(&dev->n_pds_lock);
1636
1637 kfree(pd);
1638
1639 return 0;
1640}
1641
1642/*
1643 * convert ah port,sl to sc
1644 */
1645u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
1646{
1647 struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
1648
1649 return ibp->sl_to_sc[ah->sl];
1650}
1651
1652int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
1653{
1654 struct hfi1_ibport *ibp;
1655 struct hfi1_pportdata *ppd;
1656 struct hfi1_devdata *dd;
1657 u8 sc5;
1658
1659 /* A multicast address requires a GRH (see ch. 8.4.1). */
1660 if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE &&
1661 ah_attr->dlid != HFI1_PERMISSIVE_LID &&
1662 !(ah_attr->ah_flags & IB_AH_GRH))
1663 goto bail;
1664 if ((ah_attr->ah_flags & IB_AH_GRH) &&
1665 ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT)
1666 goto bail;
1667 if (ah_attr->dlid == 0)
1668 goto bail;
1669 if (ah_attr->port_num < 1 ||
1670 ah_attr->port_num > ibdev->phys_port_cnt)
1671 goto bail;
1672 if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
1673 ib_rate_to_mbps(ah_attr->static_rate) < 0)
1674 goto bail;
1675 if (ah_attr->sl >= OPA_MAX_SLS)
1676 goto bail;
1677 /* test the mapping for validity */
1678 ibp = to_iport(ibdev, ah_attr->port_num);
1679 ppd = ppd_from_ibp(ibp);
1680 sc5 = ibp->sl_to_sc[ah_attr->sl];
1681 dd = dd_from_ppd(ppd);
1682 if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1683 goto bail;
1684 return 0;
1685bail:
1686 return -EINVAL;
1687}
1688
1689/**
1690 * create_ah - create an address handle
1691 * @pd: the protection domain
1692 * @ah_attr: the attributes of the AH
1693 *
1694 * This may be called from interrupt context.
1695 */
1696static struct ib_ah *create_ah(struct ib_pd *pd,
1697 struct ib_ah_attr *ah_attr)
1698{
1699 struct hfi1_ah *ah;
1700 struct ib_ah *ret;
1701 struct hfi1_ibdev *dev = to_idev(pd->device);
1702 unsigned long flags;
1703
1704 if (hfi1_check_ah(pd->device, ah_attr)) {
1705 ret = ERR_PTR(-EINVAL);
1706 goto bail;
1707 }
1708
1709 ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
1710 if (!ah) {
1711 ret = ERR_PTR(-ENOMEM);
1712 goto bail;
1713 }
1714
1715 spin_lock_irqsave(&dev->n_ahs_lock, flags);
1716 if (dev->n_ahs_allocated == hfi1_max_ahs) {
1717 spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1718 kfree(ah);
1719 ret = ERR_PTR(-ENOMEM);
1720 goto bail;
1721 }
1722
1723 dev->n_ahs_allocated++;
1724 spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1725
1726 /* ib_create_ah() will initialize ah->ibah. */
1727 ah->attr = *ah_attr;
1728 atomic_set(&ah->refcount, 0);
1729
1730 ret = &ah->ibah;
1731
1732bail:
1733 return ret;
1734}
1735
1736struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
1737{
1738 struct ib_ah_attr attr;
1739 struct ib_ah *ah = ERR_PTR(-EINVAL);
1740 struct hfi1_qp *qp0;
1741
1742 memset(&attr, 0, sizeof(attr));
1743 attr.dlid = dlid;
1744 attr.port_num = ppd_from_ibp(ibp)->port;
1745 rcu_read_lock();
1746 qp0 = rcu_dereference(ibp->qp[0]);
1747 if (qp0)
1748 ah = ib_create_ah(qp0->ibqp.pd, &attr);
1749 rcu_read_unlock();
1750 return ah;
1751}
1752
1753/**
1754 * destroy_ah - destroy an address handle
1755 * @ibah: the AH to destroy
1756 *
1757 * This may be called from interrupt context.
1758 */
1759static int destroy_ah(struct ib_ah *ibah)
1760{
1761 struct hfi1_ibdev *dev = to_idev(ibah->device);
1762 struct hfi1_ah *ah = to_iah(ibah);
1763 unsigned long flags;
1764
1765 if (atomic_read(&ah->refcount) != 0)
1766 return -EBUSY;
1767
1768 spin_lock_irqsave(&dev->n_ahs_lock, flags);
1769 dev->n_ahs_allocated--;
1770 spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1771
1772 kfree(ah);
1773
1774 return 0;
1775}
1776
1777static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
1778{
1779 struct hfi1_ah *ah = to_iah(ibah);
1780
1781 if (hfi1_check_ah(ibah->device, ah_attr))
1782 return -EINVAL;
1783
1784 ah->attr = *ah_attr;
1785
1786 return 0;
1787}
1788
1789static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
1790{
1791 struct hfi1_ah *ah = to_iah(ibah);
1792
1793 *ah_attr = ah->attr;
1794
1795 return 0;
1796}
1797
1798/**
1799 * hfi1_get_npkeys - return the size of the PKEY table for context 0
1800 * @dd: the hfi1_ib device
1801 */
1802unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1803{
1804 return ARRAY_SIZE(dd->pport[0].pkeys);
1805}
1806
1807static int query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1808 u16 *pkey)
1809{
1810 struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1811 int ret;
1812
1813 if (index >= hfi1_get_npkeys(dd)) {
1814 ret = -EINVAL;
1815 goto bail;
1816 }
1817
1818 *pkey = hfi1_get_pkey(to_iport(ibdev, port), index);
1819 ret = 0;
1820
1821bail:
1822 return ret;
1823}
1824
1825/**
1826 * alloc_ucontext - allocate a ucontext
1827 * @ibdev: the infiniband device
1828 * @udata: not used by the driver
1829 */
1830
1831static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev,
1832 struct ib_udata *udata)
1833{
1834 struct hfi1_ucontext *context;
1835 struct ib_ucontext *ret;
1836
1837 context = kmalloc(sizeof(*context), GFP_KERNEL);
1838 if (!context) {
1839 ret = ERR_PTR(-ENOMEM);
1840 goto bail;
1841 }
1842
1843 ret = &context->ibucontext;
1844
1845bail:
1846 return ret;
1847}
1848
1849static int dealloc_ucontext(struct ib_ucontext *context)
1850{
1851 kfree(to_iucontext(context));
1852 return 0;
1853}
1854
1855static void init_ibport(struct hfi1_pportdata *ppd)
1856{
1857 struct hfi1_ibport *ibp = &ppd->ibport_data;
1858 size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1859 int i;
1860
1861 for (i = 0; i < sz; i++) {
1862 ibp->sl_to_sc[i] = i;
1863 ibp->sc_to_sl[i] = i;
1864 }
1865
1866 spin_lock_init(&ibp->lock);
1867 /* Set the prefix to the default value (see ch. 4.1.1) */
1868 ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
1869 ibp->sm_lid = 0;
1870 /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
1871 ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1872 IB_PORT_CAP_MASK_NOTICE_SUP;
1873 ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1874 ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1875 ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1876 ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1877 ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1878
1879 RCU_INIT_POINTER(ibp->qp[0], NULL);
1880 RCU_INIT_POINTER(ibp->qp[1], NULL);
1881}
1882
1883static void verbs_txreq_kmem_cache_ctor(void *obj)
1884{
1885 struct verbs_txreq *tx = (struct verbs_txreq *)obj;
1886
1887 memset(tx, 0, sizeof(*tx));
1888}
1889
1890/**
1891 * hfi1_register_ib_device - register our device with the infiniband core
1892 * @dd: the device data structure
1893 * Return 0 if successful, errno if unsuccessful.
1894 */
1895int hfi1_register_ib_device(struct hfi1_devdata *dd)
1896{
1897 struct hfi1_ibdev *dev = &dd->verbs_dev;
1898 struct ib_device *ibdev = &dev->ibdev;
1899 struct hfi1_pportdata *ppd = dd->pport;
1900 unsigned i, lk_tab_size;
1901 int ret;
1902 size_t lcpysz = IB_DEVICE_NAME_MAX;
1903 u16 descq_cnt;
1904
1905 ret = hfi1_qp_init(dev);
1906 if (ret)
1907 goto err_qp_init;
1908
1909
1910 for (i = 0; i < dd->num_pports; i++)
1911 init_ibport(ppd + i);
1912
1913 /* Only need to initialize non-zero fields. */
1914 spin_lock_init(&dev->n_pds_lock);
1915 spin_lock_init(&dev->n_ahs_lock);
1916 spin_lock_init(&dev->n_cqs_lock);
1917 spin_lock_init(&dev->n_qps_lock);
1918 spin_lock_init(&dev->n_srqs_lock);
1919 spin_lock_init(&dev->n_mcast_grps_lock);
1920 init_timer(&dev->mem_timer);
1921 dev->mem_timer.function = mem_timer;
1922 dev->mem_timer.data = (unsigned long) dev;
1923
1924 /*
1925 * The top hfi1_lkey_table_size bits are used to index the
1926 * table. The lower 8 bits can be owned by the user (copied from
1927 * the LKEY). The remaining bits act as a generation number or tag.
1928 */
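	/*
	 * Worked example (added for illustration, assuming the layout
	 * described above): with hfi1_lkey_table_size == 16 the table has
	 * 1 << 16 entries, bits 31..16 of an LKEY select its table slot,
	 * bits 7..0 are the user-owned portion, and bits 15..8 carry the
	 * generation tag that changes when a slot is reused.
	 */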
1929 spin_lock_init(&dev->lk_table.lock);
1930 dev->lk_table.max = 1 << hfi1_lkey_table_size;
1931 /* ensure generation is at least 4 bits (keys.c) */
1932 if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) {
1933 dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
1934 hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS);
1935 hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS;
1936 }
1937 lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
1938 dev->lk_table.table = (struct hfi1_mregion __rcu **)
1939 vmalloc(lk_tab_size);
1940 if (dev->lk_table.table == NULL) {
1941 ret = -ENOMEM;
1942 goto err_lk;
1943 }
1944 RCU_INIT_POINTER(dev->dma_mr, NULL);
1945 for (i = 0; i < dev->lk_table.max; i++)
1946 RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
1947 INIT_LIST_HEAD(&dev->pending_mmaps);
1948 spin_lock_init(&dev->pending_lock);
1949 seqlock_init(&dev->iowait_lock);
1950 dev->mmap_offset = PAGE_SIZE;
1951 spin_lock_init(&dev->mmap_offset_lock);
1952 INIT_LIST_HEAD(&dev->txwait);
1953 INIT_LIST_HEAD(&dev->memwait);
1954
1955 descq_cnt = sdma_get_descq_cnt();
1956
1957 /* SLAB_HWCACHE_ALIGN for AHG */
1958 dev->verbs_txreq_cache = kmem_cache_create("hfi1_vtxreq_cache",
1959 sizeof(struct verbs_txreq),
1960 0, SLAB_HWCACHE_ALIGN,
1961 verbs_txreq_kmem_cache_ctor);
1962 if (!dev->verbs_txreq_cache) {
1963 ret = -ENOMEM;
1964 goto err_verbs_txreq;
1965 }
1966
1967 /*
1968 * The system image GUID is supposed to be the same for all
1969 * HFIs in a single system but since there can be other
1970 * device types in the system, we can't be sure this is unique.
1971 */
1972 if (!ib_hfi1_sys_image_guid)
1973 ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
1974 lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
1975 strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
1976 ibdev->owner = THIS_MODULE;
1977 ibdev->node_guid = cpu_to_be64(ppd->guid);
1978 ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION;
1979 ibdev->uverbs_cmd_mask =
1980 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
1981 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
1982 (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
1983 (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
1984 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
1985 (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
1986 (1ull << IB_USER_VERBS_CMD_MODIFY_AH) |
1987 (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
1988 (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
1989 (1ull << IB_USER_VERBS_CMD_REG_MR) |
1990 (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
1991 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
1992 (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
1993 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
1994 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
1995 (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
1996 (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
1997 (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
1998 (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
1999 (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
2000 (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
2001 (1ull << IB_USER_VERBS_CMD_POST_SEND) |
2002 (1ull << IB_USER_VERBS_CMD_POST_RECV) |
2003 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
2004 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
2005 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
2006 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
2007 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
2008 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
2009 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
2010 ibdev->node_type = RDMA_NODE_IB_CA;
2011 ibdev->phys_port_cnt = dd->num_pports;
2012 ibdev->num_comp_vectors = 1;
2013 ibdev->dma_device = &dd->pcidev->dev;
2014 ibdev->query_device = query_device;
2015 ibdev->modify_device = modify_device;
2016 ibdev->query_port = query_port;
2017 ibdev->modify_port = modify_port;
2018 ibdev->query_pkey = query_pkey;
2019 ibdev->query_gid = query_gid;
2020 ibdev->alloc_ucontext = alloc_ucontext;
2021 ibdev->dealloc_ucontext = dealloc_ucontext;
2022 ibdev->alloc_pd = alloc_pd;
2023 ibdev->dealloc_pd = dealloc_pd;
2024 ibdev->create_ah = create_ah;
2025 ibdev->destroy_ah = destroy_ah;
2026 ibdev->modify_ah = modify_ah;
2027 ibdev->query_ah = query_ah;
2028 ibdev->create_srq = hfi1_create_srq;
2029 ibdev->modify_srq = hfi1_modify_srq;
2030 ibdev->query_srq = hfi1_query_srq;
2031 ibdev->destroy_srq = hfi1_destroy_srq;
2032 ibdev->create_qp = hfi1_create_qp;
2033 ibdev->modify_qp = hfi1_modify_qp;
2034 ibdev->query_qp = hfi1_query_qp;
2035 ibdev->destroy_qp = hfi1_destroy_qp;
2036 ibdev->post_send = post_send;
2037 ibdev->post_recv = post_receive;
2038 ibdev->post_srq_recv = hfi1_post_srq_receive;
2039 ibdev->create_cq = hfi1_create_cq;
2040 ibdev->destroy_cq = hfi1_destroy_cq;
2041 ibdev->resize_cq = hfi1_resize_cq;
2042 ibdev->poll_cq = hfi1_poll_cq;
2043 ibdev->req_notify_cq = hfi1_req_notify_cq;
2044 ibdev->get_dma_mr = hfi1_get_dma_mr;
2045 ibdev->reg_phys_mr = hfi1_reg_phys_mr;
2046 ibdev->reg_user_mr = hfi1_reg_user_mr;
2047 ibdev->dereg_mr = hfi1_dereg_mr;
2048 ibdev->alloc_mr = hfi1_alloc_mr;
2049 ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list;
2050 ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list;
2051 ibdev->alloc_fmr = hfi1_alloc_fmr;
2052 ibdev->map_phys_fmr = hfi1_map_phys_fmr;
2053 ibdev->unmap_fmr = hfi1_unmap_fmr;
2054 ibdev->dealloc_fmr = hfi1_dealloc_fmr;
2055 ibdev->attach_mcast = hfi1_multicast_attach;
2056 ibdev->detach_mcast = hfi1_multicast_detach;
2057 ibdev->process_mad = hfi1_process_mad;
2058 ibdev->mmap = hfi1_mmap;
2059 ibdev->dma_ops = &hfi1_dma_mapping_ops;
2060 ibdev->get_port_immutable = port_immutable;
2061
2062 strncpy(ibdev->node_desc, init_utsname()->nodename,
2063 sizeof(ibdev->node_desc));
2064
2065 ret = ib_register_device(ibdev, hfi1_create_port_files);
2066 if (ret)
2067 goto err_reg;
2068
2069 ret = hfi1_create_agents(dev);
2070 if (ret)
2071 goto err_agents;
2072
2073 ret = hfi1_verbs_register_sysfs(dd);
2074 if (ret)
2075 goto err_class;
2076
2077 goto bail;
2078
2079err_class:
2080 hfi1_free_agents(dev);
2081err_agents:
2082 ib_unregister_device(ibdev);
2083err_reg:
2084err_verbs_txreq:
2085 kmem_cache_destroy(dev->verbs_txreq_cache);
2086 vfree(dev->lk_table.table);
2087err_lk:
2088 hfi1_qp_exit(dev);
2089err_qp_init:
2090 dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
2091bail:
2092 return ret;
2093}
2094
2095void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
2096{
2097 struct hfi1_ibdev *dev = &dd->verbs_dev;
2098 struct ib_device *ibdev = &dev->ibdev;
2099
2100 hfi1_verbs_unregister_sysfs(dd);
2101
2102 hfi1_free_agents(dev);
2103
2104 ib_unregister_device(ibdev);
2105
2106 if (!list_empty(&dev->txwait))
2107 dd_dev_err(dd, "txwait list not empty!\n");
2108 if (!list_empty(&dev->memwait))
2109 dd_dev_err(dd, "memwait list not empty!\n");
2110 if (dev->dma_mr)
2111 dd_dev_err(dd, "DMA MR not NULL!\n");
2112
2113 hfi1_qp_exit(dev);
2114 del_timer_sync(&dev->mem_timer);
2115 kmem_cache_destroy(dev->verbs_txreq_cache);
2116 vfree(dev->lk_table.table);
2117}
2118
2119/*
2120 * This must be called with s_lock held.
2121 */
2122void hfi1_schedule_send(struct hfi1_qp *qp)
2123{
2124 if (hfi1_send_ok(qp)) {
2125 struct hfi1_ibport *ibp =
2126 to_iport(qp->ibqp.device, qp->port_num);
2127 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
2128
2129 iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
2130 }
2131}
2132
2133void hfi1_cnp_rcv(struct hfi1_packet *packet)
2134{
2135 struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
2136
2137 if (packet->qp->ibqp.qp_type == IB_QPT_UC)
2138 hfi1_uc_rcv(packet);
2139 else if (packet->qp->ibqp.qp_type == IB_QPT_UD)
2140 hfi1_ud_rcv(packet);
2141 else
2142 ibp->n_pkt_drops++;
2143}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
new file mode 100644
index 000000000000..ed903a93baf7
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -0,0 +1,1151 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#ifndef HFI1_VERBS_H
52#define HFI1_VERBS_H
53
54#include <linux/types.h>
55#include <linux/seqlock.h>
56#include <linux/kernel.h>
57#include <linux/interrupt.h>
58#include <linux/kref.h>
59#include <linux/workqueue.h>
60#include <linux/kthread.h>
61#include <linux/completion.h>
62#include <rdma/ib_pack.h>
63#include <rdma/ib_user_verbs.h>
64#include <rdma/ib_mad.h>
65
66struct hfi1_ctxtdata;
67struct hfi1_pportdata;
68struct hfi1_devdata;
69struct hfi1_packet;
70
71#include "iowait.h"
72
73#define HFI1_MAX_RDMA_ATOMIC 16
74#define HFI1_GUIDS_PER_PORT 5
75
76/*
77 * Increment this value if any changes that break userspace ABI
78 * compatibility are made.
79 */
80#define HFI1_UVERBS_ABI_VERSION 2
81
82/*
83 * Define an ib_cq_notify value that is not valid so we know when CQ
84 * notifications are armed.
85 */
86#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
87
88#define IB_SEQ_NAK (3 << 29)
89
90/* AETH NAK opcode values */
91#define IB_RNR_NAK 0x20
92#define IB_NAK_PSN_ERROR 0x60
93#define IB_NAK_INVALID_REQUEST 0x61
94#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
95#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
96#define IB_NAK_INVALID_RD_REQUEST 0x64
97
98/* Flags for checking QP state (see ib_hfi1_state_ops[]) */
99#define HFI1_POST_SEND_OK 0x01
100#define HFI1_POST_RECV_OK 0x02
101#define HFI1_PROCESS_RECV_OK 0x04
102#define HFI1_PROCESS_SEND_OK 0x08
103#define HFI1_PROCESS_NEXT_SEND_OK 0x10
104#define HFI1_FLUSH_SEND 0x20
105#define HFI1_FLUSH_RECV 0x40
106#define HFI1_PROCESS_OR_FLUSH_SEND \
107 (HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND)
108
109/* IB Performance Manager status values */
110#define IB_PMA_SAMPLE_STATUS_DONE 0x00
111#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
112#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
113
114/* Mandatory IB performance counter select values. */
115#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
116#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
117#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
118#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
119#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
120
121#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0)
122
123#define IB_BTH_REQ_ACK (1 << 31)
124#define IB_BTH_SOLICITED (1 << 23)
125#define IB_BTH_MIG_REQ (1 << 22)
126
127#define IB_GRH_VERSION 6
128#define IB_GRH_VERSION_MASK 0xF
129#define IB_GRH_VERSION_SHIFT 28
130#define IB_GRH_TCLASS_MASK 0xFF
131#define IB_GRH_TCLASS_SHIFT 20
132#define IB_GRH_FLOW_MASK 0xFFFFF
133#define IB_GRH_FLOW_SHIFT 0
134#define IB_GRH_NEXT_HDR 0x1B
135
136#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
137
138/* flags passed by hfi1_ib_rcv() */
139enum {
140 HFI1_HAS_GRH = (1 << 0),
141};
142
143struct ib_reth {
144 __be64 vaddr;
145 __be32 rkey;
146 __be32 length;
147} __packed;
148
149struct ib_atomic_eth {
150 __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
151 __be32 rkey;
152 __be64 swap_data;
153 __be64 compare_data;
154} __packed;
155
156union ib_ehdrs {
157 struct {
158 __be32 deth[2];
159 __be32 imm_data;
160 } ud;
161 struct {
162 struct ib_reth reth;
163 __be32 imm_data;
164 } rc;
165 struct {
166 __be32 aeth;
167 __be32 atomic_ack_eth[2];
168 } at;
169 __be32 imm_data;
170 __be32 aeth;
171 struct ib_atomic_eth atomic_eth;
172} __packed;
173
174struct hfi1_other_headers {
175 __be32 bth[3];
176 union ib_ehdrs u;
177} __packed;
178
179/*
180 * Note that UD packets with a GRH are 8 (LRH) + 40 (GRH) + 12 (BTH) +
181 * 8 (DETH) = 68 bytes long (72 w/ imm_data). Only the first 56 bytes
182 * of the IB header will be in the eager header buffer. The remaining
183 * 12 or 16 bytes are in the data buffer.
184 */
185struct hfi1_ib_header {
186 __be16 lrh[4];
187 union {
188 struct {
189 struct ib_grh grh;
190 struct hfi1_other_headers oth;
191 } l;
192 struct hfi1_other_headers oth;
193 } u;
194} __packed;
195
196struct ahg_ib_header {
197 struct sdma_engine *sde;
198 u32 ahgdesc[2];
199 u16 tx_flags;
200 u8 ahgcount;
201 u8 ahgidx;
202 struct hfi1_ib_header ibh;
203};
204
205struct hfi1_pio_header {
206 __le64 pbc;
207 struct hfi1_ib_header hdr;
208} __packed;
209
210/*
211 * used to force cacheline alignment for AHG
212 */
213struct tx_pio_header {
214 struct hfi1_pio_header phdr;
215} ____cacheline_aligned;
216
217/*
218 * There is one struct hfi1_mcast for each multicast GID.
219 * All attached QPs are then stored as a list of
220 * struct hfi1_mcast_qp.
221 */
222struct hfi1_mcast_qp {
223 struct list_head list;
224 struct hfi1_qp *qp;
225};
226
227struct hfi1_mcast {
228 struct rb_node rb_node;
229 union ib_gid mgid;
230 struct list_head qp_list;
231 wait_queue_head_t wait;
232 atomic_t refcount;
233 int n_attached;
234};
235
236/* Protection domain */
237struct hfi1_pd {
238 struct ib_pd ibpd;
239 int user; /* non-zero if created from user space */
240};
241
242/* Address Handle */
243struct hfi1_ah {
244 struct ib_ah ibah;
245 struct ib_ah_attr attr;
246 atomic_t refcount;
247};
248
249/*
250 * This structure is used by hfi1_mmap() to validate an offset
251 * when an mmap() request is made. The vm_area_struct then uses
252 * this as its vm_private_data.
253 */
254struct hfi1_mmap_info {
255 struct list_head pending_mmaps;
256 struct ib_ucontext *context;
257 void *obj;
258 __u64 offset;
259 struct kref ref;
260 unsigned size;
261};
262
263/*
264 * This structure is used to contain the head pointer, tail pointer,
265 * and completion queue entries as a single memory allocation so
266 * it can be mmap'ed into user space.
267 */
268struct hfi1_cq_wc {
269 u32 head; /* index of next entry to fill */
270 u32 tail; /* index of next ib_poll_cq() entry */
271 union {
272 /* these are actually size ibcq.cqe + 1 */
273 struct ib_uverbs_wc uqueue[0];
274 struct ib_wc kqueue[0];
275 };
276};
277
278/*
279 * The completion queue structure.
280 */
281struct hfi1_cq {
282 struct ib_cq ibcq;
283 struct kthread_work comptask;
284 struct hfi1_devdata *dd;
285 spinlock_t lock; /* protect changes in this struct */
286 u8 notify;
287 u8 triggered;
288 struct hfi1_cq_wc *queue;
289 struct hfi1_mmap_info *ip;
290};
291
292/*
293 * A segment is a linear region of low physical memory.
294 * Used by the verbs layer.
295 */
296struct hfi1_seg {
297 void *vaddr;
298 size_t length;
299};
300
301/* The number of hfi1_segs that fit in a page. */
302#define HFI1_SEGSZ (PAGE_SIZE / sizeof(struct hfi1_seg))
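/*
 * For example (illustrative figures, not from the original source): on a
 * 64-bit build with 4 KiB pages, struct hfi1_seg is 16 bytes (an 8-byte
 * pointer plus an 8-byte size_t), so HFI1_SEGSZ works out to
 * 4096 / 16 = 256 segments per page.
 */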
303
304struct hfi1_segarray {
305 struct hfi1_seg segs[HFI1_SEGSZ];
306};
307
308struct hfi1_mregion {
309 struct ib_pd *pd; /* shares refcnt of ibmr.pd */
310 u64 user_base; /* User's address for this region */
311 u64 iova; /* IB start address of this region */
312 size_t length;
313 u32 lkey;
314 u32 offset; /* offset (bytes) to start of region */
315 int access_flags;
316 u32 max_segs; /* number of hfi1_segs in all the arrays */
317 u32 mapsz; /* size of the map array */
318 u8 page_shift; /* 0 - non-uniform/non-power-of-2 sizes */
319 u8 lkey_published; /* in global table */
320 struct completion comp; /* complete when refcount goes to zero */
321 atomic_t refcount;
322 struct hfi1_segarray *map[0]; /* the segments */
323};
324
325/*
326 * These keep track of the copy progress within a memory region.
327 * Used by the verbs layer.
328 */
329struct hfi1_sge {
330 struct hfi1_mregion *mr;
331 void *vaddr; /* kernel virtual address of segment */
332 u32 sge_length; /* length of the SGE */
333 u32 length; /* remaining length of the segment */
334 u16 m; /* current index: mr->map[m] */
335 u16 n; /* current index: mr->map[m]->segs[n] */
336};
337
338/* Memory region */
339struct hfi1_mr {
340 struct ib_mr ibmr;
341 struct ib_umem *umem;
342 struct hfi1_mregion mr; /* must be last */
343};
344
345/*
346 * Send work request queue entry.
347 * The size of the sg_list is determined when the QP is created and stored
348 * in qp->s_max_sge.
349 */
350struct hfi1_swqe {
351 struct ib_send_wr wr; /* don't use wr.sg_list */
352 u32 psn; /* first packet sequence number */
353 u32 lpsn; /* last packet sequence number */
354 u32 ssn; /* send sequence number */
355 u32 length; /* total length of data in sg_list */
356 struct hfi1_sge sg_list[0];
357};
358
359/*
360 * Receive work request queue entry.
361 * The size of the sg_list is determined when the QP (or SRQ) is created
362 * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
363 */
364struct hfi1_rwqe {
365 u64 wr_id;
366 u8 num_sge;
367 struct ib_sge sg_list[0];
368};
369
370/*
371 * This structure is used to contain the head pointer, tail pointer,
372 * and receive work queue entries as a single memory allocation so
373 * it can be mmap'ed into user space.
374 * Note that the wq array elements are variable size so you can't
375 * just index into the array to get the N'th element;
376 * use get_rwqe_ptr() instead.
377 */
378struct hfi1_rwq {
379 u32 head; /* new work requests posted to the head */
380 u32 tail; /* receives pull requests from here. */
381 struct hfi1_rwqe wq[0];
382};
383
384struct hfi1_rq {
385 struct hfi1_rwq *wq;
386 u32 size; /* size of RWQE array */
387 u8 max_sge;
388 /* protect changes in this struct */
389 spinlock_t lock ____cacheline_aligned_in_smp;
390};
391
392struct hfi1_srq {
393 struct ib_srq ibsrq;
394 struct hfi1_rq rq;
395 struct hfi1_mmap_info *ip;
396 /* send signal when number of RWQEs < limit */
397 u32 limit;
398};
399
400struct hfi1_sge_state {
401 struct hfi1_sge *sg_list; /* next SGE to be used if any */
402 struct hfi1_sge sge; /* progress state for the current SGE */
403 u32 total_len;
404 u8 num_sge;
405};
406
407/*
408 * This structure holds the information that the send tasklet needs
409 * to send an RDMA read response or atomic operation.
410 */
411struct hfi1_ack_entry {
412 u8 opcode;
413 u8 sent;
414 u32 psn;
415 u32 lpsn;
416 union {
417 struct hfi1_sge rdma_sge;
418 u64 atomic_data;
419 };
420};
421
422/*
423 * Variables prefixed with s_ are for the requester (sender).
424 * Variables prefixed with r_ are for the responder (receiver).
425 * Variables prefixed with ack_ are for responder replies.
426 *
427 * Common variables are protected by both r_rq.lock and s_lock, in that
428 * order, which only happens in modify_qp() or when changing the QP 'state'.
429 */
430struct hfi1_qp {
431 struct ib_qp ibqp;
432 /* read mostly fields above and below */
433 struct ib_ah_attr remote_ah_attr;
434 struct ib_ah_attr alt_ah_attr;
435 struct hfi1_qp __rcu *next; /* link list for QPN hash table */
436 struct hfi1_swqe *s_wq; /* send work queue */
437 struct hfi1_mmap_info *ip;
438 struct ahg_ib_header *s_hdr; /* next packet header to send */
439 u8 s_sc; /* SC[0..4] for next packet */
440 unsigned long timeout_jiffies; /* computed from timeout */
441
442 enum ib_mtu path_mtu;
443 int srate_mbps; /* s_srate (below) converted to Mbit/s */
444 u32 remote_qpn;
445 u32 pmtu; /* decoded from path_mtu */
446 u32 qkey; /* QKEY for this QP (for UD or RD) */
447 u32 s_size; /* send work queue size */
448 u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
449 u32 s_ahgpsn; /* set to the psn in the copy of the header */
450
451 u8 state; /* QP state */
452 u8 allowed_ops; /* high order bits of allowed opcodes */
453 u8 qp_access_flags;
454 u8 alt_timeout; /* Alternate path timeout for this QP */
455 u8 timeout; /* Timeout for this QP */
456 u8 s_srate;
457 u8 s_mig_state;
458 u8 port_num;
459 u8 s_pkey_index; /* PKEY index to use */
460 u8 s_alt_pkey_index; /* Alternate path PKEY index to use */
461 u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
462 u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
463 u8 s_retry_cnt; /* number of times to retry */
464 u8 s_rnr_retry_cnt;
465 u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
466 u8 s_max_sge; /* size of s_wq->sg_list */
467 u8 s_draining;
468
469 /* start of read/write fields */
470 atomic_t refcount ____cacheline_aligned_in_smp;
471 wait_queue_head_t wait;
472
473
474 struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1]
475 ____cacheline_aligned_in_smp;
476 struct hfi1_sge_state s_rdma_read_sge;
477
478 spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */
479 unsigned long r_aflags;
480 u64 r_wr_id; /* ID for current receive WQE */
481 u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
482 u32 r_len; /* total length of r_sge */
483 u32 r_rcv_len; /* receive data len processed */
484 u32 r_psn; /* expected rcv packet sequence number */
485 u32 r_msn; /* message sequence number */
486
487 u8 r_state; /* opcode of last packet received */
488 u8 r_flags;
489 u8 r_head_ack_queue; /* index into s_ack_queue[] */
490
491 struct list_head rspwait; /* link for waiting to respond */
492
493 struct hfi1_sge_state r_sge; /* current receive data */
494 struct hfi1_rq r_rq; /* receive work queue */
495
496 spinlock_t s_lock ____cacheline_aligned_in_smp;
497 struct hfi1_sge_state *s_cur_sge;
498 u32 s_flags;
499 struct hfi1_swqe *s_wqe;
500 struct hfi1_sge_state s_sge; /* current send request data */
501 struct hfi1_mregion *s_rdma_mr;
502 struct sdma_engine *s_sde; /* current sde */
503 u32 s_cur_size; /* size of send packet in bytes */
504 u32 s_len; /* total length of s_sge */
505 u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
506 u32 s_next_psn; /* PSN for next request */
507 u32 s_last_psn; /* last response PSN processed */
508 u32 s_sending_psn; /* lowest PSN that is being sent */
509 u32 s_sending_hpsn; /* highest PSN that is being sent */
510 u32 s_psn; /* current packet sequence number */
511 u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
512 u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
513 u32 s_head; /* new entries added here */
514 u32 s_tail; /* next entry to process */
515 u32 s_cur; /* current work queue entry */
516 u32 s_acked; /* last un-ACK'ed entry */
517 u32 s_last; /* last completed entry */
518 u32 s_ssn; /* SSN of tail entry */
519 u32 s_lsn; /* limit sequence number (credit) */
520 u16 s_hdrwords; /* size of s_hdr in 32 bit words */
521 u16 s_rdma_ack_cnt;
522 s8 s_ahgidx;
523 u8 s_state; /* opcode of last packet sent */
524 u8 s_ack_state; /* opcode of packet to ACK */
525 u8 s_nak_state; /* non-zero if NAK is pending */
526 u8 r_nak_state; /* non-zero if NAK is pending */
527 u8 s_retry; /* requester retry counter */
528 u8 s_rnr_retry; /* requester RNR retry counter */
529 u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
530 u8 s_tail_ack_queue; /* index into s_ack_queue[] */
531
532 struct hfi1_sge_state s_ack_rdma_sge;
533 struct timer_list s_timer;
534
535 struct iowait s_iowait;
536
537 struct hfi1_sge r_sg_list[0] /* verified SGEs */
538 ____cacheline_aligned_in_smp;
539};
540
541/*
542 * Atomic bit definitions for r_aflags.
543 */
544#define HFI1_R_WRID_VALID 0
545#define HFI1_R_REWIND_SGE 1
546
547/*
548 * Bit definitions for r_flags.
549 */
550#define HFI1_R_REUSE_SGE 0x01
551#define HFI1_R_RDMAR_SEQ 0x02
552#define HFI1_R_RSP_NAK 0x04
553#define HFI1_R_RSP_SEND 0x08
554#define HFI1_R_COMM_EST 0x10
555
556/*
557 * Bit definitions for s_flags.
558 *
559 * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
560 * HFI1_S_BUSY - send tasklet is processing the QP
561 * HFI1_S_TIMER - the RC retry timer is active
562 * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
563 * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
564 * before processing the next SWQE
565 * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
566 * before processing the next SWQE
567 * HFI1_S_WAIT_RNR - waiting for RNR timeout
568 * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
569 * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating
570 * next send completion entry not via send DMA
571 * HFI1_S_WAIT_PIO - waiting for a send buffer to be available
572 * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available
573 * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
574 * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available
575 * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
576 * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests
577 * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK
578 * HFI1_S_ECN - a BECN was queued to the send engine
579 */
580#define HFI1_S_SIGNAL_REQ_WR 0x0001
581#define HFI1_S_BUSY 0x0002
582#define HFI1_S_TIMER 0x0004
583#define HFI1_S_RESP_PENDING 0x0008
584#define HFI1_S_ACK_PENDING 0x0010
585#define HFI1_S_WAIT_FENCE 0x0020
586#define HFI1_S_WAIT_RDMAR 0x0040
587#define HFI1_S_WAIT_RNR 0x0080
588#define HFI1_S_WAIT_SSN_CREDIT 0x0100
589#define HFI1_S_WAIT_DMA 0x0200
590#define HFI1_S_WAIT_PIO 0x0400
591#define HFI1_S_WAIT_TX 0x0800
592#define HFI1_S_WAIT_DMA_DESC 0x1000
593#define HFI1_S_WAIT_KMEM 0x2000
594#define HFI1_S_WAIT_PSN 0x4000
595#define HFI1_S_WAIT_ACK 0x8000
596#define HFI1_S_SEND_ONE 0x10000
597#define HFI1_S_UNLIMITED_CREDIT 0x20000
598#define HFI1_S_AHG_VALID 0x40000
599#define HFI1_S_AHG_CLEAR 0x80000
600#define HFI1_S_ECN 0x100000
601
602/*
603 * Wait flags that would prevent any packet type from being sent.
604 */
605#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \
606 HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM)
607
608/*
609 * Wait flags that would prevent send work requests from making progress.
610 */
611#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \
612 HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \
613 HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK)
614
615#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND)
616
617#define HFI1_PSN_CREDIT 16
618
619/*
620 * Since struct hfi1_swqe is not a fixed size, we can't simply index into
621 * struct hfi1_qp.s_wq. This function does the array index computation.
622 */
623static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp,
624 unsigned n)
625{
626 return (struct hfi1_swqe *)((char *)qp->s_wq +
627 (sizeof(struct hfi1_swqe) +
628 qp->s_max_sge *
629 sizeof(struct hfi1_sge)) * n);
630}
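/*
 * Illustrative sizing (example values, not from the original source):
 * with qp->s_max_sge == 4, each slot in s_wq occupies
 * sizeof(struct hfi1_swqe) + 4 * sizeof(struct hfi1_sge) bytes, so
 * entry n starts at that per-entry stride multiplied by n.
 */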
631
632/*
633 * Since struct hfi1_rwqe is not a fixed size, we can't simply index into
634 * struct hfi1_rwq.wq. This function does the array index computation.
635 */
636static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n)
637{
638 return (struct hfi1_rwqe *)
639 ((char *) rq->wq->wq +
640 (sizeof(struct hfi1_rwqe) +
641 rq->max_sge * sizeof(struct ib_sge)) * n);
642}
643
644#define MAX_LKEY_TABLE_BITS 23
645
646struct hfi1_lkey_table {
647 spinlock_t lock; /* protect changes in this struct */
648 u32 next; /* next unused index (speeds search) */
649 u32 gen; /* generation count */
650 u32 max; /* size of the table */
651 struct hfi1_mregion __rcu **table;
652};
653
654struct hfi1_opcode_stats {
655 u64 n_packets; /* number of packets */
656 u64 n_bytes; /* total number of bytes */
657};
658
659struct hfi1_opcode_stats_perctx {
660 struct hfi1_opcode_stats stats[256];
661};
662
663static inline void inc_opstats(
664 u32 tlen,
665 struct hfi1_opcode_stats *stats)
666{
667#ifdef CONFIG_DEBUG_FS
668 stats->n_bytes += tlen;
669 stats->n_packets++;
670#endif
671}
672
673struct hfi1_ibport {
674 struct hfi1_qp __rcu *qp[2];
675 struct ib_mad_agent *send_agent; /* agent for SMI (traps) */
676 struct hfi1_ah *sm_ah;
677 struct hfi1_ah *smi_ah;
678 struct rb_root mcast_tree;
679 spinlock_t lock; /* protect changes in this struct */
680
681 /* non-zero when timer is set */
682 unsigned long mkey_lease_timeout;
683 unsigned long trap_timeout;
684 __be64 gid_prefix; /* in network order */
685 __be64 mkey;
686 __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */
687 u64 tid; /* TID for traps */
688 u64 n_rc_resends;
689 u64 n_seq_naks;
690 u64 n_rdma_seq;
691 u64 n_rnr_naks;
692 u64 n_other_naks;
693 u64 n_loop_pkts;
694 u64 n_pkt_drops;
695 u64 n_vl15_dropped;
696 u64 n_rc_timeouts;
697 u64 n_dmawait;
698 u64 n_unaligned;
699 u64 n_rc_dupreq;
700 u64 n_rc_seqnak;
701
702 /* Hot-path per CPU counters to avoid cacheline trading to update */
703 u64 z_rc_acks;
704 u64 z_rc_qacks;
705 u64 z_rc_delayed_comp;
706 u64 __percpu *rc_acks;
707 u64 __percpu *rc_qacks;
708 u64 __percpu *rc_delayed_comp;
709
710 u32 port_cap_flags;
711 u32 pma_sample_start;
712 u32 pma_sample_interval;
713 __be16 pma_counter_select[5];
714 u16 pma_tag;
715 u16 pkey_violations;
716 u16 qkey_violations;
717 u16 mkey_violations;
718 u16 mkey_lease_period;
719 u16 sm_lid;
720 u16 repress_traps;
721 u8 sm_sl;
722 u8 mkeyprot;
723 u8 subnet_timeout;
724 u8 vl_high_limit;
725 /* the first 16 entries are sl_to_vl for !OPA */
726 u8 sl_to_sc[32];
727 u8 sc_to_sl[32];
728};
729
730
731struct hfi1_qp_ibdev;
732struct hfi1_ibdev {
733 struct ib_device ibdev;
734 struct list_head pending_mmaps;
735 spinlock_t mmap_offset_lock; /* protect mmap_offset */
736 u32 mmap_offset;
737 struct hfi1_mregion __rcu *dma_mr;
738
739 struct hfi1_qp_ibdev *qp_dev;
740
741 /* QP numbers are shared by all IB ports */
742 struct hfi1_lkey_table lk_table;
743 /* protect wait lists */
744 seqlock_t iowait_lock;
745 struct list_head txwait; /* list for wait verbs_txreq */
746 struct list_head memwait; /* list for wait kernel memory */
747 struct list_head txreq_free;
748 struct kmem_cache *verbs_txreq_cache;
749 struct timer_list mem_timer;
750
751 /* other waiters */
752 spinlock_t pending_lock;
753
754 u64 n_piowait;
755 u64 n_txwait;
756 u64 n_kmem_wait;
757
758 u32 n_pds_allocated; /* number of PDs allocated for device */
759 spinlock_t n_pds_lock;
760 u32 n_ahs_allocated; /* number of AHs allocated for device */
761 spinlock_t n_ahs_lock;
762 u32 n_cqs_allocated; /* number of CQs allocated for device */
763 spinlock_t n_cqs_lock;
764 u32 n_qps_allocated; /* number of QPs allocated for device */
765 spinlock_t n_qps_lock;
766 u32 n_srqs_allocated; /* number of SRQs allocated for device */
767 spinlock_t n_srqs_lock;
768 u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
769 spinlock_t n_mcast_grps_lock;
770#ifdef CONFIG_DEBUG_FS
771 /* per HFI debugfs */
772 struct dentry *hfi1_ibdev_dbg;
773 /* per HFI symlinks to above */
774 struct dentry *hfi1_ibdev_link;
775#endif
776};
777
778struct hfi1_verbs_counters {
779 u64 symbol_error_counter;
780 u64 link_error_recovery_counter;
781 u64 link_downed_counter;
782 u64 port_rcv_errors;
783 u64 port_rcv_remphys_errors;
784 u64 port_xmit_discards;
785 u64 port_xmit_data;
786 u64 port_rcv_data;
787 u64 port_xmit_packets;
788 u64 port_rcv_packets;
789 u32 local_link_integrity_errors;
790 u32 excessive_buffer_overrun_errors;
791 u32 vl15_dropped;
792};
793
794static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr)
795{
796 return container_of(ibmr, struct hfi1_mr, ibmr);
797}
798
799static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd)
800{
801 return container_of(ibpd, struct hfi1_pd, ibpd);
802}
803
804static inline struct hfi1_ah *to_iah(struct ib_ah *ibah)
805{
806 return container_of(ibah, struct hfi1_ah, ibah);
807}
808
809static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq)
810{
811 return container_of(ibcq, struct hfi1_cq, ibcq);
812}
813
814static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq)
815{
816 return container_of(ibsrq, struct hfi1_srq, ibsrq);
817}
818
819static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp)
820{
821 return container_of(ibqp, struct hfi1_qp, ibqp);
822}
823
824static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
825{
826 return container_of(ibdev, struct hfi1_ibdev, ibdev);
827}
828
829/*
830 * Send if not busy or waiting for I/O and either
831 * an RC response is pending or we can process send work requests.
832 */
833static inline int hfi1_send_ok(struct hfi1_qp *qp)
834{
835 return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
836 (qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) ||
837 !(qp->s_flags & HFI1_S_ANY_WAIT_SEND));
838}
839
840/*
841 * This must be called with s_lock held.
842 */
843void hfi1_schedule_send(struct hfi1_qp *qp);
844void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
845 u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
846void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);
847void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
848void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
849int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
850 const struct ib_wc *in_wc, const struct ib_grh *in_grh,
851 const struct ib_mad_hdr *in_mad, size_t in_mad_size,
852 struct ib_mad_hdr *out_mad, size_t *out_mad_size,
853 u16 *out_mad_pkey_index);
854int hfi1_create_agents(struct hfi1_ibdev *dev);
855void hfi1_free_agents(struct hfi1_ibdev *dev);
856
857/*
858 * The PSN_MASK and PSN_SHIFT allow for
859 * 1) comparing two PSNs
860 * 2) returning the PSN with any upper bits masked
861 * 3) returning the difference between two PSNs
862 *
863 * The number of significant bits in the PSN must
864 * necessarily be at least one bit less than
865 * the container holding the PSN.
866 */
867#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
868#define PSN_MASK 0xFFFFFF
869#define PSN_SHIFT 8
870#else
871#define PSN_MASK 0x7FFFFFFF
872#define PSN_SHIFT 1
873#endif
874#define PSN_MODIFY_MASK 0xFFFFFF
875
876/* Number of bits to pay attention to in the opcode for checking qp type */
877#define OPCODE_QP_MASK 0xE0
878
879/*
880 * Compare the lower 24 bits of the msn values.
881 * Returns an integer less than, equal to, or greater than zero.
882 */
883static inline int cmp_msn(u32 a, u32 b)
884{
885 return (((int) a) - ((int) b)) << 8;
886}
887
888/*
889 * Compare two PSNs
890 * Returns an integer less than, equal to, or greater than zero.
891 */
892static inline int cmp_psn(u32 a, u32 b)
893{
894 return (((int) a) - ((int) b)) << PSN_SHIFT;
895}
896
897/*
898 * Return masked PSN
899 */
900static inline u32 mask_psn(u32 a)
901{
902 return a & PSN_MASK;
903}
904
905/*
906 * Return delta between two PSNs
907 */
908static inline u32 delta_psn(u32 a, u32 b)
909{
910 return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
911}
912
913struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid);
914
915int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
916
917int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
918
919int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp);
920
921struct verbs_txreq;
922void hfi1_put_txreq(struct verbs_txreq *tx);
923
924int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
925 u32 hdrwords, struct hfi1_sge_state *ss, u32 len);
926
927void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length,
928 int release);
929
930void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release);
931
932void hfi1_cnp_rcv(struct hfi1_packet *packet);
933
934void hfi1_uc_rcv(struct hfi1_packet *packet);
935
936void hfi1_rc_rcv(struct hfi1_packet *packet);
937
938void hfi1_rc_hdrerr(
939 struct hfi1_ctxtdata *rcd,
940 struct hfi1_ib_header *hdr,
941 u32 rcv_flags,
942 struct hfi1_qp *qp);
943
944u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
945
946int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
947
948struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
949
950void hfi1_rc_rnr_retry(unsigned long arg);
951
952void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr);
953
954void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err);
955
956void hfi1_ud_rcv(struct hfi1_packet *packet);
957
958int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
959
960int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region);
961
962void hfi1_free_lkey(struct hfi1_mregion *mr);
963
964int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
965 struct hfi1_sge *isge, struct ib_sge *sge, int acc);
966
967int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
968 u32 len, u64 vaddr, u32 rkey, int acc);
969
970int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
971 struct ib_recv_wr **bad_wr);
972
973struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
974 struct ib_srq_init_attr *srq_init_attr,
975 struct ib_udata *udata);
976
977int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
978 enum ib_srq_attr_mask attr_mask,
979 struct ib_udata *udata);
980
981int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
982
983int hfi1_destroy_srq(struct ib_srq *ibsrq);
984
985int hfi1_cq_init(struct hfi1_devdata *dd);
986
987void hfi1_cq_exit(struct hfi1_devdata *dd);
988
989void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig);
990
991int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
992
993struct ib_cq *hfi1_create_cq(
994 struct ib_device *ibdev,
995 const struct ib_cq_init_attr *attr,
996 struct ib_ucontext *context,
997 struct ib_udata *udata);
998
999int hfi1_destroy_cq(struct ib_cq *ibcq);
1000
1001int hfi1_req_notify_cq(
1002 struct ib_cq *ibcq,
1003 enum ib_cq_notify_flags notify_flags);
1004
1005int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
1006
1007struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
1008
1009struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
1010 struct ib_phys_buf *buffer_list,
1011 int num_phys_buf, int acc, u64 *iova_start);
1012
1013struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1014 u64 virt_addr, int mr_access_flags,
1015 struct ib_udata *udata);
1016
1017int hfi1_dereg_mr(struct ib_mr *ibmr);
1018
1019struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
1020 enum ib_mr_type mr_type,
1021 u32 max_entries);
1022
1023struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list(
1024 struct ib_device *ibdev, int page_list_len);
1025
1026void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
1027
1028int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr);
1029
1030struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
1031 struct ib_fmr_attr *fmr_attr);
1032
1033int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
1034 int list_len, u64 iova);
1035
1036int hfi1_unmap_fmr(struct list_head *fmr_list);
1037
1038int hfi1_dealloc_fmr(struct ib_fmr *ibfmr);
1039
1040static inline void hfi1_get_mr(struct hfi1_mregion *mr)
1041{
1042 atomic_inc(&mr->refcount);
1043}
1044
1045static inline void hfi1_put_mr(struct hfi1_mregion *mr)
1046{
1047 if (unlikely(atomic_dec_and_test(&mr->refcount)))
1048 complete(&mr->comp);
1049}
1050
1051static inline void hfi1_put_ss(struct hfi1_sge_state *ss)
1052{
1053 while (ss->num_sge) {
1054 hfi1_put_mr(ss->sge.mr);
1055 if (--ss->num_sge)
1056 ss->sge = *ss->sg_list++;
1057 }
1058}
1059
1060void hfi1_release_mmap_info(struct kref *ref);
1061
1062struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size,
1063 struct ib_ucontext *context,
1064 void *obj);
1065
1066void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
1067 u32 size, void *obj);
1068
1069int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
1070
1071int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only);
1072
1073void hfi1_migrate_qp(struct hfi1_qp *qp);
1074
1075int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
1076 int has_grh, struct hfi1_qp *qp, u32 bth0);
1077
1078u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
1079 struct ib_global_route *grh, u32 hwords, u32 nwords);
1080
1081void clear_ahg(struct hfi1_qp *qp);
1082
1083void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
1084 u32 bth0, u32 bth2, int middle);
1085
1086void hfi1_do_send(struct work_struct *work);
1087
1088void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
1089 enum ib_wc_status status);
1090
1091void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn);
1092
1093int hfi1_make_rc_req(struct hfi1_qp *qp);
1094
1095int hfi1_make_uc_req(struct hfi1_qp *qp);
1096
1097int hfi1_make_ud_req(struct hfi1_qp *qp);
1098
1099int hfi1_register_ib_device(struct hfi1_devdata *);
1100
1101void hfi1_unregister_ib_device(struct hfi1_devdata *);
1102
1103void hfi1_ib_rcv(struct hfi1_packet *packet);
1104
1105unsigned hfi1_get_npkeys(struct hfi1_devdata *);
1106
1107int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
1108 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1109 u32 plen, u32 dwords, u64 pbc);
1110
1111int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
1112 u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
1113 u32 plen, u32 dwords, u64 pbc);
1114
1115struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5);
1116
1117extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
1118
1119extern const u8 hdr_len_by_opcode[];
1120
1121extern const int ib_hfi1_state_ops[];
1122
1123extern __be64 ib_hfi1_sys_image_guid; /* in network order */
1124
1125extern unsigned int hfi1_lkey_table_size;
1126
1127extern unsigned int hfi1_max_cqes;
1128
1129extern unsigned int hfi1_max_cqs;
1130
1131extern unsigned int hfi1_max_qp_wrs;
1132
1133extern unsigned int hfi1_max_qps;
1134
1135extern unsigned int hfi1_max_sges;
1136
1137extern unsigned int hfi1_max_mcast_grps;
1138
1139extern unsigned int hfi1_max_mcast_qp_attached;
1140
1141extern unsigned int hfi1_max_srqs;
1142
1143extern unsigned int hfi1_max_srq_sges;
1144
1145extern unsigned int hfi1_max_srq_wrs;
1146
1147extern const u32 ib_hfi1_rnr_table[];
1148
1149extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops;
1150
1151#endif /* HFI1_VERBS_H */
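The PSN helpers above shift the sequence number so that its most significant bit lands in the sign bit, which makes comparisons and deltas behave correctly across the 24-bit wrap (or 31-bit wrap with CONFIG_HFI1_VERBS_31BIT_PSN). A minimal user-space sketch of the same arithmetic, assuming the default 24-bit configuration; like the kernel code it relies on two's-complement wraparound:

#include <assert.h>
#include <stdint.h>

#define EX_PSN_MASK  0xFFFFFF
#define EX_PSN_SHIFT 8

static int ex_cmp_psn(uint32_t a, uint32_t b)        /* mirrors cmp_psn() */
{
        return ((int)a - (int)b) << EX_PSN_SHIFT;
}

static uint32_t ex_delta_psn(uint32_t a, uint32_t b) /* mirrors delta_psn() */
{
        return (((int)a - (int)b) << EX_PSN_SHIFT) >> EX_PSN_SHIFT;
}

int main(void)
{
        /* PSN 0x000001 follows 0xFFFFFE even though it is numerically smaller. */
        assert(ex_cmp_psn(0x000001, 0xFFFFFE) > 0);
        assert(ex_delta_psn(0x000001, 0xFFFFFE) == 3);
        assert((0x12345678u & EX_PSN_MASK) == 0x345678); /* mask_psn() behaviour */
        return 0;
}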
diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/staging/rdma/hfi1/verbs_mcast.c
new file mode 100644
index 000000000000..afc6b4c61a1d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs_mcast.c
@@ -0,0 +1,385 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51#include <linux/rculist.h>
52
53#include "hfi.h"
54
55/**
56 * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
57 * @qp: the QP to link
58 */
59static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp)
60{
61 struct hfi1_mcast_qp *mqp;
62
63 mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
64 if (!mqp)
65 goto bail;
66
67 mqp->qp = qp;
68 atomic_inc(&qp->refcount);
69
70bail:
71 return mqp;
72}
73
74static void mcast_qp_free(struct hfi1_mcast_qp *mqp)
75{
76 struct hfi1_qp *qp = mqp->qp;
77
78 /* Notify hfi1_destroy_qp() if it is waiting. */
79 if (atomic_dec_and_test(&qp->refcount))
80 wake_up(&qp->wait);
81
82 kfree(mqp);
83}
84
85/**
86 * mcast_alloc - allocate the multicast GID structure
87 * @mgid: the multicast GID
88 *
89 * A list of QPs will be attached to this structure.
90 */
91static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid)
92{
93 struct hfi1_mcast *mcast;
94
95 mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
96 if (!mcast)
97 goto bail;
98
99 mcast->mgid = *mgid;
100 INIT_LIST_HEAD(&mcast->qp_list);
101 init_waitqueue_head(&mcast->wait);
102 atomic_set(&mcast->refcount, 0);
103 mcast->n_attached = 0;
104
105bail:
106 return mcast;
107}
108
109static void mcast_free(struct hfi1_mcast *mcast)
110{
111 struct hfi1_mcast_qp *p, *tmp;
112
113 list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
114 mcast_qp_free(p);
115
116 kfree(mcast);
117}
118
119/**
120 * hfi1_mcast_find - search the global table for the given multicast GID
121 * @ibp: the IB port structure
122 * @mgid: the multicast GID to search for
123 *
124 * Returns NULL if not found.
125 *
126 * The caller is responsible for decrementing the reference count if found.
127 */
128struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid)
129{
130 struct rb_node *n;
131 unsigned long flags;
132 struct hfi1_mcast *mcast;
133
134 spin_lock_irqsave(&ibp->lock, flags);
135 n = ibp->mcast_tree.rb_node;
136 while (n) {
137 int ret;
138
139 mcast = rb_entry(n, struct hfi1_mcast, rb_node);
140
141 ret = memcmp(mgid->raw, mcast->mgid.raw,
142 sizeof(union ib_gid));
143 if (ret < 0)
144 n = n->rb_left;
145 else if (ret > 0)
146 n = n->rb_right;
147 else {
148 atomic_inc(&mcast->refcount);
149 spin_unlock_irqrestore(&ibp->lock, flags);
150 goto bail;
151 }
152 }
153 spin_unlock_irqrestore(&ibp->lock, flags);
154
155 mcast = NULL;
156
157bail:
158 return mcast;
159}
160
161/**
162 * mcast_add - insert mcast GID into table and attach QP struct
163 * @mcast: the mcast GID structure to insert
164 * @mqp: the QP to attach
165 *
166 * Return zero if both were added. Return EEXIST if the GID was already in
167 * the table but the QP was added, ESRCH if the QP was already attached
168 * (so neither structure was added), or ENOMEM if a QP or group limit was hit.
169 */
170static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp,
171 struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp)
172{
173 struct rb_node **n = &ibp->mcast_tree.rb_node;
174 struct rb_node *pn = NULL;
175 int ret;
176
177 spin_lock_irq(&ibp->lock);
178
179 while (*n) {
180 struct hfi1_mcast *tmcast;
181 struct hfi1_mcast_qp *p;
182
183 pn = *n;
184 tmcast = rb_entry(pn, struct hfi1_mcast, rb_node);
185
186 ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
187 sizeof(union ib_gid));
188 if (ret < 0) {
189 n = &pn->rb_left;
190 continue;
191 }
192 if (ret > 0) {
193 n = &pn->rb_right;
194 continue;
195 }
196
197 /* Search the QP list to see if this QP is already attached. */
198 list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
199 if (p->qp == mqp->qp) {
200 ret = ESRCH;
201 goto bail;
202 }
203 }
204 if (tmcast->n_attached == hfi1_max_mcast_qp_attached) {
205 ret = ENOMEM;
206 goto bail;
207 }
208
209 tmcast->n_attached++;
210
211 list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
212 ret = EEXIST;
213 goto bail;
214 }
215
216 spin_lock(&dev->n_mcast_grps_lock);
217 if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) {
218 spin_unlock(&dev->n_mcast_grps_lock);
219 ret = ENOMEM;
220 goto bail;
221 }
222
223 dev->n_mcast_grps_allocated++;
224 spin_unlock(&dev->n_mcast_grps_lock);
225
226 mcast->n_attached++;
227
228 list_add_tail_rcu(&mqp->list, &mcast->qp_list);
229
230 atomic_inc(&mcast->refcount);
231 rb_link_node(&mcast->rb_node, pn, n);
232 rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
233
234 ret = 0;
235
236bail:
237 spin_unlock_irq(&ibp->lock);
238
239 return ret;
240}
241
242int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
243{
244 struct hfi1_qp *qp = to_iqp(ibqp);
245 struct hfi1_ibdev *dev = to_idev(ibqp->device);
246 struct hfi1_ibport *ibp;
247 struct hfi1_mcast *mcast;
248 struct hfi1_mcast_qp *mqp;
249 int ret;
250
251 if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
252 ret = -EINVAL;
253 goto bail;
254 }
255
256 /*
257 * Allocate data structures since it's better to do this outside of
258 * spin locks and it will most likely be needed.
259 */
260 mcast = mcast_alloc(gid);
261 if (mcast == NULL) {
262 ret = -ENOMEM;
263 goto bail;
264 }
265 mqp = mcast_qp_alloc(qp);
266 if (mqp == NULL) {
267 mcast_free(mcast);
268 ret = -ENOMEM;
269 goto bail;
270 }
271 ibp = to_iport(ibqp->device, qp->port_num);
272 switch (mcast_add(dev, ibp, mcast, mqp)) {
273 case ESRCH:
274 /* Neither was used: OK to attach the same QP twice. */
275 mcast_qp_free(mqp);
276 mcast_free(mcast);
277 break;
278
279 case EEXIST: /* The mcast wasn't used */
280 mcast_free(mcast);
281 break;
282
283 case ENOMEM:
284 /* Exceeded the maximum number of mcast groups. */
285 mcast_qp_free(mqp);
286 mcast_free(mcast);
287 ret = -ENOMEM;
288 goto bail;
289
290 default:
291 break;
292 }
293
294 ret = 0;
295
296bail:
297 return ret;
298}
299
300int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
301{
302 struct hfi1_qp *qp = to_iqp(ibqp);
303 struct hfi1_ibdev *dev = to_idev(ibqp->device);
304 struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num);
305 struct hfi1_mcast *mcast = NULL;
306 struct hfi1_mcast_qp *p, *tmp;
307 struct rb_node *n;
308 int last = 0;
309 int ret;
310
311 if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
312 ret = -EINVAL;
313 goto bail;
314 }
315
316 spin_lock_irq(&ibp->lock);
317
318 /* Find the GID in the mcast table. */
319 n = ibp->mcast_tree.rb_node;
320 while (1) {
321 if (n == NULL) {
322 spin_unlock_irq(&ibp->lock);
323 ret = -EINVAL;
324 goto bail;
325 }
326
327 mcast = rb_entry(n, struct hfi1_mcast, rb_node);
328 ret = memcmp(gid->raw, mcast->mgid.raw,
329 sizeof(union ib_gid));
330 if (ret < 0)
331 n = n->rb_left;
332 else if (ret > 0)
333 n = n->rb_right;
334 else
335 break;
336 }
337
338 /* Search the QP list. */
339 list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
340 if (p->qp != qp)
341 continue;
342 /*
343 * We found it, so remove it, but don't poison the forward
344 * link until we are sure there are no list walkers.
345 */
346 list_del_rcu(&p->list);
347 mcast->n_attached--;
348
349 /* If this was the last attached QP, remove the GID too. */
350 if (list_empty(&mcast->qp_list)) {
351 rb_erase(&mcast->rb_node, &ibp->mcast_tree);
352 last = 1;
353 }
354 break;
355 }
356
357 spin_unlock_irq(&ibp->lock);
358
359 if (p) {
360 /*
361 * Wait for any list walkers to finish before freeing the
362 * list element.
363 */
364 wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
365 mcast_qp_free(p);
366 }
367 if (last) {
368 atomic_dec(&mcast->refcount);
369 wait_event(mcast->wait, !atomic_read(&mcast->refcount));
370 mcast_free(mcast);
371 spin_lock_irq(&dev->n_mcast_grps_lock);
372 dev->n_mcast_grps_allocated--;
373 spin_unlock_irq(&dev->n_mcast_grps_lock);
374 }
375
376 ret = 0;
377
378bail:
379 return ret;
380}
381
382int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp)
383{
384 return ibp->mcast_tree.rb_node == NULL;
385}
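hfi1_mcast_find() and mcast_add() above both key the per-port red-black tree on a raw memcmp() of the 16-byte multicast GID. A stand-alone sketch of that comparator; the GID values are made up for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16 raw bytes, matching the layout of union ib_gid used as the tree key. */
struct example_mgid { uint8_t raw[16]; };

static int mgid_cmp(const struct example_mgid *a, const struct example_mgid *b)
{
        return memcmp(a->raw, b->raw, sizeof(a->raw)); /* same ordering as the rb-tree walk */
}

int main(void)
{
        struct example_mgid a = { .raw = { 0xff, 0x12, 0x40, 0x1b } };
        struct example_mgid b = { .raw = { 0xff, 0x12, 0x60, 0x1b } };

        printf("mgid_cmp = %d\n", mgid_cmp(&a, &b)); /* negative: a sorts before b */
        return 0;
}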
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig
index 8fe54ff00580..041ce0634968 100644
--- a/drivers/infiniband/hw/ipath/Kconfig
+++ b/drivers/staging/rdma/ipath/Kconfig
@@ -2,7 +2,7 @@ config INFINIBAND_IPATH
2 tristate "QLogic HTX HCA support" 2 tristate "QLogic HTX HCA support"
3 depends on 64BIT && NET && HT_IRQ 3 depends on 64BIT && NET && HT_IRQ
4 ---help--- 4 ---help---
5 This is a driver for the obsolete QLogic Hyper-Transport 5 This is a driver for the deprecated QLogic Hyper-Transport
6 IB host channel adapter (model QHT7140), 6 IB host channel adapter (model QHT7140),
7 including InfiniBand verbs support. This driver allows these 7 including InfiniBand verbs support. This driver allows these
8 devices to be used with both kernel upper level protocols such 8 devices to be used with both kernel upper level protocols such
@@ -12,3 +12,5 @@ config INFINIBAND_IPATH
12 12
13 If you have this hardware you will need to boot with PAT disabled 13 If you have this hardware you will need to boot with PAT disabled
14 on your x86-64 systems, use the nopat kernel parameter. 14 on your x86-64 systems, use the nopat kernel parameter.
15
16 Note that this driver will soon be removed entirely from the kernel.
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile
index 4496f2820c92..4496f2820c92 100644
--- a/drivers/infiniband/hw/ipath/Makefile
+++ b/drivers/staging/rdma/ipath/Makefile
diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO
new file mode 100644
index 000000000000..cb00158d64c8
--- /dev/null
+++ b/drivers/staging/rdma/ipath/TODO
@@ -0,0 +1,5 @@
1The ipath driver has been moved to staging in preparation for its removal in a
2few releases. The driver will be deleted during the 4.6 merge window.
3
4Contact Dennis Dalessandro <dennis.dalessandro@intel.com> and
5Cc: linux-rdma@vger.kernel.org
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h
index 28cfe97cf1e9..28cfe97cf1e9 100644
--- a/drivers/infiniband/hw/ipath/ipath_common.h
+++ b/drivers/staging/rdma/ipath/ipath_common.h
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c
index e9dd9112e718..e9dd9112e718 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/staging/rdma/ipath/ipath_cq.c
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h
index 65926cd35759..65926cd35759 100644
--- a/drivers/infiniband/hw/ipath/ipath_debug.h
+++ b/drivers/staging/rdma/ipath/ipath_debug.h
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c
index 45802e97332e..45802e97332e 100644
--- a/drivers/infiniband/hw/ipath/ipath_diag.c
+++ b/drivers/staging/rdma/ipath/ipath_diag.c
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c
index 123a8c053539..123a8c053539 100644
--- a/drivers/infiniband/hw/ipath/ipath_dma.c
+++ b/drivers/staging/rdma/ipath/ipath_dma.c
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c
index 871dbe56216a..871dbe56216a 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/staging/rdma/ipath/ipath_driver.c
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c
index fc7181985e8e..fc7181985e8e 100644
--- a/drivers/infiniband/hw/ipath/ipath_eeprom.c
+++ b/drivers/staging/rdma/ipath/ipath_eeprom.c
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c
index 450d15965005..450d15965005 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/staging/rdma/ipath/ipath_file_ops.c
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c
index 25422a3a7238..25422a3a7238 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/staging/rdma/ipath/ipath_fs.c
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c
index 7cc305488a3d..7cc305488a3d 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ b/drivers/staging/rdma/ipath/ipath_iba6110.c
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c
index be2a60e142b0..be2a60e142b0 100644
--- a/drivers/infiniband/hw/ipath/ipath_init_chip.c
+++ b/drivers/staging/rdma/ipath/ipath_init_chip.c
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c
index 01ba792791a0..01ba792791a0 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/staging/rdma/ipath/ipath_intr.c
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h
index f0f947122779..f0f947122779 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/staging/rdma/ipath/ipath_kernel.h
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c
index c0e933fec218..c0e933fec218 100644
--- a/drivers/infiniband/hw/ipath/ipath_keys.c
+++ b/drivers/staging/rdma/ipath/ipath_keys.c
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c
index ad3a926ab3c5..ad3a926ab3c5 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/staging/rdma/ipath/ipath_mad.c
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c
index e73274229404..e73274229404 100644
--- a/drivers/infiniband/hw/ipath/ipath_mmap.c
+++ b/drivers/staging/rdma/ipath/ipath_mmap.c
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
index c7278f6a8217..c7278f6a8217 100644
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ b/drivers/staging/rdma/ipath/ipath_mr.c
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c
index face87602dc1..face87602dc1 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/staging/rdma/ipath/ipath_qp.c
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c
index 79b3dbc97179..79b3dbc97179 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/staging/rdma/ipath/ipath_rc.c
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h
index 8f44d0cf3833..8f44d0cf3833 100644
--- a/drivers/infiniband/hw/ipath/ipath_registers.h
+++ b/drivers/staging/rdma/ipath/ipath_registers.h
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c
index 1f95bbaf7602..1f95bbaf7602 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/staging/rdma/ipath/ipath_ruc.c
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c
index 17a517766ad2..17a517766ad2 100644
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ b/drivers/staging/rdma/ipath/ipath_sdma.c
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c
index 26271984b717..26271984b717 100644
--- a/drivers/infiniband/hw/ipath/ipath_srq.c
+++ b/drivers/staging/rdma/ipath/ipath_srq.c
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c
index f63e143e3292..f63e143e3292 100644
--- a/drivers/infiniband/hw/ipath/ipath_stats.c
+++ b/drivers/staging/rdma/ipath/ipath_stats.c
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c
index 75558f33f1cb..75558f33f1cb 100644
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ b/drivers/staging/rdma/ipath/ipath_sysfs.c
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c
index 22e60998f1a7..22e60998f1a7 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/staging/rdma/ipath/ipath_uc.c
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c
index e8a2a915251e..e8a2a915251e 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/staging/rdma/ipath/ipath_ud.c
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c
index 1da1252dcdb3..1da1252dcdb3 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/staging/rdma/ipath/ipath_user_pages.c
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c
index cc04b7ba3488..cc04b7ba3488 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.c
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h
index fc76316c4a58..fc76316c4a58 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.h
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.h
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
index 30ba49c4a98c..ed2bbc2f7eae 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/staging/rdma/ipath/ipath_verbs.c
@@ -1521,6 +1521,7 @@ static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
1521 props->max_qp = ib_ipath_max_qps; 1521 props->max_qp = ib_ipath_max_qps;
1522 props->max_qp_wr = ib_ipath_max_qp_wrs; 1522 props->max_qp_wr = ib_ipath_max_qp_wrs;
1523 props->max_sge = ib_ipath_max_sges; 1523 props->max_sge = ib_ipath_max_sges;
1524 props->max_sge_rd = ib_ipath_max_sges;
1524 props->max_cq = ib_ipath_max_cqs; 1525 props->max_cq = ib_ipath_max_cqs;
1525 props->max_ah = ib_ipath_max_ahs; 1526 props->max_ah = ib_ipath_max_ahs;
1526 props->max_cqe = ib_ipath_max_cqes; 1527 props->max_cqe = ib_ipath_max_cqes;
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
index ec167e545e15..ec167e545e15 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/staging/rdma/ipath/ipath_verbs.h
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
index 6216ea923853..6216ea923853 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
index 1a7e20a75149..1a7e20a75149 100644
--- a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
+++ b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
index 7b6e4c843e19..7b6e4c843e19 100644
--- a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
+++ b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index bcbf8c72a77b..baad4cb8e9b0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -79,7 +79,8 @@ enum {
79 79
80enum { 80enum {
81 MLX4_MAX_PORTS = 2, 81 MLX4_MAX_PORTS = 2,
82 MLX4_MAX_PORT_PKEYS = 128 82 MLX4_MAX_PORT_PKEYS = 128,
83 MLX4_MAX_PORT_GIDS = 128
83}; 84};
84 85
85/* base qkey for use in sriov tunnel-qp/proxy-qp communication. 86/* base qkey for use in sriov tunnel-qp/proxy-qp communication.
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 9553a73d2049..5a06d969338e 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -59,6 +59,7 @@ struct mlx4_interface {
59 void (*event) (struct mlx4_dev *dev, void *context, 59 void (*event) (struct mlx4_dev *dev, void *context,
60 enum mlx4_dev_event event, unsigned long param); 60 enum mlx4_dev_event event, unsigned long param);
61 void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); 61 void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
62 void (*activate)(struct mlx4_dev *dev, void *context);
62 struct list_head list; 63 struct list_head list;
63 enum mlx4_protocol protocol; 64 enum mlx4_protocol protocol;
64 int flags; 65 int flags;
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 250b1ff8b48d..8eb3b19af2a4 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -402,6 +402,17 @@ struct mlx5_cmd_teardown_hca_mbox_out {
402 u8 rsvd[8]; 402 u8 rsvd[8];
403}; 403};
404 404
405struct mlx5_cmd_query_special_contexts_mbox_in {
406 struct mlx5_inbox_hdr hdr;
407 u8 rsvd[8];
408};
409
410struct mlx5_cmd_query_special_contexts_mbox_out {
411 struct mlx5_outbox_hdr hdr;
412 __be32 dump_fill_mkey;
413 __be32 resd_lkey;
414};
415
405struct mlx5_cmd_layout { 416struct mlx5_cmd_layout {
406 u8 type; 417 u8 type;
407 u8 rsvd0[3]; 418 u8 rsvd0[3];
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8b6d6f2154a4..27b53f9a24ad 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -845,6 +845,7 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
845int mlx5_register_interface(struct mlx5_interface *intf); 845int mlx5_register_interface(struct mlx5_interface *intf);
846void mlx5_unregister_interface(struct mlx5_interface *intf); 846void mlx5_unregister_interface(struct mlx5_interface *intf);
847int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); 847int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
848int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey);
848 849
849struct mlx5_profile { 850struct mlx5_profile {
850 u64 mask; 851 u64 mask;
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d5ee6d8b7c58..7ccc961f33e9 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -132,6 +132,7 @@ struct svcxprt_rdma {
132 struct list_head sc_accept_q; /* Conn. waiting accept */ 132 struct list_head sc_accept_q; /* Conn. waiting accept */
133 int sc_ord; /* RDMA read limit */ 133 int sc_ord; /* RDMA read limit */
134 int sc_max_sge; 134 int sc_max_sge;
135 int sc_max_sge_rd; /* max sge for read target */
135 136
136 int sc_sq_depth; /* Depth of SQ */ 137 int sc_sq_depth; /* Depth of SQ */
137 atomic_t sc_sq_count; /* Number of SQ WR on queue */ 138 atomic_t sc_sq_count; /* Number of SQ WR on queue */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 0c3ac5acb85f..b5474b1fcd83 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -91,6 +91,37 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2);
91void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); 91void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
92void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); 92void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
93 93
94static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
95{
96 if (dev->addr_len != ETH_ALEN)
97 return -1;
98 memcpy(eui, dev->dev_addr, 3);
99 memcpy(eui + 5, dev->dev_addr + 3, 3);
100
101 /*
102 * The zSeries OSA network cards can be shared among various
103 * OS instances, but the OSA cards have only one MAC address.
104 * This leads to duplicate address conflicts in conjunction
105 * with IPv6 if more than one instance uses the same card.
106 *
107 * The driver for these cards can deliver a unique 16-bit
108 * identifier for each instance sharing the same card. It is
109 * placed instead of 0xFFFE in the interface identifier. The
110 * "u" bit of the interface identifier is not inverted in this
111 * case. Hence the resulting interface identifier has local
112 * scope according to RFC2373.
113 */
114 if (dev->dev_id) {
115 eui[3] = (dev->dev_id >> 8) & 0xFF;
116 eui[4] = dev->dev_id & 0xFF;
117 } else {
118 eui[3] = 0xFF;
119 eui[4] = 0xFE;
120 eui[0] ^= 2;
121 }
122 return 0;
123}
124
94static inline unsigned long addrconf_timeout_fixup(u32 timeout, 125static inline unsigned long addrconf_timeout_fixup(u32 timeout,
95 unsigned int unit) 126 unsigned int unit)
96{ 127{
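addrconf_ifid_eui48() above builds the 8-byte IPv6 interface identifier from a 6-byte MAC, inserting 0xFFFE and flipping the universal/local bit unless the device supplies a non-zero dev_id. A stand-alone sketch of the same transformation; the MAC address is made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* User-space rendering of the EUI-48 -> interface-id logic shown above.
 * dev_id models net_device->dev_id; 0 means the usual modified EUI-64 form. */
static void eui48_to_ifid(uint8_t eui[8], const uint8_t mac[6], uint16_t dev_id)
{
        memcpy(eui, mac, 3);
        memcpy(eui + 5, mac + 3, 3);
        if (dev_id) {
                eui[3] = (dev_id >> 8) & 0xFF;
                eui[4] = dev_id & 0xFF;
        } else {
                eui[3] = 0xFF;
                eui[4] = 0xFE;
                eui[0] ^= 2;    /* invert the universal/local bit */
        }
}

int main(void)
{
        const uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0x0a, 0x0b, 0x0c }; /* example MAC */
        uint8_t ifid[8];
        int i;

        eui48_to_ifid(ifid, mac, 0);
        for (i = 0; i < 8; i++)
                printf("%02x%s", ifid[i], i == 7 ? "\n" : ":");
        /* prints 02:1b:21:ff:fe:0a:0b:0c */
        return 0;
}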
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 20defc0353d1..c1740a2794a3 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -310,6 +310,13 @@ static inline bool bond_uses_primary(struct bonding *bond)
310 return bond_mode_uses_primary(BOND_MODE(bond)); 310 return bond_mode_uses_primary(BOND_MODE(bond));
311} 311}
312 312
313static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
314{
315 struct slave *slave = rcu_dereference(bond->curr_active_slave);
316
317 return bond_uses_primary(bond) && slave ? slave->dev : NULL;
318}
319
313static inline bool bond_slave_is_up(struct slave *slave) 320static inline bool bond_slave_is_up(struct slave *slave)
314{ 321{
315 return netif_running(slave->dev) && netif_carrier_ok(slave->dev); 322 return netif_running(slave->dev) && netif_carrier_ok(slave->dev);
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 39ed2d2fbd51..92a7d85917b4 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -105,14 +105,16 @@ enum ib_cm_data_size {
105 IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, 105 IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
106 IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, 106 IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
107 IB_CM_SIDR_REP_INFO_LENGTH = 72, 107 IB_CM_SIDR_REP_INFO_LENGTH = 72,
108 /* compare done u32 at a time */
109 IB_CM_COMPARE_SIZE = (64 / sizeof(u32))
110}; 108};
111 109
112struct ib_cm_id; 110struct ib_cm_id;
113 111
114struct ib_cm_req_event_param { 112struct ib_cm_req_event_param {
115 struct ib_cm_id *listen_id; 113 struct ib_cm_id *listen_id;
114
115 /* P_Key that was used by the GMP's BTH header */
116 u16 bth_pkey;
117
116 u8 port; 118 u8 port;
117 119
118 struct ib_sa_path_rec *primary_path; 120 struct ib_sa_path_rec *primary_path;
@@ -223,6 +225,9 @@ struct ib_cm_apr_event_param {
223 225
224struct ib_cm_sidr_req_event_param { 226struct ib_cm_sidr_req_event_param {
225 struct ib_cm_id *listen_id; 227 struct ib_cm_id *listen_id;
228 __be64 service_id;
229 /* P_Key that was used by the GMP's BTH header */
230 u16 bth_pkey;
226 u8 port; 231 u8 port;
227 u16 pkey; 232 u16 pkey;
228}; 233};
@@ -337,11 +342,6 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id);
337#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL) 342#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL)
338#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL) 343#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
339 344
340struct ib_cm_compare_data {
341 u32 data[IB_CM_COMPARE_SIZE];
342 u32 mask[IB_CM_COMPARE_SIZE];
343};
344
345/** 345/**
346 * ib_cm_listen - Initiates listening on the specified service ID for 346 * ib_cm_listen - Initiates listening on the specified service ID for
347 * connection and service ID resolution requests. 347 * connection and service ID resolution requests.
@@ -354,12 +354,13 @@ struct ib_cm_compare_data {
354 * range of service IDs. If set to 0, the service ID is matched 354 * range of service IDs. If set to 0, the service ID is matched
355 * exactly. This parameter is ignored if %service_id is set to 355 * exactly. This parameter is ignored if %service_id is set to
356 * IB_CM_ASSIGN_SERVICE_ID. 356 * IB_CM_ASSIGN_SERVICE_ID.
357 * @compare_data: This parameter is optional. It specifies data that must
358 * appear in the private data of a connection request for the specified
359 * listen request.
360 */ 357 */
361int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, 358int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
362 struct ib_cm_compare_data *compare_data); 359 __be64 service_mask);
360
361struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
362 ib_cm_handler cm_handler,
363 __be64 service_id);
363 364
364struct ib_cm_req_param { 365struct ib_cm_req_param {
365 struct ib_sa_path_rec *primary_path; 366 struct ib_sa_path_rec *primary_path;
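With the compare_data filter gone, ULPs that previously passed private-data match rules to ib_cm_listen() now either listen by service ID alone or use the new ib_cm_insert_listen() shown above, which lets several consumers share one listen. An illustrative kernel-side fragment, not buildable on its own; example_cm_handler and EXAMPLE_SERVICE_ID are placeholders, and callers are expected to check the returned id (ERR_PTR convention assumed):

#include <rdma/ib_cm.h>

static ib_cm_handler example_cm_handler;        /* placeholder; defined elsewhere */
#define EXAMPLE_SERVICE_ID cpu_to_be64(0x0000000001234567ULL) /* placeholder */

static struct ib_cm_id *example_start_listen(struct ib_device *device)
{
        /* One call now covers cm_id creation plus (shared) listen registration. */
        return ib_cm_insert_listen(device, example_cm_handler, EXAMPLE_SERVICE_ID);
}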
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index c8422d5a5a91..188df91d5851 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -127,6 +127,23 @@
127#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF 127#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
128#define IB_DEFAULT_PKEY_FULL 0xFFFF 128#define IB_DEFAULT_PKEY_FULL 0xFFFF
129 129
130/*
131 * Generic trap/notice types
132 */
133#define IB_NOTICE_TYPE_FATAL 0x80
134#define IB_NOTICE_TYPE_URGENT 0x81
135#define IB_NOTICE_TYPE_SECURITY 0x82
136#define IB_NOTICE_TYPE_SM 0x83
137#define IB_NOTICE_TYPE_INFO 0x84
138
139/*
140 * Generic trap/notice producers
141 */
142#define IB_NOTICE_PROD_CA cpu_to_be16(1)
143#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2)
144#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3)
145#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4)
146
130enum { 147enum {
131 IB_MGMT_MAD_HDR = 24, 148 IB_MGMT_MAD_HDR = 24,
132 IB_MGMT_MAD_DATA = 232, 149 IB_MGMT_MAD_DATA = 232,
@@ -240,6 +257,70 @@ struct ib_class_port_info {
240 __be32 trap_qkey; 257 __be32 trap_qkey;
241}; 258};
242 259
260struct ib_mad_notice_attr {
261 u8 generic_type;
262 u8 prod_type_msb;
263 __be16 prod_type_lsb;
264 __be16 trap_num;
265 __be16 issuer_lid;
266 __be16 toggle_count;
267
268 union {
269 struct {
270 u8 details[54];
271 } raw_data;
272
273 struct {
274 __be16 reserved;
275 __be16 lid; /* where violation happened */
276 u8 port_num; /* where violation happened */
277 } __packed ntc_129_131;
278
279 struct {
280 __be16 reserved;
281 __be16 lid; /* LID where change occurred */
282 u8 reserved2;
283 u8 local_changes; /* low bit - local changes */
284 __be32 new_cap_mask; /* new capability mask */
285 u8 reserved3;
286 u8 change_flags; /* low 3 bits only */
287 } __packed ntc_144;
288
289 struct {
290 __be16 reserved;
291 __be16 lid; /* lid where sys guid changed */
292 __be16 reserved2;
293 __be64 new_sys_guid;
294 } __packed ntc_145;
295
296 struct {
297 __be16 reserved;
298 __be16 lid;
299 __be16 dr_slid;
300 u8 method;
301 u8 reserved2;
302 __be16 attr_id;
303 __be32 attr_mod;
304 __be64 mkey;
305 u8 reserved3;
306 u8 dr_trunc_hop;
307 u8 dr_rtn_path[30];
308 } __packed ntc_256;
309
310 struct {
311 __be16 reserved;
312 __be16 lid1;
313 __be16 lid2;
314 __be32 key;
315 __be32 sl_qp1; /* SL: high 4 bits */
316 __be32 qp2; /* high 8 bits reserved */
317 union ib_gid gid1;
318 union ib_gid gid2;
319 } __packed ntc_257_258;
320
321 } details;
322};
323
243/** 324/**
244 * ib_mad_send_buf - MAD data buffer and work request for sends. 325 * ib_mad_send_buf - MAD data buffer and work request for sends.
245 * @next: A pointer used to chain together MADs for posting. 326 * @next: A pointer used to chain together MADs for posting.
@@ -388,7 +469,6 @@ enum {
388struct ib_mad_agent { 469struct ib_mad_agent {
389 struct ib_device *device; 470 struct ib_device *device;
390 struct ib_qp *qp; 471 struct ib_qp *qp;
391 struct ib_mr *mr;
392 ib_mad_recv_handler recv_handler; 472 ib_mad_recv_handler recv_handler;
393 ib_mad_send_handler send_handler; 473 ib_mad_send_handler send_handler;
394 ib_mad_snoop_handler snoop_handler; 474 ib_mad_snoop_handler snoop_handler;
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index b1f7592e02e4..709a5331e6b9 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -76,6 +76,8 @@ enum {
76 IB_OPCODE_UC = 0x20, 76 IB_OPCODE_UC = 0x20,
77 IB_OPCODE_RD = 0x40, 77 IB_OPCODE_RD = 0x40,
78 IB_OPCODE_UD = 0x60, 78 IB_OPCODE_UD = 0x60,
79 /* per IBTA 3.1 Table 38, A10.3.2 */
80 IB_OPCODE_CNP = 0x80,
79 81
80 /* operations -- just used to define real constants */ 82 /* operations -- just used to define real constants */
81 IB_OPCODE_SEND_FIRST = 0x00, 83 IB_OPCODE_SEND_FIRST = 0x00,
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index 98b9086d769a..b439e988408e 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -119,10 +119,57 @@ struct ib_port_info {
119 u8 link_roundtrip_latency[3]; 119 u8 link_roundtrip_latency[3];
120}; 120};
121 121
122struct ib_node_info {
123 u8 base_version;
124 u8 class_version;
125 u8 node_type;
126 u8 num_ports;
127 __be64 sys_guid;
128 __be64 node_guid;
129 __be64 port_guid;
130 __be16 partition_cap;
131 __be16 device_id;
132 __be32 revision;
133 u8 local_port_num;
134 u8 vendor_id[3];
135} __packed;
136
137struct ib_vl_weight_elem {
138 u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */
139 /* OPA: VL is low 5 bits, upper 3 bits reserved */
140 u8 weight;
141};
142
122static inline u8 143static inline u8
123ib_get_smp_direction(struct ib_smp *smp) 144ib_get_smp_direction(struct ib_smp *smp)
124{ 145{
125 return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); 146 return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
126} 147}
127 148
149/*
150 * SM Trap/Notice numbers
151 */
152#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129)
153#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130)
154#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131)
155#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144)
156#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145)
157#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256)
158#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257)
159#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258)
160
161/*
162 * Other local changes flags (trap 144).
163 */
164#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
165#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
166#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01
167
168/*
169 * M_Key violation flags in dr_trunc_hop (trap 256).
170 */
171#define IB_NOTICE_TRAP_DR_NOTICE 0x80
172#define IB_NOTICE_TRAP_DR_TRUNC 0x40
173
174
128#endif /* IB_SMI_H */ 175#endif /* IB_SMI_H */
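The notice layout in ib_mad.h together with the trap numbers added here is enough to classify incoming SM traps. A small illustrative helper built only from the definitions shown above (the function name is not part of the patch):

#include <rdma/ib_mad.h>
#include <rdma/ib_smi.h>

/* True for a security-class notice reporting a bad P_Key (trap 257). */
static bool example_notice_is_bad_pkey(const struct ib_mad_notice_attr *notice)
{
        return notice->generic_type == IB_NOTICE_TYPE_SECURITY &&
               notice->trap_num == IB_NOTICE_TRAP_BAD_PKEY;
}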
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 43c1cf01c84b..7845fae6f2df 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -48,6 +48,7 @@
48#include <linux/rwsem.h> 48#include <linux/rwsem.h>
49#include <linux/scatterlist.h> 49#include <linux/scatterlist.h>
50#include <linux/workqueue.h> 50#include <linux/workqueue.h>
51#include <linux/socket.h>
51#include <uapi/linux/if_ether.h> 52#include <uapi/linux/if_ether.h>
52 53
53#include <linux/atomic.h> 54#include <linux/atomic.h>
@@ -64,6 +65,12 @@ union ib_gid {
64 } global; 65 } global;
65}; 66};
66 67
68extern union ib_gid zgid;
69
70struct ib_gid_attr {
71 struct net_device *ndev;
72};
73
67enum rdma_node_type { 74enum rdma_node_type {
68 /* IB values map to NodeInfo:NodeType. */ 75 /* IB values map to NodeInfo:NodeType. */
69 RDMA_NODE_IB_CA = 1, 76 RDMA_NODE_IB_CA = 1,
@@ -284,7 +291,7 @@ enum ib_port_cap_flags {
284 IB_PORT_BOOT_MGMT_SUP = 1 << 23, 291 IB_PORT_BOOT_MGMT_SUP = 1 << 23,
285 IB_PORT_LINK_LATENCY_SUP = 1 << 24, 292 IB_PORT_LINK_LATENCY_SUP = 1 << 24,
286 IB_PORT_CLIENT_REG_SUP = 1 << 25, 293 IB_PORT_CLIENT_REG_SUP = 1 << 25,
287 IB_PORT_IP_BASED_GIDS = 1 << 26 294 IB_PORT_IP_BASED_GIDS = 1 << 26,
288}; 295};
289 296
290enum ib_port_width { 297enum ib_port_width {
@@ -556,20 +563,18 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
556 */ 563 */
557__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); 564__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
558 565
559enum ib_mr_create_flags {
560 IB_MR_SIGNATURE_EN = 1,
561};
562 566
563/** 567/**
564 * ib_mr_init_attr - Memory region init attributes passed to routine 568 * enum ib_mr_type - memory region type
565 * ib_create_mr. 569 * @IB_MR_TYPE_MEM_REG: memory region that is used for
566 * @max_reg_descriptors: max number of registration descriptors that 570 * normal registration
567 * may be used with registration work requests. 571 * @IB_MR_TYPE_SIGNATURE: memory region that is used for
568 * @flags: MR creation flags bit mask. 572 * signature operations (data-integrity
573 * capable regions)
569 */ 574 */
570struct ib_mr_init_attr { 575enum ib_mr_type {
571 int max_reg_descriptors; 576 IB_MR_TYPE_MEM_REG,
572 u32 flags; 577 IB_MR_TYPE_SIGNATURE,
573}; 578};
574 579
575/** 580/**
@@ -1252,9 +1257,11 @@ struct ib_udata {
1252}; 1257};
1253 1258
1254struct ib_pd { 1259struct ib_pd {
1260 u32 local_dma_lkey;
1255 struct ib_device *device; 1261 struct ib_device *device;
1256 struct ib_uobject *uobject; 1262 struct ib_uobject *uobject;
1257 atomic_t usecnt; /* count all resources */ 1263 atomic_t usecnt; /* count all resources */
1264 struct ib_mr *local_mr;
1258}; 1265};
1259 1266
1260struct ib_xrcd { 1267struct ib_xrcd {
@@ -1488,7 +1495,7 @@ struct ib_cache {
1488 rwlock_t lock; 1495 rwlock_t lock;
1489 struct ib_event_handler event_handler; 1496 struct ib_event_handler event_handler;
1490 struct ib_pkey_cache **pkey_cache; 1497 struct ib_pkey_cache **pkey_cache;
1491 struct ib_gid_cache **gid_cache; 1498 struct ib_gid_table **gid_cache;
1492 u8 *lmc_cache; 1499 u8 *lmc_cache;
1493}; 1500};
1494 1501
@@ -1550,6 +1557,8 @@ struct ib_device {
1550 1557
1551 spinlock_t client_data_lock; 1558 spinlock_t client_data_lock;
1552 struct list_head core_list; 1559 struct list_head core_list;
1560 /* Access to the client_data_list is protected by the client_data_lock
1561 * spinlock and the lists_rwsem read-write semaphore */
1553 struct list_head client_data_list; 1562 struct list_head client_data_list;
1554 1563
1555 struct ib_cache cache; 1564 struct ib_cache cache;
@@ -1572,9 +1581,47 @@ struct ib_device {
1572 struct ib_port_attr *port_attr); 1581 struct ib_port_attr *port_attr);
1573 enum rdma_link_layer (*get_link_layer)(struct ib_device *device, 1582 enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
1574 u8 port_num); 1583 u8 port_num);
1584 /* When calling get_netdev, the HW vendor's driver should return the
1585 * net device of device @device at port @port_num or NULL if such
1586 * a net device doesn't exist. The vendor driver should call dev_hold
1587 * on this net device. The HW vendor's device driver must guarantee
1588 * that this function returns NULL before the net device reaches
1589 * NETDEV_UNREGISTER_FINAL state.
1590 */
1591 struct net_device *(*get_netdev)(struct ib_device *device,
1592 u8 port_num);
1575 int (*query_gid)(struct ib_device *device, 1593 int (*query_gid)(struct ib_device *device,
1576 u8 port_num, int index, 1594 u8 port_num, int index,
1577 union ib_gid *gid); 1595 union ib_gid *gid);
1596 /* When calling add_gid, the HW vendor's driver should
1597 * add the gid of device @device at gid index @index of
1598 * port @port_num to be @gid. Meta-info of that gid (for example,
1599 * the network device related to this gid is available
1600 * at @attr. @context allows the HW vendor driver to store extra
1601 * information together with a GID entry. The HW vendor may allocate
1602 * memory to contain this information and store it in @context when a
1603 * new GID entry is written to. Params are consistent until the next
1604 * call of add_gid or delete_gid. The function should return 0 on
1605 * success or error otherwise. The function could be called
1606 * concurrently for different ports. This function is only called
1607 * when roce_gid_table is used.
1608 */
1609 int (*add_gid)(struct ib_device *device,
1610 u8 port_num,
1611 unsigned int index,
1612 const union ib_gid *gid,
1613 const struct ib_gid_attr *attr,
1614 void **context);
1615 /* When calling del_gid, the HW vendor's driver should delete the
1616 * gid of device @device at gid index @index of port @port_num.
1617 * Upon the deletion of a GID entry, the HW vendor must free any
1618 * allocated memory. The caller will clear @context afterwards.
1619 * This function is only called when roce_gid_table is used.
1620 */
1621 int (*del_gid)(struct ib_device *device,
1622 u8 port_num,
1623 unsigned int index,
1624 void **context);
1578 int (*query_pkey)(struct ib_device *device, 1625 int (*query_pkey)(struct ib_device *device,
1579 u8 port_num, u16 index, u16 *pkey); 1626 u8 port_num, u16 index, u16 *pkey);
1580 int (*modify_device)(struct ib_device *device, 1627 int (*modify_device)(struct ib_device *device,
@@ -1668,11 +1715,9 @@ struct ib_device {
1668 int (*query_mr)(struct ib_mr *mr, 1715 int (*query_mr)(struct ib_mr *mr,
1669 struct ib_mr_attr *mr_attr); 1716 struct ib_mr_attr *mr_attr);
1670 int (*dereg_mr)(struct ib_mr *mr); 1717 int (*dereg_mr)(struct ib_mr *mr);
1671 int (*destroy_mr)(struct ib_mr *mr); 1718 struct ib_mr * (*alloc_mr)(struct ib_pd *pd,
1672 struct ib_mr * (*create_mr)(struct ib_pd *pd, 1719 enum ib_mr_type mr_type,
1673 struct ib_mr_init_attr *mr_init_attr); 1720 u32 max_num_sg);
1674 struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
1675 int max_page_list_len);
1676 struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, 1721 struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
1677 int page_list_len); 1722 int page_list_len);
1678 void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); 1723 void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
@@ -1724,6 +1769,7 @@ struct ib_device {
1724 int (*destroy_flow)(struct ib_flow *flow_id); 1769 int (*destroy_flow)(struct ib_flow *flow_id);
1725 int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, 1770 int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
1726 struct ib_mr_status *mr_status); 1771 struct ib_mr_status *mr_status);
1772 void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
1727 1773
1728 struct ib_dma_mapping_ops *dma_ops; 1774 struct ib_dma_mapping_ops *dma_ops;
1729 1775
@@ -1761,8 +1807,30 @@ struct ib_device {
1761struct ib_client { 1807struct ib_client {
1762 char *name; 1808 char *name;
1763 void (*add) (struct ib_device *); 1809 void (*add) (struct ib_device *);
1764 void (*remove)(struct ib_device *); 1810 void (*remove)(struct ib_device *, void *client_data);
1765 1811
1812 /* Returns the net_dev belonging to this ib_client and matching the
1813 * given parameters.
1814 * @dev: An RDMA device that the net_dev use for communication.
1815 * @port: A physical port number on the RDMA device.
1816 * @pkey: P_Key that the net_dev uses if applicable.
1817 * @gid: A GID that the net_dev uses to communicate.
1818 * @addr: An IP address the net_dev is configured with.
1819 * @client_data: The device's client data set by ib_set_client_data().
1820 *
1821 * An ib_client that implements a net_dev on top of RDMA devices
1822 * (such as IP over IB) should implement this callback, allowing the
1823 * rdma_cm module to find the right net_dev for a given request.
1824 *
1825 * The caller is responsible for calling dev_put on the returned
1826 * netdev. */
1827 struct net_device *(*get_net_dev_by_params)(
1828 struct ib_device *dev,
1829 u8 port,
1830 u16 pkey,
1831 const union ib_gid *gid,
1832 const struct sockaddr *addr,
1833 void *client_data);
1766 struct list_head list; 1834 struct list_head list;
1767}; 1835};
1768 1836
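With the reworked ib_client above, remove() now receives the client data that was stored with ib_set_client_data(), so a driver no longer has to look it up during hot removal. A hypothetical client sketch under that assumption (all names here are illustrative, not from the patch):

#include <linux/slab.h>
#include <rdma/ib_verbs.h>

static struct ib_client example_client;

static void example_add_one(struct ib_device *device)
{
	void *priv = kzalloc(64, GFP_KERNEL);	/* per-device state */

	ib_set_client_data(device, &example_client, priv);
}

static void example_remove_one(struct ib_device *device, void *client_data)
{
	/* client_data arrives directly; no ib_get_client_data() lookup */
	kfree(client_data);
}

static struct ib_client example_client = {
	.name	= "example",
	.add	= example_add_one,
	.remove	= example_remove_one,
};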
@@ -2071,34 +2139,6 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
2071} 2139}
2072 2140
2073/** 2141/**
2074 * rdma_cap_read_multi_sge - Check if the port of device has the capability
2075 * RDMA Read Multiple Scatter-Gather Entries.
2076 * @device: Device to check
2077 * @port_num: Port number to check
2078 *
2079 * iWARP has a restriction that RDMA READ requests may only have a single
2080 * Scatter/Gather Entry (SGE) in the work request.
2081 *
2082 * NOTE: although the linux kernel currently assumes all devices are either
2083 * single SGE RDMA READ devices or identical SGE maximums for RDMA READs and
2084 * WRITEs, according to Tom Talpey, this is not accurate. There are some
2085 * devices out there that support more than a single SGE on RDMA READ
2086 * requests, but do not support the same number of SGEs as they do on
2087 * RDMA WRITE requests. The linux kernel would need rearchitecting to
2088 * support these imbalanced READ/WRITE SGEs allowed devices. So, for now,
2089 * suffice with either the device supports the same READ/WRITE SGEs, or
2090 * it only gets one READ sge.
2091 *
2092 * Return: true for any device that allows more than one SGE in RDMA READ
2093 * requests.
2094 */
2095static inline bool rdma_cap_read_multi_sge(struct ib_device *device,
2096 u8 port_num)
2097{
2098 return !(device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP);
2099}
2100
2101/**
2102 * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. 2142 * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
2103 * 2143 *
2104 * @device: Device 2144 * @device: Device
@@ -2115,6 +2155,26 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n
2115 return device->port_immutable[port_num].max_mad_size; 2155 return device->port_immutable[port_num].max_mad_size;
2116} 2156}
2117 2157
2158/**
2159 * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
2160 * @device: Device to check
2161 * @port_num: Port number to check
2162 *
2163 * RoCE GID table mechanism manages the various GIDs for a device.
2164 *
2165 * NOTE: if allocating the port's GID table has failed, this call will still
2166 * return true, but any RoCE GID table API will fail.
2167 *
2168 * Return: true if the port uses RoCE GID table mechanism in order to manage
2169 * its GIDs.
2170 */
2171static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
2172 u8 port_num)
2173{
2174 return rdma_protocol_roce(device, port_num) &&
2175 device->add_gid && device->del_gid;
2176}
2177
2118int ib_query_gid(struct ib_device *device, 2178int ib_query_gid(struct ib_device *device,
2119 u8 port_num, int index, union ib_gid *gid); 2179 u8 port_num, int index, union ib_gid *gid);
2120 2180
@@ -2135,20 +2195,9 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
2135int ib_find_pkey(struct ib_device *device, 2195int ib_find_pkey(struct ib_device *device,
2136 u8 port_num, u16 pkey, u16 *index); 2196 u8 port_num, u16 pkey, u16 *index);
2137 2197
2138/**
2139 * ib_alloc_pd - Allocates an unused protection domain.
2140 * @device: The device on which to allocate the protection domain.
2141 *
2142 * A protection domain object provides an association between QPs, shared
2143 * receive queues, address handles, memory regions, and memory windows.
2144 */
2145struct ib_pd *ib_alloc_pd(struct ib_device *device); 2198struct ib_pd *ib_alloc_pd(struct ib_device *device);
2146 2199
2147/** 2200void ib_dealloc_pd(struct ib_pd *pd);
2148 * ib_dealloc_pd - Deallocates a protection domain.
2149 * @pd: The protection domain to deallocate.
2150 */
2151int ib_dealloc_pd(struct ib_pd *pd);
2152 2201
2153/** 2202/**
2154 * ib_create_ah - Creates an address handle for the given address vector. 2203 * ib_create_ah - Creates an address handle for the given address vector.
@@ -2775,33 +2824,9 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
2775 */ 2824 */
2776int ib_dereg_mr(struct ib_mr *mr); 2825int ib_dereg_mr(struct ib_mr *mr);
2777 2826
2778 2827struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
2779/** 2828 enum ib_mr_type mr_type,
2780 * ib_create_mr - Allocates a memory region that may be used for 2829 u32 max_num_sg);
2781 * signature handover operations.
2782 * @pd: The protection domain associated with the region.
2783 * @mr_init_attr: memory region init attributes.
2784 */
2785struct ib_mr *ib_create_mr(struct ib_pd *pd,
2786 struct ib_mr_init_attr *mr_init_attr);
2787
2788/**
2789 * ib_destroy_mr - Destroys a memory region that was created using
2790 * ib_create_mr and removes it from HW translation tables.
2791 * @mr: The memory region to destroy.
2792 *
2793 * This function can fail, if the memory region has memory windows bound to it.
2794 */
2795int ib_destroy_mr(struct ib_mr *mr);
2796
2797/**
2798 * ib_alloc_fast_reg_mr - Allocates memory region usable with the
2799 * IB_WR_FAST_REG_MR send work request.
2800 * @pd: The protection domain associated with the region.
2801 * @max_page_list_len: requested max physical buffer list length to be
2802 * used with fast register work requests for this MR.
2803 */
2804struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
2805 2830
2806/** 2831/**
2807 * ib_alloc_fast_reg_page_list - Allocates a page list array 2832 * ib_alloc_fast_reg_page_list - Allocates a page list array
@@ -2994,4 +3019,8 @@ static inline int ib_check_mr_access(int flags)
2994int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, 3019int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
2995 struct ib_mr_status *mr_status); 3020 struct ib_mr_status *mr_status);
2996 3021
3022struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
3023 u16 pkey, const union ib_gid *gid,
3024 const struct sockaddr *addr);
3025
2997#endif /* IB_VERBS_H */ 3026#endif /* IB_VERBS_H */
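The ib_verbs.h changes above fold ib_create_mr() and ib_alloc_fast_reg_mr() into a single alloc_mr verb and make ib_dealloc_pd() return void. A minimal calling sketch, not taken from the patch (the function name is illustrative and error unwinding is trimmed):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int example_setup_reg_mr(struct ib_device *device, u32 max_sg)
{
	struct ib_pd *pd;
	struct ib_mr *mr;

	pd = ib_alloc_pd(device);
	if (IS_ERR(pd))
		return PTR_ERR(pd);

	/* One fast-registration MR, bounded to max_sg scatter/gather entries */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_sg);
	if (IS_ERR(mr)) {
		ib_dealloc_pd(pd);		/* now returns void */
		return PTR_ERR(mr);
	}

	/* ... use the MR with fast-register work requests ... */

	ib_dereg_mr(mr);
	ib_dealloc_pd(pd);
	return 0;
}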
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
new file mode 100644
index 000000000000..391dae1931c0
--- /dev/null
+++ b/include/rdma/opa_port_info.h
@@ -0,0 +1,433 @@
1/*
2 * Copyright (c) 2014 Intel Corporation. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#if !defined(OPA_PORT_INFO_H)
34#define OPA_PORT_INFO_H
35
36/* Temporary until HFI driver is updated */
37#ifndef USE_PI_LED_ENABLE
38#define USE_PI_LED_ENABLE 0
39#endif
40
41#define OPA_PORT_LINK_MODE_NOP 0 /* No change */
42#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */
43
44#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */
45#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */
46#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */
47#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */
48#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */
49
50#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */
51#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */
52#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */
53#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */
54#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */
55
56/* Link Down / Neighbor Link Down Reason; indicated as follows: */
57#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */
58#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1
59#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2
60#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3
61#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4
62#define OPA_LINKDOWN_REASON_BAD_SLID 5
63#define OPA_LINKDOWN_REASON_BAD_DLID 6
64#define OPA_LINKDOWN_REASON_BAD_L2 7
65#define OPA_LINKDOWN_REASON_BAD_SC 8
66#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9
67#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10
68#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11
69#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12
70#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13
71#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14
72#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15
73#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16
74#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17
75#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18
76#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19
77#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20
78#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21
79#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22
80#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23
81#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24
82#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25
83#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26
84#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27
85#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28
86#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29
87#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30
88#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31
89#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32
90#define OPA_LINKDOWN_REASON_UNKNOWN 33
91/* 34 -reserved */
92#define OPA_LINKDOWN_REASON_REBOOT 35
93#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36
94/* 37-38 reserved */
95#define OPA_LINKDOWN_REASON_FM_BOUNCE 39
96#define OPA_LINKDOWN_REASON_SPEED_POLICY 40
97#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41
98/* 42-48 reserved */
99#define OPA_LINKDOWN_REASON_DISCONNECTED 49
100#define OPA_LINKDOWN_REASONLOCAL_MEDIA_NOT_INSTALLED 50
101#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51
102#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52
103/* 53 reserved */
104#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54
105/* 55 reserved */
106#define OPA_LINKDOWN_REASON_POWER_POLICY 56
107#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57
108#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58
109/* 59 reserved */
110#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60
111#define OPA_LINKDOWN_REASON_SMA_DISABLED 61
112/* 62 reserved */
113#define OPA_LINKDOWN_REASON_TRANSIENT 63
114/* 64-255 reserved */
115
116/* OPA Link Init reason; indicated as follows: */
117/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */
118#define OPA_LINKINIT_REASON_NOP 0
119#define OPA_LINKINIT_REASON_LINKUP (1 << 4)
120#define OPA_LINKINIT_REASON_FLAPPING (2 << 4)
121#define OPA_LINKINIT_REASON_CLEAR (8 << 4)
122#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4)
123#define OPA_LINKINIT_QUARANTINED (9 << 4)
124#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4)
125
126#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */
127#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */
128#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */
129
130#define OPA_LINK_WIDTH_1X 0x0001
131#define OPA_LINK_WIDTH_2X 0x0002
132#define OPA_LINK_WIDTH_3X 0x0004
133#define OPA_LINK_WIDTH_4X 0x0008
134
135#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7)
136#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6)
137#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5)
138#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4)
139#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3)
140/* reserved (1 << 2) */
141#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1)
142#define OPA_CAP_MASK3_IsVLrSupported (1 << 0)
143
144/**
145 * new MTU values
146 */
147enum {
148 OPA_MTU_8192 = 6,
149 OPA_MTU_10240 = 7,
150};
151
152enum {
153 OPA_PORT_PHYS_CONF_DISCONNECTED = 0,
154 OPA_PORT_PHYS_CONF_STANDARD = 1,
155 OPA_PORT_PHYS_CONF_FIXED = 2,
156 OPA_PORT_PHYS_CONF_VARIABLE = 3,
157 OPA_PORT_PHYS_CONF_SI_PHOTO = 4
158};
159
160enum port_info_field_masks {
161 /* vl.cap */
162 OPA_PI_MASK_VL_CAP = 0x1F,
163 /* port_states.ledenable_offlinereason */
164 OPA_PI_MASK_OFFLINE_REASON = 0x0F,
165 OPA_PI_MASK_LED_ENABLE = 0x40,
166 /* port_states.unsleepstate_downdefstate */
167 OPA_PI_MASK_UNSLEEP_STATE = 0xF0,
168 OPA_PI_MASK_DOWNDEF_STATE = 0x0F,
169 /* port_states.portphysstate_portstate */
170 OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0,
171 OPA_PI_MASK_PORT_STATE = 0x0F,
172 /* port_phys_conf */
173 OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F,
174 /* collectivemask_multicastmask */
175 OPA_PI_MASK_COLLECT_MASK = 0x38,
176 OPA_PI_MASK_MULTICAST_MASK = 0x07,
177 /* mkeyprotect_lmc */
178 OPA_PI_MASK_MKEY_PROT_BIT = 0xC0,
179 OPA_PI_MASK_LMC = 0x0F,
180 /* smsl */
181 OPA_PI_MASK_SMSL = 0x1F,
182 /* partenforce_filterraw */
183 /* Filter Raw In/Out bits 1 and 2 were removed */
184 OPA_PI_MASK_LINKINIT_REASON = 0xF0,
185 OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08,
186 OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04,
187 /* operational_vls */
188 OPA_PI_MASK_OPERATIONAL_VL = 0x1F,
189 /* sa_qp */
190 OPA_PI_MASK_SA_QP = 0x00FFFFFF,
191 /* sm_trap_qp */
192 OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF,
193 /* localphy_overrun_errors */
194 OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0,
195 OPA_PI_MASK_OVERRUN_ERRORS = 0x0F,
196 /* clientrereg_subnettimeout */
197 OPA_PI_MASK_CLIENT_REREGISTER = 0x80,
198 OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F,
199 /* port_link_mode */
200 OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10),
201 OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5),
202 OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0),
203 /* port_link_crc_mode */
204 OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00,
205 OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0,
206 OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F,
207 /* port_mode */
208 OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001,
209 OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002,
210 OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004,
211 OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008,
212 OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010,
213 OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020,
214 OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040,
215 /* flit_control.interleave */
216 OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12),
217 OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10),
218 OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5),
219 OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0),
220
221 /* port_error_action */
222 OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000,
223 /* 7 bits reserved */
224 OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000,
225 OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000,
226 OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000,
227 OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000,
228 OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000,
229 OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000,
230 OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000,
231 OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000,
232 /* 2 bits reserved */
233 OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000,
234 OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000,
235 OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800,
236 /* 1 bit reserved */
237 OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200,
238 /* 1 bit reserved */
239 OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080,
240 OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040,
241 OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020,
242 OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010,
243 OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008,
244 OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004,
245 OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002,
246 OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001,
247
248 /* pass_through.res_drctl */
249 OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01,
250
251 /* buffer_units */
252 OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11),
253 OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6),
254 OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3),
255 OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0),
256
257 /* neigh_mtu.pvlx_to_mtu */
258 OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0,
259 OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F,
260
261 /* neigh_mtu.vlstall_hoq_life */
262 OPA_PI_MASK_VL_STALL = (0x03 << 5),
263 OPA_PI_MASK_HOQ_LIFE = (0x1F << 0),
264
265 /* port_neigh_mode */
266 OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3),
267 OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2),
268 OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0),
269
270 /* resptime_value */
271 OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F,
272
273 /* mtucap */
274 OPA_PI_MASK_MTU_CAP = 0x0F,
275};
276
277#if USE_PI_LED_ENABLE
278struct opa_port_states {
279 u8 reserved;
280 u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */
281 u8 reserved2;
282 u8 portphysstate_portstate; /* 4 bits, 4 bits */
283};
284#define PI_LED_ENABLE_SUP 1
285#else
286struct opa_port_states {
287 u8 reserved;
288 u8 offline_reason; /* 2 res, 6 bits */
289 u8 reserved2;
290 u8 portphysstate_portstate; /* 4 bits, 4 bits */
291};
292#define PI_LED_ENABLE_SUP 0
293#endif
294
295struct opa_port_state_info {
296 struct opa_port_states port_states;
297 u16 link_width_downgrade_tx_active;
298 u16 link_width_downgrade_rx_active;
299};
300
301struct opa_port_info {
302 __be32 lid;
303 __be32 flow_control_mask;
304
305 struct {
306 u8 res; /* was inittype */
307 u8 cap; /* 3 res, 5 bits */
308 __be16 high_limit;
309 __be16 preempt_limit;
310 u8 arb_high_cap;
311 u8 arb_low_cap;
312 } vl;
313
314 struct opa_port_states port_states;
315 u8 port_phys_conf; /* 4 res, 4 bits */
316 u8 collectivemask_multicastmask; /* 2 res, 3, 3 */
317 u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */
318 u8 smsl; /* 3 res, 5 bits */
319
320 u8 partenforce_filterraw; /* bit fields */
321 u8 operational_vls; /* 3 res, 5 bits */
322 __be16 pkey_8b;
323 __be16 pkey_10b;
324 __be16 mkey_violations;
325
326 __be16 pkey_violations;
327 __be16 qkey_violations;
328 __be32 sm_trap_qp; /* 8 bits, 24 bits */
329
330 __be32 sa_qp; /* 8 bits, 24 bits */
331 u8 neigh_port_num;
332 u8 link_down_reason;
333 u8 neigh_link_down_reason;
334 u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */
335
336 struct {
337 __be16 supported;
338 __be16 enabled;
339 __be16 active;
340 } link_speed;
341 struct {
342 __be16 supported;
343 __be16 enabled;
344 __be16 active;
345 } link_width;
346 struct {
347 __be16 supported;
348 __be16 enabled;
349 __be16 tx_active;
350 __be16 rx_active;
351 } link_width_downgrade;
352 __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */
353 __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */
354
355 __be16 port_mode; /* 9 res, bit fields */
356 struct {
357 __be16 supported;
358 __be16 enabled;
359 } port_packet_format;
360 struct {
361 __be16 interleave; /* 2 res, 2,2,5,5 */
362 struct {
363 __be16 min_initial;
364 __be16 min_tail;
365 u8 large_pkt_limit;
366 u8 small_pkt_limit;
367 u8 max_small_pkt_limit;
368 u8 preemption_limit;
369 } preemption;
370 } flit_control;
371
372 __be32 reserved4;
373 __be32 port_error_action; /* bit field */
374
375 struct {
376 u8 egress_port;
377 u8 res_drctl; /* 7 res, 1 */
378 } pass_through;
379 __be16 mkey_lease_period;
380 __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */
381
382 __be32 reserved5;
383 __be32 sm_lid;
384
385 __be64 mkey;
386
387 __be64 subnet_prefix;
388
389 struct {
390 u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */
391 } neigh_mtu;
392
393 struct {
394 u8 vlstall_hoqlife; /* 3 bits, 5 bits */
395 } xmit_q[OPA_MAX_VLS];
396
397 struct {
398 u8 addr[16];
399 } ipaddr_ipv6;
400
401 struct {
402 u8 addr[4];
403 } ipaddr_ipv4;
404
405 u32 reserved6;
406 u32 reserved7;
407 u32 reserved8;
408
409 __be64 neigh_node_guid;
410
411 __be32 ib_cap_mask;
412 __be16 reserved9; /* was ib_cap_mask2 */
413 __be16 opa_cap_mask;
414
415 __be32 reserved10; /* was link_roundtrip_latency */
416 __be16 overall_buffer_space;
417 __be16 reserved11; /* was max_credit_hint */
418
419 __be16 diag_code;
420 struct {
421 u8 buffer;
422 u8 wire;
423 } replay_depth;
424 u8 port_neigh_mode;
425 u8 mtucap; /* 4 res, 4 bits */
426
427 u8 resptimevalue; /* 3 res, 5 bits */
428 u8 local_port_num;
429 u8 reserved12;
430 u8 reserved13; /* was guid_cap */
431} __attribute__ ((packed));
432
433#endif /* OPA_PORT_INFO_H */
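Most of opa_port_info.h packs several sub-fields per byte and decodes them with the port_info_field_masks above. A small sketch of pulling the two state nibbles out of portphysstate_portstate (illustrative helpers, not part of the header):

#include <rdma/opa_smi.h>	/* provides OPA_MAX_VLS used by opa_port_info.h */
#include <rdma/opa_port_info.h>

static u8 example_opa_port_state(const struct opa_port_states *ps)
{
	/* low nibble: logical port state */
	return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
}

static u8 example_opa_port_phys_state(const struct opa_port_states *ps)
{
	/* high nibble: physical port state */
	return (ps->portphysstate_portstate &
		OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
}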
diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h
index 29063e84c253..4a529ef47995 100644
--- a/include/rdma/opa_smi.h
+++ b/include/rdma/opa_smi.h
@@ -40,6 +40,10 @@
40#define OPA_SMP_DR_DATA_SIZE 1872 40#define OPA_SMP_DR_DATA_SIZE 1872
41#define OPA_SMP_MAX_PATH_HOPS 64 41#define OPA_SMP_MAX_PATH_HOPS 64
42 42
43#define OPA_MAX_VLS 32
44#define OPA_MAX_SLS 32
45#define OPA_MAX_SCS 32
46
43#define OPA_SMI_CLASS_VERSION 0x80 47#define OPA_SMI_CLASS_VERSION 0x80
44 48
45#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF) 49#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF)
@@ -73,6 +77,49 @@ struct opa_smp {
73} __packed; 77} __packed;
74 78
75 79
80/* Subnet management attributes */
81/* ... */
82#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010)
83#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011)
84#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015)
85#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016)
86#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017)
87#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018)
88#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020)
89#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032)
90#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080)
91#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082)
92#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083)
93#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084)
94#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085)
95/* ... */
96#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087)
97/* ... */
98#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A)
99/* ... */
100
101struct opa_node_description {
102 u8 data[64];
103} __attribute__ ((packed));
104
105struct opa_node_info {
106 u8 base_version;
107 u8 class_version;
108 u8 node_type;
109 u8 num_ports;
110 __be32 reserved;
111 __be64 system_image_guid;
112 __be64 node_guid;
113 __be64 port_guid;
114 __be16 partition_cap;
115 __be16 device_id;
116 __be32 revision;
117 u8 local_port_num;
118 u8 vendor_id[3]; /* network byte order */
119} __attribute__ ((packed));
120
121#define OPA_PARTITION_TABLE_BLK_SIZE 32
122
76static inline u8 123static inline u8
77opa_get_smp_direction(struct opa_smp *smp) 124opa_get_smp_direction(struct opa_smp *smp)
78{ 125{
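The new OPA_ATTRIB_ID_* values are already byte-swapped with cpu_to_be16(), so they compare directly against the big-endian attr_id of struct opa_smp. A hypothetical dispatch sketch (handler bodies and the function name are placeholders):

#include <linux/errno.h>
#include <rdma/opa_smi.h>

static int example_handle_opa_sma(struct opa_smp *smp)
{
	/* attr_id is big-endian, as are the OPA_ATTRIB_ID_* constants */
	switch (smp->attr_id) {
	case OPA_ATTRIB_ID_PORT_INFO:
	case OPA_ATTRIB_ID_PORT_STATE_INFO:
		return 0;	/* placeholder: port attribute handling */
	case OPA_ATTRIB_ID_PARTITION_TABLE:
		return 0;	/* placeholder: P_Key table handling */
	default:
		return -EOPNOTSUPP;
	}
}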
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 0790882e0c9b..585266144329 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -77,4 +77,11 @@ int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
77int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, 77int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
78 unsigned int group, gfp_t flags); 78 unsigned int group, gfp_t flags);
79 79
80/**
81 * Check if there are any listeners to the netlink group
82 * @group: the netlink group ID
83 * Returns 0 if the group has listeners or a negative value if it has none.
84 */
85int ibnl_chk_listeners(unsigned int group);
86
80#endif /* _RDMA_NETLINK_H */ 87#endif /* _RDMA_NETLINK_H */
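ibnl_chk_listeners() lets a caller probe for user-space listeners before building a request, e.g. to fall back to the in-kernel SA path when no cache daemon has joined RDMA_NL_GROUP_LS. A one-line sketch of that check (the wrapper name is illustrative):

#include <linux/types.h>
#include <rdma/rdma_netlink.h>

static bool example_ls_daemon_present(void)
{
	/* 0 means at least one listener has joined the LS multicast group */
	return ibnl_chk_listeners(RDMA_NL_GROUP_LS) == 0;
}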
diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild
index 687ae332200f..231901b08f6c 100644
--- a/include/uapi/rdma/Kbuild
+++ b/include/uapi/rdma/Kbuild
@@ -5,3 +5,4 @@ header-y += ib_user_sa.h
5header-y += ib_user_verbs.h 5header-y += ib_user_verbs.h
6header-y += rdma_netlink.h 6header-y += rdma_netlink.h
7header-y += rdma_user_cm.h 7header-y += rdma_user_cm.h
8header-y += hfi/
diff --git a/include/uapi/rdma/hfi/Kbuild b/include/uapi/rdma/hfi/Kbuild
new file mode 100644
index 000000000000..ef23c294fc71
--- /dev/null
+++ b/include/uapi/rdma/hfi/Kbuild
@@ -0,0 +1,2 @@
1# UAPI Header export list
2header-y += hfi1_user.h
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
new file mode 100644
index 000000000000..78c442fbf263
--- /dev/null
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -0,0 +1,427 @@
1/*
2 *
3 * This file is provided under a dual BSD/GPLv2 license. When using or
4 * redistributing this file, you may do so under either license.
5 *
6 * GPL LICENSE SUMMARY
7 *
8 * Copyright(c) 2015 Intel Corporation.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of version 2 of the GNU General Public License as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * BSD LICENSE
20 *
21 * Copyright(c) 2015 Intel Corporation.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 *
27 * - Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * - Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in
31 * the documentation and/or other materials provided with the
32 * distribution.
33 * - Neither the name of Intel Corporation nor the names of its
34 * contributors may be used to endorse or promote products derived
35 * from this software without specific prior written permission.
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 */
50
51/*
52 * This file contains defines, structures, etc. that are used
53 * to communicate between kernel and user code.
54 */
55
56#ifndef _LINUX__HFI1_USER_H
57#define _LINUX__HFI1_USER_H
58
59#include <linux/types.h>
60
61/*
62 * This version number is given to the driver by the user code during
63 * initialization in the userversion field of hfi1_user_info, so
64 * the driver can check for compatibility with user code.
65 *
66 * The major version changes when data structures change in an incompatible
67 * way. The driver and user major versions must match for initialization to succeed.
68 */
69#define HFI1_USER_SWMAJOR 4
70
71/*
72 * Minor version differences are always compatible
73 * within a major version; however, if user software is newer
74 * than driver software, some new features and/or structure fields
75 * may not be implemented; the user code must deal with this if it
76 * cares, or it must abort after initialization reports the difference.
77 */
78#define HFI1_USER_SWMINOR 0
79
80/*
81 * Set of HW and driver capability/feature bits.
82 * These bit values are used to configure enabled/disabled HW and
83 * driver features. The same set of bits are communicated to user
84 * space.
85 */
86#define HFI1_CAP_DMA_RTAIL (1UL << 0) /* Use DMA'ed RTail value */
87#define HFI1_CAP_SDMA (1UL << 1) /* Enable SDMA support */
88#define HFI1_CAP_SDMA_AHG (1UL << 2) /* Enable SDMA AHG support */
89#define HFI1_CAP_EXTENDED_PSN (1UL << 3) /* Enable Extended PSN support */
90#define HFI1_CAP_HDRSUPP (1UL << 4) /* Enable Header Suppression */
91/* 1UL << 5 reserved */
92#define HFI1_CAP_USE_SDMA_HEAD (1UL << 6) /* DMA Hdr Q tail vs. use CSR */
93#define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs*/
94#define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */
95#define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */
96#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Enable Expected TID caching */
97#define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */
98#define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */
99#define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Enable ctxt integrity checks */
100#define HFI1_CAP_PKEY_CHECK (1UL << 14) /* Enable ctxt PKey checking */
101#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */
102#define HFI1_CAP_QSFP_ENABLED (1UL << 16) /* Enable QSFP check during LNI */
103#define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */
104#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */
105
106#define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0)
107#define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1)
108#define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2)
109
110/*
111 * If the unit is specified via open, HFI choice is fixed. If port is
112 * specified, it's also fixed. Otherwise we try to spread contexts
113 * across ports and HFIs, using different algorithms. WITHIN is
114 * the old default, prior to this mechanism.
115 */
116#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
117 * ports; this is the default */
118#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
119 * active ports within), then next HFI */
120#define HFI1_ALG_COUNT 2 /* number of algorithm choices */
121
122
123/* User commands. */
124#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */
125#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */
126#define HFI1_CMD_USER_INFO 3 /* set up userspace */
127#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */
128#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */
129#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */
130#define HFI1_CMD_SDMA_STATUS_UPD 7 /* force update of SDMA status ring */
131
132#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */
133#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */
134#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */
135#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */
136#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */
137/* separate EPROM commands from normal PSM commands */
138#define HFI1_CMD_EP_INFO 64 /* read EPROM device ID */
139#define HFI1_CMD_EP_ERASE_CHIP 65 /* erase whole EPROM */
140#define HFI1_CMD_EP_ERASE_P0 66 /* erase EPROM partition 0 */
141#define HFI1_CMD_EP_ERASE_P1 67 /* erase EPROM partition 1 */
142#define HFI1_CMD_EP_READ_P0 68 /* read EPROM partition 0 */
143#define HFI1_CMD_EP_READ_P1 69 /* read EPROM partition 1 */
144#define HFI1_CMD_EP_WRITE_P0 70 /* write EPROM partition 0 */
145#define HFI1_CMD_EP_WRITE_P1 71 /* write EPROM partition 1 */
146
147#define _HFI1_EVENT_FROZEN_BIT 0
148#define _HFI1_EVENT_LINKDOWN_BIT 1
149#define _HFI1_EVENT_LID_CHANGE_BIT 2
150#define _HFI1_EVENT_LMC_CHANGE_BIT 3
151#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4
152#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT
153
154#define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT)
155#define HFI1_EVENT_LINKDOWN_BIT (1UL << _HFI1_EVENT_LINKDOWN_BIT)
156#define HFI1_EVENT_LID_CHANGE_BIT (1UL << _HFI1_EVENT_LID_CHANGE_BIT)
157#define HFI1_EVENT_LMC_CHANGE_BIT (1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
158#define HFI1_EVENT_SL2VL_CHANGE_BIT (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
159
160/*
161 * These are the status bits readable (in ASCII form, 64bit value)
162 * from the "status" sysfs file. For binary compatibility, values
163 * must remain as is; removed states can be reused for different
164 * purposes.
165 */
166#define HFI1_STATUS_INITTED 0x1 /* basic initialization done */
167/* Chip has been found and initialized */
168#define HFI1_STATUS_CHIP_PRESENT 0x20
169/* IB link is at ACTIVE, usable for data traffic */
170#define HFI1_STATUS_IB_READY 0x40
171/* link is configured, LID, MTU, etc. have been set */
172#define HFI1_STATUS_IB_CONF 0x80
173/* A Fatal hardware error has occurred. */
174#define HFI1_STATUS_HWERROR 0x200
175
176/*
177 * Number of supported shared contexts.
178 * This is the maximum number of software contexts that can share
179 * a hardware send/receive context.
180 */
181#define HFI1_MAX_SHARED_CTXTS 8
182
183/*
184 * Poll types
185 */
186#define HFI1_POLL_TYPE_ANYRCV 0x0
187#define HFI1_POLL_TYPE_URGENT 0x1
188
189/*
190 * This structure is passed to the driver to tell it where
191 * user code buffers are, sizes, etc. The offsets and sizes of the
192 * fields must remain unchanged, for binary compatibility. It can
193 * be extended if userversion is changed, so user code can tell if needed
194 */
195struct hfi1_user_info {
196 /*
197 * version of user software, to detect compatibility issues.
198 * Should be set to HFI1_USER_SWVERSION.
199 */
200 __u32 userversion;
201 __u16 pad;
202 /* HFI selection algorithm, if unit has not selected */
203 __u16 hfi1_alg;
204 /*
205 * If two or more processes wish to share a context, each process
206 * must set the subcontext_cnt and subcontext_id to the same
207 * values. The only restriction on the subcontext_id is that
208 * it be unique for a given node.
209 */
210 __u16 subctxt_cnt;
211 __u16 subctxt_id;
212 /* 128bit UUID passed in by PSM. */
213 __u8 uuid[16];
214};
215
216struct hfi1_ctxt_info {
217 __u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */
218 __u32 rcvegr_size; /* size of each eager buffer */
219 __u16 num_active; /* number of active units */
220 __u16 unit; /* unit (chip) assigned to caller */
221 __u16 ctxt; /* ctxt on unit assigned to caller */
222 __u16 subctxt; /* subctxt on unit assigned to caller */
223 __u16 rcvtids; /* number of Rcv TIDs for this context */
224 __u16 credits; /* number of PIO credits for this context */
225 __u16 numa_node; /* NUMA node of the assigned device */
226 __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */
227 __u16 send_ctxt; /* send context in use by this user context */
228 __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */
229 __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */
230 __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */
231 __u16 sdma_ring_size; /* number of entries in SDMA request ring */
232};
233
234struct hfi1_tid_info {
235 /* virtual address of first page in transfer */
236 __u64 vaddr;
237 /* pointer to tid array. this array is big enough */
238 __u64 tidlist;
239 /* number of tids programmed by this request */
240 __u32 tidcnt;
241 /* length of transfer buffer programmed by this request */
242 __u32 length;
243 /*
244 * pointer to bitmap of TIDs used for this call;
245 * checked for being large enough at open
246 */
247 __u64 tidmap;
248};
249
250struct hfi1_cmd {
251 __u32 type; /* command type */
252 __u32 len; /* length of struct pointed to by add */
253 __u64 addr; /* pointer to user structure */
254};
255
256enum hfi1_sdma_comp_state {
257 FREE = 0,
258 QUEUED,
259 COMPLETE,
260 ERROR
261};
262
263/*
264 * SDMA completion ring entry
265 */
266struct hfi1_sdma_comp_entry {
267 __u32 status;
268 __u32 errcode;
269};
270
271/*
272 * Device status and notifications from driver to user-space.
273 */
274struct hfi1_status {
275 __u64 dev; /* device/hw status bits */
276 __u64 port; /* port state and status bits */
277 char freezemsg[0];
278};
279
280/*
281 * This structure is returned by the driver immediately after
282 * open to get implementation-specific info, and info specific to this
283 * instance.
284 *
285 * This struct must have explicit pad fields where type sizes
286 * may result in different alignments between 32 and 64 bit
287 * programs, since the 64 bit kernel requires the user code
288 * to have matching offsets
289 */
290struct hfi1_base_info {
291 /* version of hardware, for feature checking. */
292 __u32 hw_version;
293 /* version of software, for feature checking. */
294 __u32 sw_version;
295 /* Job key */
296 __u16 jkey;
297 __u16 padding1;
298 /*
299 * The special QP (queue pair) value that identifies PSM
300 * protocol packet from standard IB packets.
301 */
302 __u32 bthqp;
303 /* PIO credit return address, */
304 __u64 sc_credits_addr;
305 /*
306 * Base address of write-only pio buffers for this process.
307 * Each buffer has sendpio_credits*64 bytes.
308 */
309 __u64 pio_bufbase_sop;
310 /*
311 * Base address of write-only pio buffers for this process.
312 * Each buffer has sendpio_credits*64 bytes.
313 */
314 __u64 pio_bufbase;
315 /* address where receive buffer queue is mapped into */
316 __u64 rcvhdr_bufbase;
317 /* base address of Eager receive buffers. */
318 __u64 rcvegr_bufbase;
319 /* base address of SDMA completion ring */
320 __u64 sdma_comp_bufbase;
321 /*
322 * User register base for init code, not to be used directly by
323 * protocol or applications. Always maps real chip register space.
324 * the register addresses are:
325 * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail,
326 * ur_rcvtidflow
327 */
328 __u64 user_regbase;
329 /* notification events */
330 __u64 events_bufbase;
331 /* status page */
332 __u64 status_bufbase;
333 /* rcvhdrtail update */
334 __u64 rcvhdrtail_base;
335 /*
336 * shared memory pages for subctxts if ctxt is shared; these cover
337 * all the processes in the group sharing a single context.
338 * all have enough space for the num_subcontexts value on this job.
339 */
340 __u64 subctxt_uregbase;
341 __u64 subctxt_rcvegrbuf;
342 __u64 subctxt_rcvhdrbuf;
343};
344
345enum sdma_req_opcode {
346 EXPECTED = 0,
347 EAGER
348};
349
350#define HFI1_SDMA_REQ_VERSION_MASK 0xF
351#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0
352#define HFI1_SDMA_REQ_OPCODE_MASK 0xF
353#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4
354#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF
355#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8
356
357struct sdma_req_info {
358 /*
359 * bits 0-3 - version (currently unused)
360 * bits 4-7 - opcode (enum sdma_req_opcode)
361 * bits 8-15 - io vector count
362 */
363 __u16 ctrl;
364 /*
365 * Number of fragments contained in this request.
366 * User-space has already computed how many
367 * fragment-sized packets the user buffer will be
368 * split into.
369 */
370 __u16 npkts;
371 /*
372 * Size of each fragment the user buffer will be
373 * split into.
374 */
375 __u16 fragsize;
376 /*
377 * Index of the slot in the SDMA completion ring
378 * this request should be using. User-space is
379 * in charge of managing its own ring.
380 */
381 __u16 comp_idx;
382} __packed;
383
384/*
385 * SW KDETH header.
386 * swdata is SW defined portion.
387 */
388struct hfi1_kdeth_header {
389 __le32 ver_tid_offset;
390 __le16 jkey;
391 __le16 hcrc;
392 __le32 swdata[7];
393} __packed;
394
395/*
396 * Structure describing the headers that User space uses. The
397 * structure above is a subset of this one.
398 */
399struct hfi1_pkt_header {
400 __le16 pbc[4];
401 __be16 lrh[4];
402 __be32 bth[3];
403 struct hfi1_kdeth_header kdeth;
404} __packed;
405
406
407/*
408 * The list of usermode accessible registers.
409 */
410enum hfi1_ureg {
411 /* (RO) DMA RcvHdr to be used next. */
412 ur_rcvhdrtail = 0,
413 /* (RW) RcvHdr entry to be processed next by host. */
414 ur_rcvhdrhead = 1,
415 /* (RO) Index of next Eager index to use. */
416 ur_rcvegrindextail = 2,
417 /* (RW) Eager TID to be processed next */
418 ur_rcvegrindexhead = 3,
419 /* (RO) Receive Eager Offset Tail */
420 ur_rcvegroffsettail = 4,
421 /* For internal use only; max register number. */
422 ur_maxreg,
423 /* (RW) Receive TID flow table */
424 ur_rcvtidflowtable = 256
425};
426
427#endif /* _LINUX__HFI1_USER_H */
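The sdma_req_info ctrl word packs three sub-fields with the mask/shift pairs defined above. A user-space sketch of assembling it, assuming the header has been installed as <rdma/hfi/hfi1_user.h> via headers_install:

#include <stdint.h>
#include <rdma/hfi/hfi1_user.h>

static uint16_t example_pack_sdma_ctrl(uint16_t version, uint16_t opcode,
					uint16_t iovcnt)
{
	/* bits 0-3 version, 4-7 opcode, 8-15 io vector count */
	return ((version & HFI1_SDMA_REQ_VERSION_MASK) <<
				HFI1_SDMA_REQ_VERSION_SHIFT) |
	       ((opcode & HFI1_SDMA_REQ_OPCODE_MASK) <<
				HFI1_SDMA_REQ_OPCODE_SHIFT) |
	       ((iovcnt & HFI1_SDMA_REQ_IOVCNT_MASK) <<
				HFI1_SDMA_REQ_IOVCNT_SHIFT);
}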
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 6e4bb4270ca2..c19a5dc1531a 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -7,12 +7,14 @@ enum {
7 RDMA_NL_RDMA_CM = 1, 7 RDMA_NL_RDMA_CM = 1,
8 RDMA_NL_NES, 8 RDMA_NL_NES,
9 RDMA_NL_C4IW, 9 RDMA_NL_C4IW,
10 RDMA_NL_LS, /* RDMA Local Services */
10 RDMA_NL_NUM_CLIENTS 11 RDMA_NL_NUM_CLIENTS
11}; 12};
12 13
13enum { 14enum {
14 RDMA_NL_GROUP_CM = 1, 15 RDMA_NL_GROUP_CM = 1,
15 RDMA_NL_GROUP_IWPM, 16 RDMA_NL_GROUP_IWPM,
17 RDMA_NL_GROUP_LS,
16 RDMA_NL_NUM_GROUPS 18 RDMA_NL_NUM_GROUPS
17}; 19};
18 20
@@ -128,5 +130,85 @@ enum {
128 IWPM_NLA_ERR_MAX 130 IWPM_NLA_ERR_MAX
129}; 131};
130 132
133/*
134 * Local service operations:
135 * RESOLVE - The client requests the local service to resolve a path.
136 * SET_TIMEOUT - The local service requests the client to set the timeout.
137 */
138enum {
139 RDMA_NL_LS_OP_RESOLVE = 0,
140 RDMA_NL_LS_OP_SET_TIMEOUT,
141 RDMA_NL_LS_NUM_OPS
142};
143
144/* Local service netlink message flags */
145#define RDMA_NL_LS_F_ERR 0x0100 /* Failed response */
146
147/*
148 * Local service resolve operation family header.
149 * The layout for the resolve operation:
150 * nlmsg header
151 * family header
152 * attributes
153 */
154
155/*
156 * Local service path use:
157 * Specify how the path(s) will be used.
158 * ALL - For connected CM operation (6 pathrecords)
159 * UNIDIRECTIONAL - For unidirectional UD (1 pathrecord)
160 * GMP - For miscellaneous GMP like operation (at least 1 reversible
161 * pathrecord)
162 */
163enum {
164 LS_RESOLVE_PATH_USE_ALL = 0,
165 LS_RESOLVE_PATH_USE_UNIDIRECTIONAL,
166 LS_RESOLVE_PATH_USE_GMP,
167 LS_RESOLVE_PATH_USE_MAX
168};
169
170#define LS_DEVICE_NAME_MAX 64
171
172struct rdma_ls_resolve_header {
173 __u8 device_name[LS_DEVICE_NAME_MAX];
174 __u8 port_num;
175 __u8 path_use;
176};
177
178/* Local service attribute type */
179#define RDMA_NLA_F_MANDATORY (1 << 13)
180#define RDMA_NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \
181 RDMA_NLA_F_MANDATORY))
182
183/*
184 * Local service attributes:
185 * Attr Name Size Byte order
186 * -----------------------------------------------------
187 * PATH_RECORD struct ib_path_rec_data
188 * TIMEOUT u32 cpu
189 * SERVICE_ID u64 cpu
190 * DGID u8[16] BE
191 * SGID u8[16] BE
192 * TCLASS u8
193 * PKEY u16 cpu
194 * QOS_CLASS u16 cpu
195 */
196enum {
197 LS_NLA_TYPE_UNSPEC = 0,
198 LS_NLA_TYPE_PATH_RECORD,
199 LS_NLA_TYPE_TIMEOUT,
200 LS_NLA_TYPE_SERVICE_ID,
201 LS_NLA_TYPE_DGID,
202 LS_NLA_TYPE_SGID,
203 LS_NLA_TYPE_TCLASS,
204 LS_NLA_TYPE_PKEY,
205 LS_NLA_TYPE_QOS_CLASS,
206 LS_NLA_TYPE_MAX
207};
208
209/* Local service DGID/SGID attribute: big endian */
210struct rdma_nla_ls_gid {
211 __u8 gid[16];
212};
131 213
132#endif /* _UAPI_RDMA_NETLINK_H */ 214#endif /* _UAPI_RDMA_NETLINK_H */
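A resolve request to the new RDMA_NL_LS client carries the family header defined above, and attribute types may set RDMA_NLA_F_MANDATORY. A user-space sketch of filling the header and splitting an attribute type (the device name and port are made-up values):

#include <string.h>
#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>

static void example_fill_resolve_hdr(struct rdma_ls_resolve_header *hdr)
{
	memset(hdr, 0, sizeof(*hdr));
	strncpy((char *)hdr->device_name, "mlx4_0", LS_DEVICE_NAME_MAX - 1);
	hdr->port_num = 1;			/* made-up port */
	hdr->path_use = LS_RESOLVE_PATH_USE_GMP;
}

static int example_attr_is_mandatory(unsigned short nla_type)
{
	return (nla_type & RDMA_NLA_F_MANDATORY) != 0;
}

static unsigned short example_attr_type(unsigned short nla_type)
{
	return nla_type & RDMA_NLA_TYPE_MASK;
}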
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 37a78d20c0f6..ba1210253f5e 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -94,8 +94,6 @@ struct p9_trans_rdma {
94 struct ib_pd *pd; 94 struct ib_pd *pd;
95 struct ib_qp *qp; 95 struct ib_qp *qp;
96 struct ib_cq *cq; 96 struct ib_cq *cq;
97 struct ib_mr *dma_mr;
98 u32 lkey;
99 long timeout; 97 long timeout;
100 int sq_depth; 98 int sq_depth;
101 struct semaphore sq_sem; 99 struct semaphore sq_sem;
@@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
382 if (!rdma) 380 if (!rdma)
383 return; 381 return;
384 382
385 if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
386 ib_dereg_mr(rdma->dma_mr);
387
388 if (rdma->qp && !IS_ERR(rdma->qp)) 383 if (rdma->qp && !IS_ERR(rdma->qp))
389 ib_destroy_qp(rdma->qp); 384 ib_destroy_qp(rdma->qp);
390 385
@@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
415 410
416 sge.addr = c->busa; 411 sge.addr = c->busa;
417 sge.length = client->msize; 412 sge.length = client->msize;
418 sge.lkey = rdma->lkey; 413 sge.lkey = rdma->pd->local_dma_lkey;
419 414
420 wr.next = NULL; 415 wr.next = NULL;
421 c->wc_op = IB_WC_RECV; 416 c->wc_op = IB_WC_RECV;
@@ -506,7 +501,7 @@ dont_need_post_recv:
506 501
507 sge.addr = c->busa; 502 sge.addr = c->busa;
508 sge.length = c->req->tc->size; 503 sge.length = c->req->tc->size;
509 sge.lkey = rdma->lkey; 504 sge.lkey = rdma->pd->local_dma_lkey;
510 505
511 wr.next = NULL; 506 wr.next = NULL;
512 c->wc_op = IB_WC_SEND; 507 c->wc_op = IB_WC_SEND;
@@ -647,7 +642,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
647 struct p9_trans_rdma *rdma; 642 struct p9_trans_rdma *rdma;
648 struct rdma_conn_param conn_param; 643 struct rdma_conn_param conn_param;
649 struct ib_qp_init_attr qp_attr; 644 struct ib_qp_init_attr qp_attr;
650 struct ib_device_attr devattr;
651 struct ib_cq_init_attr cq_attr = {}; 645 struct ib_cq_init_attr cq_attr = {};
652 646
653 /* Parse the transport specific mount options */ 647 /* Parse the transport specific mount options */
@@ -700,11 +694,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
700 if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) 694 if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
701 goto error; 695 goto error;
702 696
703 /* Query the device attributes */
704 err = ib_query_device(rdma->cm_id->device, &devattr);
705 if (err)
706 goto error;
707
708 /* Create the Completion Queue */ 697 /* Create the Completion Queue */
709 cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; 698 cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
710 rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, 699 rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
@@ -719,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
719 if (IS_ERR(rdma->pd)) 708 if (IS_ERR(rdma->pd))
720 goto error; 709 goto error;
721 710
722 /* Cache the DMA lkey in the transport */
723 rdma->dma_mr = NULL;
724 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
725 rdma->lkey = rdma->cm_id->device->local_dma_lkey;
726 else {
727 rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
728 if (IS_ERR(rdma->dma_mr))
729 goto error;
730 rdma->lkey = rdma->dma_mr->lkey;
731 }
732
733 /* Create the Queue Pair */ 711 /* Create the Queue Pair */
734 memset(&qp_attr, 0, sizeof qp_attr); 712 memset(&qp_attr, 0, sizeof qp_attr);
735 qp_attr.event_handler = qp_event_handler; 713 qp_attr.event_handler = qp_event_handler;
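The trans_rdma.c change above, like the RDS changes further down, replaces a cached ib_get_dma_mr() lkey with the protection domain's local_dma_lkey. A sketch of the resulting receive-posting pattern, with buffer DMA mapping assumed to be done elsewhere (the function name is illustrative):

#include <rdma/ib_verbs.h>

static int example_post_recv(struct ib_qp *qp, struct ib_pd *pd,
			     u64 dma_addr, u32 len)
{
	struct ib_sge sge = {
		.addr	= dma_addr,
		.length	= len,
		.lkey	= pd->local_dma_lkey,	/* no cached DMA MR needed */
	};
	struct ib_recv_wr wr = {
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ib_recv_wr *bad_wr;

	return ib_post_recv(qp, &wr, &bad_wr);
}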
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 99c0f2b843f0..030fefdc9aed 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1943,37 +1943,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
1943 __ipv6_dev_ac_dec(ifp->idev, &addr); 1943 __ipv6_dev_ac_dec(ifp->idev, &addr);
1944} 1944}
1945 1945
1946static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
1947{
1948 if (dev->addr_len != ETH_ALEN)
1949 return -1;
1950 memcpy(eui, dev->dev_addr, 3);
1951 memcpy(eui + 5, dev->dev_addr + 3, 3);
1952
1953 /*
1954 * The zSeries OSA network cards can be shared among various
1955 * OS instances, but the OSA cards have only one MAC address.
1956 * This leads to duplicate address conflicts in conjunction
1957 * with IPv6 if more than one instance uses the same card.
1958 *
1959 * The driver for these cards can deliver a unique 16-bit
1960 * identifier for each instance sharing the same card. It is
1961 * placed instead of 0xFFFE in the interface identifier. The
1962 * "u" bit of the interface identifier is not inverted in this
1963 * case. Hence the resulting interface identifier has local
1964 * scope according to RFC2373.
1965 */
1966 if (dev->dev_id) {
1967 eui[3] = (dev->dev_id >> 8) & 0xFF;
1968 eui[4] = dev->dev_id & 0xFF;
1969 } else {
1970 eui[3] = 0xFF;
1971 eui[4] = 0xFE;
1972 eui[0] ^= 2;
1973 }
1974 return 0;
1975}
1976
1977static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) 1946static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
1978{ 1947{
1979 if (dev->addr_len != IEEE802154_ADDR_LEN) 1948 if (dev->addr_len != IEEE802154_ADDR_LEN)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index d020fade312c..2d3f2ab475df 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -99,8 +99,6 @@ static void rds_ib_dev_free(struct work_struct *work)
99 99
100 if (rds_ibdev->mr_pool) 100 if (rds_ibdev->mr_pool)
101 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); 101 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
102 if (rds_ibdev->mr)
103 ib_dereg_mr(rds_ibdev->mr);
104 if (rds_ibdev->pd) 102 if (rds_ibdev->pd)
105 ib_dealloc_pd(rds_ibdev->pd); 103 ib_dealloc_pd(rds_ibdev->pd);
106 104
@@ -164,12 +162,6 @@ static void rds_ib_add_one(struct ib_device *device)
164 goto put_dev; 162 goto put_dev;
165 } 163 }
166 164
167 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
168 if (IS_ERR(rds_ibdev->mr)) {
169 rds_ibdev->mr = NULL;
170 goto put_dev;
171 }
172
173 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 165 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
174 if (IS_ERR(rds_ibdev->mr_pool)) { 166 if (IS_ERR(rds_ibdev->mr_pool)) {
175 rds_ibdev->mr_pool = NULL; 167 rds_ibdev->mr_pool = NULL;
@@ -230,11 +222,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
230 * 222 *
231 * This can be called at any time and can be racing with any other RDS path. 223 * This can be called at any time and can be racing with any other RDS path.
232 */ 224 */
233static void rds_ib_remove_one(struct ib_device *device) 225static void rds_ib_remove_one(struct ib_device *device, void *client_data)
234{ 226{
235 struct rds_ib_device *rds_ibdev; 227 struct rds_ib_device *rds_ibdev = client_data;
236 228
237 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
238 if (!rds_ibdev) 229 if (!rds_ibdev)
239 return; 230 return;
240 231
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 9fc95e38659a..aae60fda77f6 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -100,7 +100,6 @@ struct rds_ib_connection {
100 /* alphabet soup, IBTA style */ 100 /* alphabet soup, IBTA style */
101 struct rdma_cm_id *i_cm_id; 101 struct rdma_cm_id *i_cm_id;
102 struct ib_pd *i_pd; 102 struct ib_pd *i_pd;
103 struct ib_mr *i_mr;
104 struct ib_cq *i_send_cq; 103 struct ib_cq *i_send_cq;
105 struct ib_cq *i_recv_cq; 104 struct ib_cq *i_recv_cq;
106 105
@@ -173,7 +172,6 @@ struct rds_ib_device {
173 struct list_head conn_list; 172 struct list_head conn_list;
174 struct ib_device *dev; 173 struct ib_device *dev;
175 struct ib_pd *pd; 174 struct ib_pd *pd;
176 struct ib_mr *mr;
177 struct rds_ib_mr_pool *mr_pool; 175 struct rds_ib_mr_pool *mr_pool;
178 unsigned int fmr_max_remaps; 176 unsigned int fmr_max_remaps;
179 unsigned int max_fmrs; 177 unsigned int max_fmrs;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index d150bb4aa3cb..9043f5c04787 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -269,7 +269,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
269 269
270 /* Protection domain and memory range */ 270 /* Protection domain and memory range */
271 ic->i_pd = rds_ibdev->pd; 271 ic->i_pd = rds_ibdev->pd;
272 ic->i_mr = rds_ibdev->mr;
273 272
274 cq_attr.cqe = ic->i_send_ring.w_nr + 1; 273 cq_attr.cqe = ic->i_send_ring.w_nr + 1;
275 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, 274 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
@@ -375,7 +374,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
375 374
376 rds_ib_recv_init_ack(ic); 375 rds_ib_recv_init_ack(ic);
377 376
378 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, 377 rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
379 ic->i_send_cq, ic->i_recv_cq); 378 ic->i_send_cq, ic->i_recv_cq);
380 379
381out: 380out:
@@ -682,7 +681,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
682 681
683 ic->i_cm_id = NULL; 682 ic->i_cm_id = NULL;
684 ic->i_pd = NULL; 683 ic->i_pd = NULL;
685 ic->i_mr = NULL;
686 ic->i_send_cq = NULL; 684 ic->i_send_cq = NULL;
687 ic->i_recv_cq = NULL; 685 ic->i_recv_cq = NULL;
688 ic->i_send_hdrs = NULL; 686 ic->i_send_hdrs = NULL;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 6bbe62060060..f43831e4186a 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
62 sge = &recv->r_sge[0]; 62 sge = &recv->r_sge[0];
63 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); 63 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
64 sge->length = sizeof(struct rds_header); 64 sge->length = sizeof(struct rds_header);
65 sge->lkey = ic->i_mr->lkey; 65 sge->lkey = ic->i_pd->local_dma_lkey;
66 66
67 sge = &recv->r_sge[1]; 67 sge = &recv->r_sge[1];
68 sge->addr = 0; 68 sge->addr = 0;
69 sge->length = RDS_FRAG_SIZE; 69 sge->length = RDS_FRAG_SIZE;
70 sge->lkey = ic->i_mr->lkey; 70 sge->lkey = ic->i_pd->local_dma_lkey;
71 } 71 }
72} 72}
73 73
@@ -564,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
564 564
565 sge->addr = ic->i_ack_dma; 565 sge->addr = ic->i_ack_dma;
566 sge->length = sizeof(struct rds_header); 566 sge->length = sizeof(struct rds_header);
567 sge->lkey = ic->i_mr->lkey; 567 sge->lkey = ic->i_pd->local_dma_lkey;
568 568
569 wr->sg_list = sge; 569 wr->sg_list = sge;
570 wr->num_sge = 1; 570 wr->num_sge = 1;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index c576ebeb4115..4e88047086b6 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -202,9 +202,9 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
202 sge = &send->s_sge[0]; 202 sge = &send->s_sge[0];
203 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); 203 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
204 sge->length = sizeof(struct rds_header); 204 sge->length = sizeof(struct rds_header);
205 sge->lkey = ic->i_mr->lkey; 205 sge->lkey = ic->i_pd->local_dma_lkey;
206 206
207 send->s_sge[1].lkey = ic->i_mr->lkey; 207 send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
208 } 208 }
209} 209}
210 210
@@ -818,7 +818,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
818 /* Convert our struct scatterlist to struct ib_sge */ 818 /* Convert our struct scatterlist to struct ib_sge */
819 send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); 819 send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
820 send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); 820 send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
821 send->s_sge[0].lkey = ic->i_mr->lkey; 821 send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
822 822
823 rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, 823 rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
824 send->s_sge[0].addr, send->s_sge[0].length); 824 send->s_sge[0].addr, send->s_sge[0].length);
@@ -932,7 +932,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
932 send->s_sge[j].addr = 932 send->s_sge[j].addr =
933 ib_sg_dma_address(ic->i_cm_id->device, scat); 933 ib_sg_dma_address(ic->i_cm_id->device, scat);
934 send->s_sge[j].length = len; 934 send->s_sge[j].length = len;
935 send->s_sge[j].lkey = ic->i_mr->lkey; 935 send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
936 936
937 sent += len; 937 sent += len;
938 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); 938 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 5d5a9d258658..3df0295c6659 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -125,12 +125,11 @@ free_attr:
 	kfree(dev_attr);
 }
 
-static void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device, void *client_data)
 {
-	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_device *rds_iwdev = client_data;
 	struct rds_iw_cm_id *i_cm_id, *next;
 
-	rds_iwdev = ib_get_client_data(device, &rds_iw_client);
 	if (!rds_iwdev)
 		return;
 
@@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device)
 	if (rds_iwdev->mr)
 		ib_dereg_mr(rds_iwdev->mr);
 
-	while (ib_dealloc_pd(rds_iwdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
-		msleep(1);
-	}
+	ib_dealloc_pd(rds_iwdev->pd);
 
 	list_del(&rds_iwdev->list);
 	kfree(rds_iwdev);
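
Two core API changes drive the net/rds/iw.c hunks: a client's remove callback now receives its per-device context as client_data, and ib_dealloc_pd() returns void, so the retry loop goes away. A rough sketch of the resulting client shape, using a hypothetical "example" client (all identifiers here are illustrative):

#include <linux/slab.h>
#include <linux/err.h>
#include <rdma/ib_verbs.h>

struct example_dev {
	struct ib_pd *pd;
};

static struct ib_client example_client;

static void example_add_one(struct ib_device *device)
{
	struct example_dev *edev = kzalloc(sizeof(*edev), GFP_KERNEL);

	if (!edev)
		return;
	edev->pd = ib_alloc_pd(device);
	if (IS_ERR(edev->pd)) {
		kfree(edev);
		return;
	}
	/* Stash the context; the core hands it back to .remove. */
	ib_set_client_data(device, &example_client, edev);
}

static void example_remove_one(struct ib_device *device, void *client_data)
{
	struct example_dev *edev = client_data;	/* no ib_get_client_data() */

	if (!edev)
		return;
	ib_dealloc_pd(edev->pd);	/* now returns void; no retry loop */
	kfree(edev);
}

static struct ib_client example_client = {
	.name   = "example",
	.add    = example_add_one,
	.remove = example_remove_one,
};
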
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index dba8d0864f18..6a8fbd6e69e7 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -667,11 +667,12 @@ static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
 	struct ib_mr *mr;
 	int err;
 
-	mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+	mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
+			 pool->max_message_size);
 	if (IS_ERR(mr)) {
 		err = PTR_ERR(mr);
 
-		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+		printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
 		return err;
 	}
 
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 334fe98c5084..86152ec3b887 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -153,9 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic)
 		sge->length = sizeof(struct rds_header);
 		sge->lkey = 0;
 
-		send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+		send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
+					 fastreg_message_size);
 		if (IS_ERR(send->s_mr)) {
-			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+			printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
 			break;
 		}
 
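
The conversions in iw_rdma.c and iw_send.c (and in the xprtrdma hunks below) all follow the same mr_alloc change: ib_alloc_fast_reg_mr(pd, n) becomes ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, n). A minimal sketch of the new allocation call; example_alloc_frmr and max_sg are illustrative names:

#include <linux/err.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

/*
 * Sketch: allocate a fast-registration MR with the generalized verb.
 * 'pd' and 'max_sg' are assumed to come from the transport's setup code.
 */
static struct ib_mr *example_alloc_frmr(struct ib_pd *pd, u32 max_sg)
{
	struct ib_mr *mr;

	/* Previously: mr = ib_alloc_fast_reg_mr(pd, max_sg); */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_sg);
	if (IS_ERR(mr))
		pr_warn("example: ib_alloc_mr failed (err=%ld)\n", PTR_ERR(mr));
	return mr;
}
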
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 63f282e770b8..d6653f5d0830 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -117,7 +117,7 @@ __frwr_recovery_worker(struct work_struct *work)
 	if (ib_dereg_mr(r->r.frmr.fr_mr))
 		goto out_fail;
 
-	r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
 	if (IS_ERR(r->r.frmr.fr_mr))
 		goto out_fail;
 
@@ -148,7 +148,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 	struct rpcrdma_frmr *f = &r->r.frmr;
 	int rc;
 
-	f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
 	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
@@ -158,7 +158,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 
 out_mr_err:
 	rc = PTR_ERR(f->fr_mr);
-	dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
+	dprintk("RPC: %s: ib_alloc_mr status %i\n",
 		__func__, rc);
 	return rc;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2e1348bde325..cb5174284074 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
-	if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device,
-				     xprt->sc_cm_id->port_num))
-		return 1;
-	else
-		return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
 int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 			struct svc_rqst *rqstp,
@@ -144,8 +135,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 
 	ctxt->direction = DMA_FROM_DEVICE;
 	ctxt->read_hdr = head;
-	pages_needed =
-		min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+	pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
 	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
 
 	for (pno = 0; pno < pages_needed; pno++) {
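
svcrdma no longer special-cases iWARP through rdma_read_max_sge(); the RDMA_READ size is simply clamped by sc_max_sge_rd, the read SGE limit cached from the device at accept time (see the svc_rdma_transport.c hunks below). A minimal sketch of the clamp, with illustrative names:

#include <linux/kernel.h>

/*
 * Sketch: cap the pages mapped into one RDMA_READ by the device's read
 * scatter/gather limit. 'max_sge_rd' stands in for the value the
 * transport caches from ib_device_attr.max_sge_rd.
 */
static int example_read_pages(int pages_needed, int max_sge_rd)
{
	return min_t(int, pages_needed, max_sge_rd);
}
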
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 21e40365042c..fcc3eb80c265 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -734,17 +734,19 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
 	struct ib_mr *mr;
 	struct ib_fast_reg_page_list *pl;
 	struct svc_rdma_fastreg_mr *frmr;
+	u32 num_sg;
 
 	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
 	if (!frmr)
 		goto err;
 
-	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
+	mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
 	if (IS_ERR(mr))
 		goto err_free_frmr;
 
 	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
-					 RPCSVC_MAXPAGES);
+					 num_sg);
 	if (IS_ERR(pl))
 		goto err_free_mr;
 
@@ -873,6 +875,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	 * capabilities of this particular device */
 	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
 				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+				       RPCSVC_MAXPAGES);
 	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
 				       (size_t)svcrdma_max_requests);
 	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -1047,6 +1051,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1047 " remote_ip : %pI4\n" 1051 " remote_ip : %pI4\n"
1048 " remote_port : %d\n" 1052 " remote_port : %d\n"
1049 " max_sge : %d\n" 1053 " max_sge : %d\n"
1054 " max_sge_rd : %d\n"
1050 " sq_depth : %d\n" 1055 " sq_depth : %d\n"
1051 " max_requests : %d\n" 1056 " max_requests : %d\n"
1052 " ord : %d\n", 1057 " ord : %d\n",
@@ -1060,6 +1065,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
 		       route.addr.dst_addr)->sin_port),
 		newxprt->sc_max_sge,
+		newxprt->sc_max_sge_rd,
 		newxprt->sc_sq_depth,
 		newxprt->sc_max_requests,
 		newxprt->sc_ord);
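
The transport now records both sc_max_sge and the new sc_max_sge_rd from the queried device attributes. A rough sketch of deriving such limits, assuming a hypothetical helper and page cap (EXAMPLE_MAX_PAGES stands in for RPCSVC_MAXPAGES; identifiers are illustrative):

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

#define EXAMPLE_MAX_PAGES 256	/* stand-in for the transport's page limit */

/*
 * Sketch: derive per-transport send and read SGE limits from the device
 * attributes, mirroring the svc_rdma_accept() change above.
 */
static int example_query_sge_limits(struct ib_device *dev,
				    size_t *max_sge, size_t *max_sge_rd)
{
	struct ib_device_attr attr;
	int ret;

	ret = ib_query_device(dev, &attr);
	if (ret)
		return ret;

	*max_sge = min_t(size_t, attr.max_sge, EXAMPLE_MAX_PAGES);
	*max_sge_rd = min_t(size_t, attr.max_sge_rd, EXAMPLE_MAX_PAGES);
	return 0;
}
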
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f73d7a71035c..682996779970 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -611,7 +611,7 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 
 	/* If the pd is still busy, xprtrdma missed freeing a resource */
 	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
-		WARN_ON(ib_dealloc_pd(ia->ri_pd));
+		ib_dealloc_pd(ia->ri_pd);
 }
 
 /*