From e1d60ec6699f19b760df8261e922ae236ea7bb31 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 30 Mar 2009 08:31:05 -0700 Subject: IB/mlx4: Use pgprot_writecombine() for BlueFlame pages The PAT work on x86 has finally made pgprot_writecombine() a usable API for modular drivers. As the comment indicates, this is exactly what we want to use in mlx4_ib to map BlueFlame pages up to userspace, since using WC for these pages improves small message latency significantly. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 2ccb9d31771f..ae3d7590346e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -394,8 +394,7 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { - /* FIXME want pgprot_writecombine() for BlueFlame pages */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + -- cgit v1.2.2 From 04b5d028f50ff05a8f9ae049ee71f8fdfcf1f5de Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 30 Mar 2009 08:37:56 -0700 Subject: RDMA/cxgb3: Handle EEH events - wrap calls into cxgb3 and fail them if we're in the middle of a PCI EEH event. - correctly unwind and release endpoint and other resources when we are in an EEH event. - dispatch IB_EVENT_DEVICE_FATAL event when cxgb3 notifies iw_cxgb3 of a fatal error. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 10 ++-- drivers/infiniband/hw/cxgb3/cxio_hal.h | 6 +++ drivers/infiniband/hw/cxgb3/iwch.c | 11 ++++- drivers/infiniband/hw/cxgb3/iwch.h | 5 ++ drivers/infiniband/hw/cxgb3/iwch_cm.c | 90 ++++++++++++++++++++++++---------- drivers/infiniband/hw/cxgb3/iwch_qp.c | 4 +- 6 files changed, 92 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index a4a82bff7100..8d71086f5a1c 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -152,7 +152,7 @@ static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) sge_cmd = qpid << 8 | 3; wqe->sge_cmd = cpu_to_be64(sge_cmd); skb->priority = CPL_PRIORITY_CONTROL; - return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); } int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) @@ -571,7 +571,7 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) (unsigned long long) rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); skb->priority = CPL_PRIORITY_CONTROL; - return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); err: kfree_skb(skb); return err; @@ -701,7 +701,7 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, u32 stag_idx; u32 wptr; - if (rdev_p->flags) + if (cxio_fatal_error(rdev_p)) return -EIO; stag_state = stag_state > 0; @@ -858,7 +858,7 @@ int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); wqe->irs = cpu_to_be32(attr->irs); skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ - return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); } void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) @@ -1041,9 +1041,9 @@ void cxio_rdev_close(struct cxio_rdev *rdev_p) cxio_hal_pblpool_destroy(rdev_p); cxio_hal_rqtpool_destroy(rdev_p); list_del(&rdev_p->entry); - rdev_p->t3cdev_p->ulp = NULL; cxio_hal_destroy_ctrl_qp(rdev_p); cxio_hal_destroy_resource(rdev_p->rscp); + rdev_p->t3cdev_p->ulp = NULL; } } diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 094a66d1480c..bfd03bf8be54 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -115,6 +115,11 @@ struct cxio_rdev { #define CXIO_ERROR_FATAL 1 }; +static inline int cxio_fatal_error(struct cxio_rdev *rdev_p) +{ + return rdev_p->flags & CXIO_ERROR_FATAL; +} + static inline int cxio_num_stags(struct cxio_rdev *rdev_p) { return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); @@ -188,6 +193,7 @@ void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_flush_hw_cq(struct t3_cq *cq); int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, u8 *cqe_flushed, u64 *cookie, u32 *credit); +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb); #define MOD "iw_cxgb3: " #define PDBG(fmt, args...) pr_debug(MOD fmt, ## args) diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 37a4fc264a07..26fc0a4eaa74 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -165,12 +165,19 @@ static void close_rnic_dev(struct t3cdev *tdev) static void iwch_err_handler(struct t3cdev *tdev, u32 status, u32 error) { struct cxio_rdev *rdev = tdev->ulp; + struct iwch_dev *rnicp = rdev_to_iwch_dev(rdev); + struct ib_event event; - if (status == OFFLOAD_STATUS_DOWN) + if (status == OFFLOAD_STATUS_DOWN) { rdev->flags = CXIO_ERROR_FATAL; - return; + event.device = &rnicp->ibdev; + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + ib_dispatch_event(&event); + } + return; } static int __init iwch_init_module(void) diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h index 3773453b2cf0..84735506333f 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -117,6 +117,11 @@ static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) return container_of(ibdev, struct iwch_dev, ibdev); } +static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev) +{ + return container_of(rdev, struct iwch_dev, rdev); +} + static inline int t3b_device(const struct iwch_dev *rhp) { return rhp->rdev.t3cdev_p->type == T3B; diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 8699947aaf6c..59e1c5f00785 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -139,6 +139,38 @@ static void stop_ep_timer(struct iwch_ep *ep) put_ep(&ep->com); } +int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = l2t_send(tdev, skb, l2e); + if (error) + kfree_skb(skb); + return error; +} + +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = cxgb3_ofld_send(tdev, skb); + if (error) + kfree_skb(skb); + return error; +} + static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) { struct cpl_tid_release *req; @@ -150,7 +182,7 @@ static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); skb->priority = CPL_PRIORITY_SETUP; - cxgb3_ofld_send(tdev, skb); + iwch_cxgb3_ofld_send(tdev, skb); return; } @@ -172,8 +204,7 @@ int iwch_quiesce_tid(struct iwch_ep *ep) req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); skb->priority = CPL_PRIORITY_DATA; - cxgb3_ofld_send(ep->com.tdev, skb); - return 0; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); } int iwch_resume_tid(struct iwch_ep *ep) @@ -194,8 +225,7 @@ int iwch_resume_tid(struct iwch_ep *ep) req->val = 0; skb->priority = CPL_PRIORITY_DATA; - cxgb3_ofld_send(ep->com.tdev, skb); - return 0; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); } static void set_emss(struct iwch_ep *ep, u16 opt) @@ -382,7 +412,7 @@ static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) PDBG("%s t3cdev %p\n", __func__, dev); req->cmd = CPL_ABORT_NO_RST; - cxgb3_ofld_send(dev, skb); + iwch_cxgb3_ofld_send(dev, skb); } static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) @@ -402,8 +432,7 @@ static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); - l2t_send(ep->com.tdev, skb, ep->l2t); - return 0; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); } static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) @@ -424,8 +453,7 @@ static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); req->cmd = CPL_ABORT_SEND_RST; - l2t_send(ep->com.tdev, skb, ep->l2t); - return 0; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); } static int send_connect(struct iwch_ep *ep) @@ -469,8 +497,7 @@ static int send_connect(struct iwch_ep *ep) req->opt0l = htonl(opt0l); req->params = 0; req->opt2 = htonl(opt2); - l2t_send(ep->com.tdev, skb, ep->l2t); - return 0; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); } static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) @@ -527,7 +554,7 @@ static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) req->sndseq = htonl(ep->snd_seq); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; - l2t_send(ep->com.tdev, skb, ep->l2t); + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); start_ep_timer(ep); state_set(&ep->com, MPA_REQ_SENT); return; @@ -578,8 +605,7 @@ static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) req->sndseq = htonl(ep->snd_seq); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; - l2t_send(ep->com.tdev, skb, ep->l2t); - return 0; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); } static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) @@ -630,8 +656,7 @@ static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) req->sndseq = htonl(ep->snd_seq); ep->mpa_skb = skb; state_set(&ep->com, MPA_REP_SENT); - l2t_send(ep->com.tdev, skb, ep->l2t); - return 0; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); } static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) @@ -795,7 +820,7 @@ static int update_rx_credits(struct iwch_ep *ep, u32 credits) OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); skb->priority = CPL_PRIORITY_ACK; - cxgb3_ofld_send(ep->com.tdev, skb); + iwch_cxgb3_ofld_send(ep->com.tdev, skb); return credits; } @@ -1203,8 +1228,7 @@ static int listen_start(struct iwch_listen_ep *ep) req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); skb->priority = 1; - cxgb3_ofld_send(ep->com.tdev, skb); - return 0; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); } static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) @@ -1237,8 +1261,7 @@ static int listen_stop(struct iwch_listen_ep *ep) req->cpu_idx = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); skb->priority = 1; - cxgb3_ofld_send(ep->com.tdev, skb); - return 0; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); } static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, @@ -1286,7 +1309,7 @@ static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) rpl->opt2 = htonl(opt2); rpl->rsvd = rpl->opt2; /* workaround for HW bug */ skb->priority = CPL_PRIORITY_SETUP; - l2t_send(ep->com.tdev, skb, ep->l2t); + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); return; } @@ -1315,7 +1338,7 @@ static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); rpl->opt2 = 0; rpl->rsvd = rpl->opt2; - cxgb3_ofld_send(tdev, skb); + iwch_cxgb3_ofld_send(tdev, skb); } } @@ -1613,7 +1636,7 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); rpl->cmd = CPL_ABORT_NO_RST; - cxgb3_ofld_send(ep->com.tdev, rpl_skb); + iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb); out: if (release) release_ep_resources(ep); @@ -2017,8 +2040,11 @@ int iwch_destroy_listen(struct iw_cm_id *cm_id) ep->com.rpl_done = 0; ep->com.rpl_err = 0; err = listen_stop(ep); + if (err) + goto done; wait_event(ep->com.waitq, ep->com.rpl_done); cxgb3_free_stid(ep->com.tdev, ep->stid); +done: err = ep->com.rpl_err; cm_id->rem_ref(cm_id); put_ep(&ep->com); @@ -2030,12 +2056,22 @@ int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) int ret=0; unsigned long flags; int close = 0; + int fatal = 0; + struct t3cdev *tdev; + struct cxio_rdev *rdev; spin_lock_irqsave(&ep->com.lock, flags); PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); + tdev = (struct t3cdev *)ep->com.tdev; + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep); + ep->com.state = DEAD; + } switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: @@ -2075,7 +2111,11 @@ int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) ret = send_abort(ep, NULL, gfp); else ret = send_halfclose(ep, gfp); + if (ret) + fatal = 1; } + if (fatal) + release_ep_resources(ep); return ret; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index c758fbd58478..2f546a625330 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -751,7 +751,7 @@ int iwch_post_zb_read(struct iwch_qp *qhp) wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)| V_FW_RIWR_LEN(flit_cnt)); skb->priority = CPL_PRIORITY_DATA; - return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); + return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); } /* @@ -783,7 +783,7 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)); skb->priority = CPL_PRIORITY_DATA; - return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); + return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); } /* -- cgit v1.2.2 From 874d8df5ed6e36fed07b524c266f6a96dd6d10d9 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 30 Mar 2009 08:37:59 -0700 Subject: RDMA/cxgb3: Release dependent resources only when endpoint memory is freed. The cxgb3 l2t entry, hwtid, and dst entry were being released before all the iwch_ep references were released. This can cause a crash in t3_l2t_send_slow() and other places where the l2t entry is used. The fix is to defer releasing these resources until all endpoint references are gone. Details: - move flags field to the iwch_ep_common struct. - add a flag indicating resources are to be released. - release resources at endpoint free time instead of close/abort time. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_cm.c | 26 +++++++++++++++----------- drivers/infiniband/hw/cxgb3/iwch_cm.h | 3 ++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 59e1c5f00785..fef3f1ae7225 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -282,18 +282,22 @@ static void *alloc_ep(int size, gfp_t gfp) void __free_ep(struct kref *kref) { - struct iwch_ep_common *epc; - epc = container_of(kref, struct iwch_ep_common, kref); - PDBG("%s ep %p state %s\n", __func__, epc, states[state_read(epc)]); - kfree(epc); + struct iwch_ep *ep; + ep = container_of(container_of(kref, struct iwch_ep_common, kref), + struct iwch_ep, com); + PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (ep->com.flags & RELEASE_RESOURCES) { + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + } + kfree(ep); } static void release_ep_resources(struct iwch_ep *ep) { PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); - cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); - dst_release(ep->dst); - l2t_release(L2DATA(ep->com.tdev), ep->l2t); + ep->com.flags |= RELEASE_RESOURCES; put_ep(&ep->com); } @@ -1152,8 +1156,8 @@ static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) * We get 2 abort replies from the HW. The first one must * be ignored except for scribbling that we need one more. */ - if (!(ep->flags & ABORT_REQ_IN_PROGRESS)) { - ep->flags |= ABORT_REQ_IN_PROGRESS; + if (!(ep->com.flags & ABORT_REQ_IN_PROGRESS)) { + ep->com.flags |= ABORT_REQ_IN_PROGRESS; return CPL_RET_BUF_DONE; } @@ -1557,8 +1561,8 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) * We get 2 peer aborts from the HW. The first one must * be ignored except for scribbling that we need one more. */ - if (!(ep->flags & PEER_ABORT_IN_PROGRESS)) { - ep->flags |= PEER_ABORT_IN_PROGRESS; + if (!(ep->com.flags & PEER_ABORT_IN_PROGRESS)) { + ep->com.flags |= PEER_ABORT_IN_PROGRESS; return CPL_RET_BUF_DONE; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h index d7c7e09f0996..43c0aea7eadc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -147,6 +147,7 @@ enum iwch_ep_state { enum iwch_ep_flags { PEER_ABORT_IN_PROGRESS = (1 << 0), ABORT_REQ_IN_PROGRESS = (1 << 1), + RELEASE_RESOURCES = (1 << 2), }; struct iwch_ep_common { @@ -161,6 +162,7 @@ struct iwch_ep_common { wait_queue_head_t waitq; int rpl_done; int rpl_err; + u32 flags; }; struct iwch_listen_ep { @@ -188,7 +190,6 @@ struct iwch_ep { u16 plen; u32 ird; u32 ord; - u32 flags; }; static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) -- cgit v1.2.2 From 352b09edd7fa8145bfc9e5db0cc0fed971b69440 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 31 Mar 2009 09:54:15 -0700 Subject: mlx4_core: Don't leak mailbox for SET_PORT on Ethernet ports Commit 793730bf ("mlx4_core: Don't perform SET_PORT command for Ethernet ports") introduced a leak of mailbox buffers when SET_PORT was called for Ethernet ports, since it added a return after the mailbox was allocated. Fix this by checking the port type and returning *before* allocating the mailbox. Signed-off-by: Roland Dreier --- drivers/net/mlx4/port.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 7cce3342ef8c..606aa58afdea 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -299,13 +299,14 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) struct mlx4_cmd_mailbox *mailbox; int err; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + return 0; + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - return 0; ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, -- cgit v1.2.2 From edb5abb1e2a84fd8802a3577d95eac84fe1405ab Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 31 Mar 2009 10:22:32 -0700 Subject: IPoIB: Avoid free_netdev() BUG when destroying a child interface We have to release the RTNL before calling free_netdev() so that the device state has a chance to become NETREG_UNREGISTERED. Otherwise when removing a child interface, we hit the BUG() that tests the device state in free_netdev(). Reported-by: Yossi Etigin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 5a76a5510350..4c57f329dd50 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -70,12 +70,14 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) */ if (ppriv->pkey == pkey) { result = -ENOTUNIQ; + priv = NULL; goto err; } list_for_each_entry(priv, &ppriv->child_intfs, list) { if (priv->pkey == pkey) { result = -ENOTUNIQ; + priv = NULL; goto err; } } @@ -96,7 +98,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = ipoib_set_dev_features(priv, ppriv->ca); if (result) - goto device_init_failed; + goto err; priv->pkey = pkey; @@ -109,7 +111,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_warn(ppriv, "failed to initialize subinterface: " "device %s, port %d", ppriv->ca->name, ppriv->port); - goto device_init_failed; + goto err; } result = register_netdevice(priv->dev); @@ -146,19 +148,19 @@ sysfs_failed: register_failed: ipoib_dev_cleanup(priv->dev); -device_init_failed: - free_netdev(priv->dev); - err: mutex_unlock(&ppriv->vlan_mutex); rtnl_unlock(); + if (priv) + free_netdev(priv->dev); + return result; } int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - int ret = -ENOENT; + struct net_device *dev = NULL; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -172,14 +174,17 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) unregister_netdevice(priv->dev); ipoib_dev_cleanup(priv->dev); list_del(&priv->list); - free_netdev(priv->dev); - - ret = 0; + dev = priv->dev; break; } } mutex_unlock(&ppriv->vlan_mutex); rtnl_unlock(); - return ret; + if (dev) { + free_netdev(dev); + return 0; + } + + return -ENODEV; } -- cgit v1.2.2 From 84adeee9aaa0d81712de1e0ea74caed3398e4a1d Mon Sep 17 00:00:00 2001 From: Yossi Etigin Date: Wed, 1 Apr 2009 13:55:32 -0700 Subject: RDMA/cma: Use rate from IPoIB broadcast when joining IPoIB multicast groups When joining an IPoIB multicast group, use the same rate as in the broadcast group. Otherwise, if the RDMA CM creates this group before IPoIB does, it might get a different rate. This will cause IPoIB to fail joining to the same group later on, because IPoIB uses strict rate selection. Signed-off-by: Yossi Etigin Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2a2e50871b40..3f9c03a36571 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2713,6 +2713,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + if (id_priv->id.ps == RDMA_PS_IPOIB) + comp_mask |= IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_RATE_SELECTOR; + mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, -- cgit v1.2.2 From d2ca39f262806aa2f035f680a14aa55ff9e3d889 Mon Sep 17 00:00:00 2001 From: Yossi Etigin Date: Wed, 8 Apr 2009 13:42:33 -0700 Subject: RDMA/cma: Create cm id even when IB port is down When doing rdma_resolve_addr(), if the relevant IB port is down, the function fails and the cm_id is not bound to the correct device. Therefore, application does not have a device handle and cannot wait for the port to become active. The function fails because the underlying IPoIB interface is not joined to the broadcast group and therefore the SA does not have a multicast record to take a Q_Key from. The fix is to use lazy Q_Key resolution - cma_set_qkey() will set id_priv->qkey if it was not set, and will be called just before the Q_Key is really required. Signed-off-by: Yossi Etigin Acked-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 3f9c03a36571..851de83ff455 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -297,21 +297,25 @@ static void cma_detach_from_dev(struct rdma_id_private *id_priv) id_priv->cma_dev = NULL; } -static int cma_set_qkey(struct ib_device *device, u8 port_num, - enum rdma_port_space ps, - struct rdma_dev_addr *dev_addr, u32 *qkey) +static int cma_set_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - switch (ps) { + if (id_priv->qkey) + return 0; + + switch (id_priv->id.ps) { case RDMA_PS_UDP: - *qkey = RDMA_UDP_QKEY; + id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: - ib_addr_get_mgid(dev_addr, &rec.mgid); - ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec); - *qkey = be32_to_cpu(rec.qkey); + ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, &rec.mgid, + &rec); + if (!ret) + id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; @@ -341,12 +345,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); if (!ret) { - ret = cma_set_qkey(cma_dev->device, - id_priv->id.port_num, - id_priv->id.ps, dev_addr, - &id_priv->qkey); - if (!ret) - cma_attach_to_dev(id_priv, cma_dev); + cma_attach_to_dev(id_priv, cma_dev); break; } } @@ -578,6 +577,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (cma_is_ud_ps(id_priv->id.ps)) { + ret = cma_set_qkey(id_priv); + if (ret) + return ret; + qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { @@ -2201,6 +2204,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ib_event->param.sidr_rep_rcvd.status; break; } + ret = cma_set_qkey(id_priv); + if (ret) { + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = -EINVAL; + break; + } if (id_priv->qkey != rep->qkey) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -EINVAL; @@ -2480,10 +2489,14 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; + int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { + ret = cma_set_qkey(id_priv); + if (ret) + return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } -- cgit v1.2.2 From 6a3335b43342b42dd6c69b4bbbde15d622cb49ca Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 8 Apr 2009 13:52:01 -0700 Subject: IPoIB: Document newish features Update the documentation to include connected mode, stateless offloads and interrupt moderation, and add a reference to the connected mode RFC. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- Documentation/infiniband/ipoib.txt | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt index 864ff3283780..6d40f00b358c 100644 --- a/Documentation/infiniband/ipoib.txt +++ b/Documentation/infiniband/ipoib.txt @@ -24,6 +24,49 @@ Partitions and P_Keys The P_Key for any interface is given by the "pkey" file, and the main interface for a subinterface is in "parent." +Datagram vs Connected modes + + The IPoIB driver supports two modes of operation: datagram and + connected. The mode is set and read through an interface's + /sys/class/net//mode file. + + In datagram mode, the IB UD (Unreliable Datagram) transport is used + and so the interface MTU has is equal to the IB L2 MTU minus the + IPoIB encapsulation header (4 bytes). For example, in a typical IB + fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes. + + In connected mode, the IB RC (Reliable Connected) transport is used. + Connected mode is to takes advantage of the connected nature of the + IB transport and allows an MTU up to the maximal IP packet size of + 64K, which reduces the number of IP packets needed for handling + large UDP datagrams, TCP segments, etc and increases the performance + for large messages. + + In connected mode, the interface's UD QP is still used for multicast + and communication with peers that don't support connected mode. In + this case, RX emulation of ICMP PMTU packets is used to cause the + networking stack to use the smaller UD MTU for these neighbours. + +Stateless offloads + + If the IB HW supports IPoIB stateless offloads, IPoIB advertises + TCP/IP checksum and/or Large Send (LSO) offloading capability to the + network stack. + + Large Receive (LRO) offloading is also implemented and may be turned + on/off using ethtool calls. Currently LRO is supported only for + checksum offload capable devices. + + Stateless offloads are supported only in datagram mode. + +Interrupt moderation + + If the underlying IB device supports CQ event moderation, one can + use ethtool to set interrupt mitigation parameters and thus reduce + the overhead incurred by handling interrupts. The main code path of + IPoIB doesn't use events for TX completion signaling so only RX + moderation is supported. + Debugging Information By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set @@ -55,3 +98,5 @@ References http://ietf.org/rfc/rfc4391.txt IP over InfiniBand (IPoIB) Architecture (RFC 4392) http://ietf.org/rfc/rfc4392.txt + IP over InfiniBand: Connected Mode (RFC 4755) + http://ietf.org/rfc/rfc4755.txt -- cgit v1.2.2 From 7a5efb62f6ae366cefac6be475434906c5061e15 Mon Sep 17 00:00:00 2001 From: Don Wood Date: Wed, 8 Apr 2009 14:21:02 -0700 Subject: RDMA/nes: Fix incorrect casts on 32-bit architectures The were some incorrect casts to unsigned long that caused 64-bit values to be truncated on 32-bit architectures and made the driver pass invalid adresses and lengths to the hardware. The problems were primarily seen with kernels with highmem configured but some could show up in non-highmem kernels, too. Signed-off-by: Don Wood Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.h | 4 ++-- drivers/infiniband/hw/nes/nes_cm.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 04b12ad23390..17621de54a9f 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -289,8 +289,8 @@ static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) static inline void set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) { - wqe_words[index] = cpu_to_le32((u32) ((unsigned long)value)); - wqe_words[index + 1] = cpu_to_le32((u32)(upper_32_bits((unsigned long)value))); + wqe_words[index] = cpu_to_le32((u32) value); + wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value)); } static inline void diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 52425154acd4..7c942470b980 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2690,6 +2690,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct ib_mr *ibmr = NULL; struct ib_phys_buf ibphysbuf; struct nes_pd *nespd; + u64 tagged_offset; @@ -2755,10 +2756,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ibphysbuf.addr = nesqp->ietf_frame_pbase; ibphysbuf.size = conn_param->private_data_len + sizeof(struct ietf_mpa_frame); + tagged_offset = (u64)(unsigned long)nesqp->ietf_frame; ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, &ibphysbuf, 1, IB_ACCESS_LOCAL_WRITE, - (u64 *)&nesqp->ietf_frame); + &tagged_offset); if (!ibmr) { nes_debug(NES_DBG_CM, "Unable to register memory region" "for lSMM for cm_node = %p \n", @@ -2782,7 +2784,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) sizeof(struct ietf_mpa_frame)); set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - (u64)nesqp->ietf_frame); + (u64)(unsigned long)nesqp->ietf_frame); wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); -- cgit v1.2.2 From 79fc3d7410c861c8ced5b81a5c3759f6bbf891dc Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 8 Apr 2009 14:22:20 -0700 Subject: RDMA/nes: Fix error handling issues Fix issues found by static code analysis: (1) Check if cm_node was successfully created for loopback connection. (2) schedule_nes_timer() does not free up allocated memory after encountering an error. There is a WARN_ON() for this condition. (3) there is a cm_node->freed flag which is set but not used. Reported-by: Dan Carpenter Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 8 ++++++-- drivers/infiniband/hw/nes/nes_cm.h | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 7c942470b980..a09caf5b387d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -426,6 +426,7 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, if (type == NES_TIMER_TYPE_CLOSE) { new_send->timetosend += (HZ/10); if (cm_node->recv_entry) { + kfree(new_send); WARN_ON(1); return -EINVAL; } @@ -1262,7 +1263,6 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, cm_node->nesqp = NULL; } - cm_node->freed = 1; kfree(cm_node); return 0; } @@ -1999,13 +1999,17 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { - atomic_inc(&cm_loopbacks); loopback_cm_info = *cm_info; loopback_cm_info.loc_port = cm_info->rem_port; loopback_cm_info.rem_port = cm_info->loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info, loopbackremotelistener); + if (!loopbackremotenode) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return NULL; + } + atomic_inc(&cm_loopbacks); loopbackremotenode->loopbackpartner = cm_node; loopbackremotenode->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index d5f778202eb7..80bba1892571 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -298,7 +298,6 @@ struct nes_cm_node { struct nes_vnic *nesvnic; int apbvt_set; int accept_pend; - int freed; struct list_head timer_entry; struct list_head reset_entry; struct nes_qp *nesqp; -- cgit v1.2.2 From 5962c2c8036b4dcf10ec6c481be656ae4700b664 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 8 Apr 2009 14:23:55 -0700 Subject: RDMA/nes: Fix nes_nic_cm_xmit() error handling We are getting crash or hung situation when we are running network cable pull tests during RDMA traffic. In schedule_nes_timer(), we return an error if nes_nic_cm_xmit() returns failure. This is changed to success as skb is being put on the timer routines to be processed later. In send_syn() case, we are indicating connect failure once from nes_connect() and the other when the rexmit retries expires. The other issue is skb->users which we are incrementing before calling nes_nic_cm_xmit() which calls dev_queue_xmit() but in case of failure we are decrementing the skb->users at the same time putting the skb on the rexmit path. Even if dev_queue_xmit() fails, the skb->users is decremented already. We are removing the decrement of skb->users in case of failure from both schedule_nes_timer() as well as from nes_cm_timer_tick(). There is also extra check in nes_cm_timer_tick() for rexmit failure which does a break from the loop is removed. This causes problem as the other nodes have their cm_node->ref_count incremented and are not processed. Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index a09caf5b387d..dbd9a75474e3 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -446,8 +446,8 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, if (ret != NETDEV_TX_OK) { nes_debug(NES_DBG_CM, "Error sending packet %p " "(jiffies = %lu)\n", new_send, jiffies); - atomic_dec(&new_send->skb->users); new_send->timetosend = jiffies; + ret = NETDEV_TX_OK; } else { cm_packets_sent++; if (!send_retrans) { @@ -631,7 +631,6 @@ static void nes_cm_timer_tick(unsigned long pass) nes_debug(NES_DBG_CM, "rexmit failed for " "node=%p\n", cm_node); cm_packets_bounced++; - atomic_dec(&send_entry->skb->users); send_entry->retrycount--; nexttimeout = jiffies + NES_SHORT_TIME; settimer = 1; @@ -667,11 +666,6 @@ static void nes_cm_timer_tick(unsigned long pass) spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); rem_ref_cm_node(cm_node->cm_core, cm_node); - if (ret != NETDEV_TX_OK) { - nes_debug(NES_DBG_CM, "rexmit failed for cm_node=%p\n", - cm_node); - break; - } } if (settimer) { -- cgit v1.2.2 From 1b9493248cf5e9f1ecc045488100cbf3ccd91be1 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 8 Apr 2009 14:27:09 -0700 Subject: RDMA/nes: Fix SFP+ PHY initialization SFP+ PHY initialization has very long delays, incorrect settings for direct attach copper cables, and inconsistent link detection. Adjust delays to the minimum required by the PHY. Worst case is now less than 4 seconds. Add new register settings for direct attach cables. Change link detection logic to use two new registers for more consistent link state detection. Reorganize code to shorten line length. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 289 ++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 167 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 52e734042b8e..466c730e6297 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -757,6 +757,10 @@ static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, ((port_count > 2) && (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G))) { /* init serdes 1 */ + if (nesadapter->phy_type[0] == NES_PHY_TYPE_ARGUS) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + } nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { serdes_common_control = nes_read_indexed(nesdev, @@ -1259,203 +1263,155 @@ int nes_init_phy(struct nes_device *nesdev) { struct nes_adapter *nesadapter = nesdev->nesadapter; u32 counter = 0; - u32 sds_common_control0; + u32 sds; u32 mac_index = nesdev->mac_index; u32 tx_config = 0; u16 phy_data; u32 temp_phy_data = 0; u32 temp_phy_data2 = 0; - u32 i = 0; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; if ((nesadapter->OneG_Mode) && - (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + (phy_type != NES_PHY_TYPE_PUMA_1G)) { nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index); - if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_1G) { - printk(PFX "%s: Programming mdc config for 1G\n", __func__); + if (phy_type == NES_PHY_TYPE_1G) { tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); tx_config &= 0xFFFFFFE3; tx_config |= 0x04; nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); } - nes_read_1G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 1 phy address %u = 0x%X.\n", - nesadapter->phy_index[mac_index], phy_data); - nes_write_1G_phy_reg(nesdev, 23, nesadapter->phy_index[mac_index], 0xb000); + nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000); /* Reset the PHY */ - nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], 0x8000); + nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000); udelay(100); counter = 0; do { - nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data); - if (counter++ > 100) break; + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (counter++ > 100) + break; } while (phy_data & 0x8000); /* Setting no phy loopback */ phy_data &= 0xbfff; phy_data |= 0x1140; - nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], phy_data); - nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data); - - nes_read_1G_phy_reg(nesdev, 0x17, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x17 = 0x%X.\n", phy_data); - - nes_read_1G_phy_reg(nesdev, 0x1e, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x1e = 0x%X.\n", phy_data); + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data); /* Setting the interrupt mask */ - nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data); - nes_write_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], 0xffee); - - nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data); + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee); + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); /* turning on flow control */ - nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data); - nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], - (phy_data & ~(0x03E0)) | 0xc00); - /* nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], - phy_data | 0xc00); */ - nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data); - - nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data); - /* Clear Half duplex */ - nes_write_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], - phy_data & ~(0x0100)); - nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data); + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00); + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); - nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); - nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], phy_data | 0x0300); - } else { - if ((nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) || - (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_ARGUS)) { - /* setup 10G MDIO operation */ - tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); - tx_config &= 0xFFFFFFE3; - tx_config |= 0x15; - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); - } - if ((nesadapter->phy_type[mac_index] == NES_PHY_TYPE_ARGUS)) { - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); + /* Clear Half duplex */ + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100)); + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - mdelay(10); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); - temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300); - /* - * if firmware is already running (like from a - * driver un-load/load, don't do anything. - */ - if (temp_phy_data == temp_phy_data2) { - /* configure QT2505 AMCC PHY */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0x0000, 0x8000); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc300, 0x0000); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc302, 0x0044); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc318, 0x0052); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc319, 0x0008); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc31a, 0x0098); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0026, 0x0E00); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0027, 0x0001); - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0028, 0xA528); + return 0; + } - /* - * remove micro from reset; chip boots from ROM, - * uploads EEPROM f/w image, uC executes f/w - */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc300, 0x0002); + if ((phy_type == NES_PHY_TYPE_IRIS) || + (phy_type == NES_PHY_TYPE_ARGUS)) { + /* setup 10G MDIO operation */ + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_config &= 0xFFFFFFE3; + tx_config |= 0x15; + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + } + if ((phy_type == NES_PHY_TYPE_ARGUS)) { + /* Check firmware heartbeat */ + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + udelay(1500); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - /* - * wait for heart beat to start to - * know loading is done - */ - counter = 0; - do { - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if (counter++ > 1000) { - nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from heartbeat check \n"); - break; - } - mdelay(100); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); - temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - } while ((temp_phy_data2 == temp_phy_data)); + if (temp_phy_data != temp_phy_data2) + return 0; - /* - * wait for tracking to start to know - * f/w is good to go - */ - counter = 0; - do { - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7fd); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if (counter++ > 1000) { - nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from status check \n"); - break; - } - mdelay(1000); - /* - * nes_debug(NES_DBG_PHY, "AMCC PHY- phy_status not ready yet = 0x%02X\n", - * temp_phy_data); - */ - } while (((temp_phy_data & 0xff) != 0x50) && ((temp_phy_data & 0xff) != 0x70)); - - /* set LOS Control invert RXLOSB_I_PADINV */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd003, 0x0000); - /* set LOS Control to mask of RXLOSB_I */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc314, 0x0042); - /* set LED1 to input mode (LED1 and LED2 share same LED) */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd006, 0x0007); - /* set LED2 to RX link_status and activity */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd007, 0x000A); - /* set LED3 to RX link_status */ - nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd008, 0x0009); + /* no heartbeat, configure the PHY */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); - /* - * reset the res-calibration on t2 - * serdes; ensures it is stable after - * the amcc phy is stable - */ + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); - sds_common_control0 = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); - sds_common_control0 |= 0x1; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds_common_control0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528); - /* release the res-calibration reset */ - sds_common_control0 &= 0xfffffffe; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds_common_control0); + /* Bring PHY out of reset */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002); - i = 0; - while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) - && (i++ < 5000)) { - /* mdelay(1); */ - } + /* Check for heartbeat */ + counter = 0; + mdelay(690); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + do { + if (counter++ > 150) { + nes_debug(NES_DBG_PHY, "No PHY heartbeat\n"); + break; + } + mdelay(1); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } while ((temp_phy_data2 == temp_phy_data)); - /* - * wait for link train done before moving on, - * or will get an interupt storm - */ - counter = 0; - do { - temp_phy_data = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200 * (nesdev->mac_index & 1))); - if (counter++ > 1000) { - nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from link train wait \n"); - break; - } - mdelay(1); - } while (((temp_phy_data & 0x0f1f0000) != 0x0f0f0000)); + /* wait for tracking */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 300) { + nes_debug(NES_DBG_PHY, "PHY did not track\n"); + break; } - } + mdelay(10); + } while (((temp_phy_data & 0xff) != 0x50) && ((temp_phy_data & 0xff) != 0x70)); + + /* setup signal integrity */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063); + + /* reset serdes */ + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + + mac_index * 0x200); + sds |= 0x1; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + + mac_index * 0x200, sds); + sds &= 0xfffffffe; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + + mac_index * 0x200, sds); + + counter = 0; + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) + && (counter++ < 5000)) + ; } return 0; } @@ -2483,19 +2439,18 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); /* check link status */ - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - u32temp = 100; - do { - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); - phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if ((phy_data == temp_phy_data) || (!(--u32temp))) - break; - temp_phy_data = phy_data; - } while (1); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", - __func__, phy_data, nesadapter->mac_link_down ? "DOWN" : "UP"); + __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); break; case NES_PHY_TYPE_PUMA_1G: -- cgit v1.2.2 From a4849fc157cdbe4fb68cfe37e7222697f003deb5 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 8 Apr 2009 14:27:18 -0700 Subject: RDMA/nes: Add wide_ppm_offset parm for switch compatibility We have observed unstable link with a new BNT switch. Add wide_ppm_offset parameter to allow the user to control the clock ppm offset on the CX4 interface for better compatibility. Default is 100ppm, setting it to 1 will increase it to 300ppm. Change default SerDes1 reference clock to external source. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 97 ++++++++++++++++++++++++++------------ drivers/infiniband/hw/nes/nes_hw.h | 1 + 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 466c730e6297..f797064315c2 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -46,6 +46,10 @@ static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR; module_param(nes_lro_max_aggr, uint, 0444); MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation"); +static int wide_ppm_offset; +module_param(wide_ppm_offset, int, 0644); +MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); + static u32 crit_err_count; u32 int_mod_timer_init; u32 int_mod_cq_depth_256; @@ -546,8 +550,11 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { msleep(1); } if (int_cnt > 1) { + u32 sds; spin_lock_irqsave(&nesadapter->phy_lock, flags); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds |= 0x00000040; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); mh_detected++; reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); reset_value |= 0x0000003d; @@ -736,43 +743,48 @@ static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, { int i; u32 u32temp; - u32 serdes_common_control; + u32 sds; if (hw_rev != NE020_REV) { /* init serdes 0 */ + if (wide_ppm_offset && (nesadapter->phy_type[0] == NES_PHY_TYPE_CX4)) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { - serdes_common_control = nes_read_indexed(nesdev, - NES_IDX_ETH_SERDES_COMMON_CONTROL0); - serdes_common_control |= 0x000000100; - nes_write_indexed(nesdev, - NES_IDX_ETH_SERDES_COMMON_CONTROL0, - serdes_common_control); - } else if (!OneG_Mode) { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); + sds |= 0x00000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds); } - if (((port_count > 1) && - (nesadapter->phy_type[0] != NES_PHY_TYPE_PUMA_1G)) || - ((port_count > 2) && - (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G))) { - /* init serdes 1 */ - if (nesadapter->phy_type[0] == NES_PHY_TYPE_ARGUS) { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); - } - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); - if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { - serdes_common_control = nes_read_indexed(nesdev, - NES_IDX_ETH_SERDES_COMMON_CONTROL1); - serdes_common_control |= 0x000000100; - nes_write_indexed(nesdev, - NES_IDX_ETH_SERDES_COMMON_CONTROL1, - serdes_common_control); - } else if (!OneG_Mode) { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); - } + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + + if (port_count < 2) + return 0; + + /* init serdes 1 */ + switch (nesadapter->phy_type[1]) { + case NES_PHY_TYPE_ARGUS: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_CX4: + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds &= 0xFFFFFFBF; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); + break; + case NES_PHY_TYPE_PUMA_1G: + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds |= 0x000000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); } + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); } else { /* init serdes 0 */ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); @@ -2315,6 +2327,7 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) u16 temp_phy_data; u32 pcs_val = 0x0f0f0000; u32 pcs_mask = 0x0f1f0000; + u32 cdr_ctrl; spin_lock_irqsave(&nesadapter->phy_lock, flags); if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { @@ -2466,6 +2479,17 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) } if (phy_data & 0x0004) { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl | 0x000F0000); + } nesadapter->mac_link_down[mac_index] = 0; list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", @@ -2480,6 +2504,17 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) } } } else { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl & 0xFFF0FFFF); + } nesadapter->mac_link_down[mac_index] = 1; list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n", diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index f41a8710d2a8..13bc26a83159 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -35,6 +35,7 @@ #include +#define NES_PHY_TYPE_CX4 1 #define NES_PHY_TYPE_1G 2 #define NES_PHY_TYPE_IRIS 3 #define NES_PHY_TYPE_ARGUS 4 -- cgit v1.2.2 From 4303565df4eb425851ddd22136fec69bdfeede61 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Wed, 8 Apr 2009 14:27:56 -0700 Subject: RDMA/nes: Add support for new SFP+ PHY Add new register settings for new SFP+ PHY/firmware. Add new PHY to to nes_netdev_get/set_settings. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 19 ++++++++++---- drivers/infiniband/hw/nes/nes_hw.h | 1 + drivers/infiniband/hw/nes/nes_nic.c | 52 +++++++++++++++++++++---------------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index f797064315c2..d6fc9ae44062 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -765,7 +765,8 @@ static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, /* init serdes 1 */ switch (nesadapter->phy_type[1]) { - case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); break; @@ -1337,14 +1338,16 @@ int nes_init_phy(struct nes_device *nesdev) } if ((phy_type == NES_PHY_TYPE_IRIS) || - (phy_type == NES_PHY_TYPE_ARGUS)) { + (phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D)) { /* setup 10G MDIO operation */ tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); tx_config &= 0xFFFFFFE3; tx_config |= 0x15; nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); } - if ((phy_type == NES_PHY_TYPE_ARGUS)) { + if ((phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D)) { /* Check firmware heartbeat */ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); @@ -1358,10 +1361,15 @@ int nes_init_phy(struct nes_device *nesdev) /* no heartbeat, configure the PHY */ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + if (phy_type == NES_PHY_TYPE_ARGUS) { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + } else { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038); + } nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); @@ -2442,6 +2450,7 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) break; case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: /* clear the alarms */ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001); diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index 13bc26a83159..c3654c6383fe 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -42,6 +42,7 @@ #define NES_PHY_TYPE_PUMA_1G 5 #define NES_PHY_TYPE_PUMA_10G 6 #define NES_PHY_TYPE_GLADIUS 7 +#define NES_PHY_TYPE_SFP_D 8 #define NES_MULTICAST_PF_MAX 8 diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index ecb1f6fd6276..c6e6611d3016 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1426,49 +1426,55 @@ static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd struct nes_vnic *nesvnic = netdev_priv(netdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; u16 phy_data; et_cmd->duplex = DUPLEX_FULL; et_cmd->port = PORT_MII; + et_cmd->maxtxpkt = 511; + et_cmd->maxrxpkt = 511; if (nesadapter->OneG_Mode) { et_cmd->speed = SPEED_1000; - if (nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) { + if (phy_type == NES_PHY_TYPE_PUMA_1G) { et_cmd->supported = SUPPORTED_1000baseT_Full; et_cmd->advertising = ADVERTISED_1000baseT_Full; et_cmd->autoneg = AUTONEG_DISABLE; et_cmd->transceiver = XCVR_INTERNAL; - et_cmd->phy_address = nesdev->mac_index; + et_cmd->phy_address = mac_index; } else { - et_cmd->supported = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg; - et_cmd->advertising = ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg; - nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], &phy_data); + et_cmd->supported = SUPPORTED_1000baseT_Full + | SUPPORTED_Autoneg; + et_cmd->advertising = ADVERTISED_1000baseT_Full + | ADVERTISED_Autoneg; + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); if (phy_data & 0x1000) et_cmd->autoneg = AUTONEG_ENABLE; else et_cmd->autoneg = AUTONEG_DISABLE; et_cmd->transceiver = XCVR_EXTERNAL; - et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; + et_cmd->phy_address = phy_index; } + return 0; + } + if ((phy_type == NES_PHY_TYPE_IRIS) || + (phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D)) { + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->port = PORT_FIBRE; + et_cmd->supported = SUPPORTED_FIBRE; + et_cmd->advertising = ADVERTISED_FIBRE; + et_cmd->phy_address = phy_index; } else { - if ((nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_IRIS) || - (nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_ARGUS)) { - et_cmd->transceiver = XCVR_EXTERNAL; - et_cmd->port = PORT_FIBRE; - et_cmd->supported = SUPPORTED_FIBRE; - et_cmd->advertising = ADVERTISED_FIBRE; - et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; - } else { - et_cmd->transceiver = XCVR_INTERNAL; - et_cmd->supported = SUPPORTED_10000baseT_Full; - et_cmd->advertising = ADVERTISED_10000baseT_Full; - et_cmd->phy_address = nesdev->mac_index; - } - et_cmd->speed = SPEED_10000; - et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->supported = SUPPORTED_10000baseT_Full; + et_cmd->advertising = ADVERTISED_10000baseT_Full; + et_cmd->phy_address = mac_index; } - et_cmd->maxtxpkt = 511; - et_cmd->maxrxpkt = 511; + et_cmd->speed = SPEED_10000; + et_cmd->autoneg = AUTONEG_DISABLE; return 0; } -- cgit v1.2.2