author     Yishai Hadas <yishaih@mellanox.com>      2015-02-08 04:49:34 -0500
committer  David S. Miller <davem@davemloft.net>    2015-02-09 17:03:53 -0500
commit     35f05dabf95ac3ebc4c15bafd6833f7a3046e66f (patch)
tree       a39bb7c432f4e36467e61b316050e41ecd408b1f
parent     824c25c1abe70a527646056f6911d181facde9cc (diff)
IB/mlx4: Reset flow support for IB kernel ULPs
The driver exposes interfaces that directly relate to HW state. Upon a fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work requests could hang, thereby introducing dependencies in the shutdown order.

To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device. Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset.

This includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled, we return simulated CQEs with the IB_WC_WR_FLUSH_ERR return code, enabling ULPs to clean up their resources rather than wait forever for completions upon receiving remove_one.

The above change requires an extra check in the data path to make sure that when the device is in an error state, the simulated CQEs are returned and no further WQEs are posted.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
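To make the consumer side concrete, here is a minimal sketch of a ULP completion handler observing the simulated completions; ulp_free_request() and ulp_process_completion() are hypothetical helpers, not part of this patch:

#include <rdma/ib_verbs.h>

/* Hypothetical ULP completion handler: once the reset flow has invoked the
 * CQ's comp handler, polling returns only simulated IB_WC_WR_FLUSH_ERR CQEs,
 * so per-WR resources can be reclaimed instead of blocking remove_one. */
static void ulp_comp_handler(struct ib_cq *cq, void *cq_context)
{
	struct ib_wc wc;

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		if (wc.status == IB_WC_WR_FLUSH_ERR) {
			/* WQE was flushed (possibly by the reset flow);
			 * release whatever was tied to wc.wr_id. */
			ulp_free_request(wc.wr_id);
			continue;
		}
		ulp_process_completion(&wc);
	}
}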
-rw-r--r--  drivers/infiniband/hw/mlx4/cq.c       | 57
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c     | 64
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h  |  9
-rw-r--r--  drivers/infiniband/hw/mlx4/qp.c       | 59
-rw-r--r--  drivers/infiniband/hw/mlx4/srq.c      |  8
-rw-r--r--  include/linux/mlx4/device.h           |  2
6 files changed, 193 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index a3b70f6c4035..543ecdd8667b 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -188,6 +188,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
 	spin_lock_init(&cq->lock);
 	cq->resize_buf = NULL;
 	cq->resize_umem = NULL;
+	INIT_LIST_HEAD(&cq->send_qp_list);
+	INIT_LIST_HEAD(&cq->recv_qp_list);
 
 	if (context) {
 		struct mlx4_ib_create_cq ucmd;
@@ -594,6 +596,55 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct
 	return 0;
 }
 
+static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries,
+			       struct ib_wc *wc, int *npolled, int is_send)
+{
+	struct mlx4_ib_wq *wq;
+	unsigned cur;
+	int i;
+
+	wq = is_send ? &qp->sq : &qp->rq;
+	cur = wq->head - wq->tail;
+
+	if (cur == 0)
+		return;
+
+	for (i = 0; i < cur && *npolled < num_entries; i++) {
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR;
+		wq->tail++;
+		(*npolled)++;
+		wc->qp = &qp->ibqp;
+		wc++;
+	}
+}
+
+static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
+				 struct ib_wc *wc, int *npolled)
+{
+	struct mlx4_ib_qp *qp;
+
+	*npolled = 0;
+	/* Find uncompleted WQEs belonging to that cq and return
+	 * simulated FLUSH_ERR completions
+	 */
+	list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
+		mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
+		if (*npolled >= num_entries)
+			goto out;
+	}
+
+	list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) {
+		mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0);
+		if (*npolled >= num_entries)
+			goto out;
+	}
+
+out:
+	return;
+}
+
 static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
 			    struct mlx4_ib_qp **cur_qp,
 			    struct ib_wc *wc)
@@ -836,8 +887,13 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 	unsigned long flags;
 	int npolled;
 	int err = 0;
+	struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device);
 
 	spin_lock_irqsave(&cq->lock, flags);
+	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+		goto out;
+	}
 
 	for (npolled = 0; npolled < num_entries; ++npolled) {
 		err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
@@ -847,6 +903,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 
 	mlx4_cq_set_ci(&cq->mcq);
 
+out:
 	spin_unlock_irqrestore(&cq->lock, flags);
 
 	if (err == 0 || err == -EAGAIN)
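The wrid lookup in mlx4_ib_qp_sw_comp above relies on two properties of the mlx4 work queues: head and tail are free-running unsigned counters, so their difference counts outstanding WQEs even across wraparound, and wqe_cnt is a power of two, so masking with wqe_cnt - 1 maps a counter onto a ring slot. A standalone illustration of that index math (plain C, not driver code):

#include <stdio.h>

int main(void)
{
	unsigned int wqe_cnt = 8;          /* ring size, must be a power of two */
	unsigned int tail = 0xfffffffeu;   /* free-running, about to wrap */
	unsigned int head = tail + 3;      /* 3 outstanding WQEs; head has wrapped */

	unsigned int outstanding = head - tail;    /* 3, despite the wrap */
	unsigned int slot = tail & (wqe_cnt - 1);  /* ring index of oldest WQE: 6 */

	printf("outstanding=%u first_slot=%u\n", outstanding, slot);
	return 0;
}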
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 3140da518a07..eb8e215f1613 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2308,6 +2308,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 
 	spin_lock_init(&ibdev->sm_lock);
 	mutex_init(&ibdev->cap_mask_mutex);
+	INIT_LIST_HEAD(&ibdev->qp_list);
+	spin_lock_init(&ibdev->reset_flow_resource_lock);
 
 	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED &&
 	    ib_num_ports) {
@@ -2622,6 +2624,67 @@ out:
 	return;
 }
 
+static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev)
+{
+	struct mlx4_ib_qp *mqp;
+	unsigned long flags_qp;
+	unsigned long flags_cq;
+	struct mlx4_ib_cq *send_mcq, *recv_mcq;
+	struct list_head cq_notify_list;
+	struct mlx4_cq *mcq;
+	unsigned long flags;
+
+	pr_warn("mlx4_ib_handle_catas_error started\n");
+	INIT_LIST_HEAD(&cq_notify_list);
+
+	/* Go over the qp list residing on that ibdev, sync with create/destroy qp. */
+	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+
+	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+		if (mqp->sq.tail != mqp->sq.head) {
+			send_mcq = to_mcq(mqp->ibqp.send_cq);
+			spin_lock_irqsave(&send_mcq->lock, flags_cq);
+			if (send_mcq->mcq.comp &&
+			    mqp->ibqp.send_cq->comp_handler) {
+				if (!send_mcq->mcq.reset_notify_added) {
+					send_mcq->mcq.reset_notify_added = 1;
+					list_add_tail(&send_mcq->mcq.reset_notify,
+						      &cq_notify_list);
+				}
+			}
+			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+		}
+		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+		/* Now, handle the QP's receive queue */
+		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+		/* no handling is needed for SRQ */
+		if (!mqp->ibqp.srq) {
+			if (mqp->rq.tail != mqp->rq.head) {
+				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+				if (recv_mcq->mcq.comp &&
+				    mqp->ibqp.recv_cq->comp_handler) {
+					if (!recv_mcq->mcq.reset_notify_added) {
+						recv_mcq->mcq.reset_notify_added = 1;
+						list_add_tail(&recv_mcq->mcq.reset_notify,
+							      &cq_notify_list);
+					}
+				}
+				spin_unlock_irqrestore(&recv_mcq->lock,
+						       flags_cq);
+			}
+		}
+		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+	}
+
+	list_for_each_entry(mcq, &cq_notify_list, reset_notify) {
+		mcq->comp(mcq);
+	}
+	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+	pr_warn("mlx4_ib_handle_catas_error ended\n");
+}
+
 static void handle_bonded_port_state_event(struct work_struct *work)
 {
 	struct ib_event_work *ew =
@@ -2701,6 +2764,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
 	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
 		ibdev->ib_active = false;
 		ibev.event = IB_EVENT_DEVICE_FATAL;
+		mlx4_ib_handle_catas_error(ibdev);
 		break;
 
 	case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
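mlx4_ib_handle_catas_error uses a collect-then-notify pattern: candidate CQs are gathered onto a local list under the per-queue spinlocks, deduplicated via reset_notify_added (a CQ serving several QPs must be notified only once), and their comp handlers run only after the fine-grained locks are dropped, while reset_flow_resource_lock still excludes concurrent QP create/destroy. A distilled sketch of the queue-once idiom, with generic names in place of the driver's types:

#include <linux/list.h>

struct notifiee {
	int queued;                        /* mirrors reset_notify_added */
	struct list_head node;             /* mirrors reset_notify */
	void (*notify)(struct notifiee *); /* mirrors mcq->comp */
};

/* Caller holds the object's own lock, as the patch does per CQ. */
static void queue_once(struct notifiee *n, struct list_head *pending)
{
	if (!n->queued) {
		n->queued = 1;
		list_add_tail(&n->node, pending);
	}
}

/* Run after the fine-grained locks are released. */
static void notify_all(struct list_head *pending)
{
	struct notifiee *n;

	list_for_each_entry(n, pending, node)
		n->notify(n);
}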
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 721540c9163d..f829fd935b79 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -110,6 +110,9 @@ struct mlx4_ib_cq {
 	struct mutex resize_mutex;
 	struct ib_umem *umem;
 	struct ib_umem *resize_umem;
+	/* List of qps that it serves. */
+	struct list_head send_qp_list;
+	struct list_head recv_qp_list;
 };
 
 struct mlx4_ib_mr {
@@ -300,6 +303,9 @@ struct mlx4_ib_qp {
 	struct mlx4_roce_smac_vlan_info pri;
 	struct mlx4_roce_smac_vlan_info alt;
 	u64 reg_id;
+	struct list_head qps_list;
+	struct list_head cq_recv_list;
+	struct list_head cq_send_list;
 };
 
 struct mlx4_ib_srq {
@@ -535,6 +541,9 @@ struct mlx4_ib_dev {
 	/* lock when destroying qp1_proxy and getting netdev events */
 	struct mutex qp1_proxy_lock[MLX4_MAX_PORTS];
 	u8 bond_next_port;
+	/* protect resources needed as part of reset flow */
+	spinlock_t reset_flow_resource_lock;
+	struct list_head qp_list;
 };
 
 struct ib_event_work {
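Taken together, the new fields form three intrusive lists: each mlx4_ib_qp links onto its device's qp_list via qps_list, onto its send CQ's send_qp_list via cq_send_list, and onto its receive CQ's recv_qp_list via cq_recv_list. A sketch of walking from a CQ back to the QPs that post sends to it (illustrative only; assumes cq->lock is held, as in the real poll path):

static void walk_send_qps(struct mlx4_ib_cq *cq)
{
	struct mlx4_ib_qp *qp;

	list_for_each_entry(qp, &cq->send_qp_list, cq_send_list)
		pr_info("qpn 0x%x posts sends to this cq\n", qp->mqp.qpn);
}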
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 792f9dc86ada..dfc6ca128a7e 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -46,6 +46,11 @@
 #include "mlx4_ib.h"
 #include "user.h"
 
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
+			     struct mlx4_ib_cq *recv_cq);
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
+			       struct mlx4_ib_cq *recv_cq);
+
 enum {
 	MLX4_IB_ACK_REQ_FREQ	= 8,
 };
@@ -618,6 +623,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	struct mlx4_ib_sqp *sqp;
 	struct mlx4_ib_qp *qp;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+	struct mlx4_ib_cq *mcq;
+	unsigned long flags;
 
 	/* When tunneling special qps, we use a plain UD qp */
 	if (sqpn) {
@@ -828,6 +835,24 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	qp->mqp.event = mlx4_ib_qp_event;
 	if (!*caller_qp)
 		*caller_qp = qp;
+
+	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+	mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+			 to_mcq(init_attr->recv_cq));
+	/* Maintain device-to-QP access, needed for further handling
+	 * via the reset flow
+	 */
+	list_add_tail(&qp->qps_list, &dev->qp_list);
+	/* Maintain CQ-to-QP access, needed for further handling
+	 * via the reset flow
+	 */
+	mcq = to_mcq(init_attr->send_cq);
+	list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+	mcq = to_mcq(init_attr->recv_cq);
+	list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+	mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+			   to_mcq(init_attr->recv_cq));
+	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 	return 0;
 
 err_qpn:
@@ -886,13 +911,13 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv
 	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
 {
 	if (send_cq == recv_cq) {
-		spin_lock_irq(&send_cq->lock);
+		spin_lock(&send_cq->lock);
 		__acquire(&recv_cq->lock);
 	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
-		spin_lock_irq(&send_cq->lock);
+		spin_lock(&send_cq->lock);
 		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
 	} else {
-		spin_lock_irq(&recv_cq->lock);
+		spin_lock(&recv_cq->lock);
 		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
 	}
 }
@@ -902,13 +927,13 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re
 {
 	if (send_cq == recv_cq) {
 		__release(&recv_cq->lock);
-		spin_unlock_irq(&send_cq->lock);
+		spin_unlock(&send_cq->lock);
 	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
 		spin_unlock(&recv_cq->lock);
-		spin_unlock_irq(&send_cq->lock);
+		spin_unlock(&send_cq->lock);
 	} else {
 		spin_unlock(&send_cq->lock);
-		spin_unlock_irq(&recv_cq->lock);
+		spin_unlock(&recv_cq->lock);
 	}
 }
 
@@ -953,6 +978,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 			      int is_user)
 {
 	struct mlx4_ib_cq *send_cq, *recv_cq;
+	unsigned long flags;
 
 	if (qp->state != IB_QPS_RESET) {
 		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
@@ -984,8 +1010,13 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 
 	get_cqs(qp, &send_cq, &recv_cq);
 
+	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	mlx4_ib_lock_cqs(send_cq, recv_cq);
 
+	/* del from lists under both locks above to protect reset flow paths */
+	list_del(&qp->qps_list);
+	list_del(&qp->cq_send_list);
+	list_del(&qp->cq_recv_list);
 	if (!is_user) {
 		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
@@ -996,6 +1027,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	mlx4_qp_remove(dev->dev, &qp->mqp);
 
 	mlx4_ib_unlock_cqs(send_cq, recv_cq);
+	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 
 	mlx4_qp_free(dev->dev, &qp->mqp);
 
@@ -2618,8 +2650,15 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	__be32 uninitialized_var(lso_hdr_sz);
 	__be32 blh;
 	int i;
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
+	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		err = -EIO;
+		*bad_wr = wr;
+		nreq = 0;
+		goto out;
+	}
 
 	ind = qp->sq_next_wqe;
 
@@ -2917,10 +2956,18 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 	int ind;
 	int max_gs;
 	int i;
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
 	max_gs = qp->rq.max_gs;
 	spin_lock_irqsave(&qp->rq.lock, flags);
 
+	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		err = -EIO;
+		*bad_wr = wr;
+		nreq = 0;
+		goto out;
+	}
+
 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
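Note why mlx4_ib_lock_cqs/mlx4_ib_unlock_cqs switched from the _irq spinlock variants to plain ones: both call sites now nest inside spin_lock_irqsave(&dev->reset_flow_resource_lock, flags), so interrupts are already disabled, and an inner spin_unlock_irq would re-enable them while the outer lock was still held. A minimal compilable illustration of that nesting rule (hypothetical locks, not the driver's):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);
static DEFINE_SPINLOCK(inner_lock);

static void nested_section(void)
{
	unsigned long flags;

	spin_lock_irqsave(&outer_lock, flags);   /* disables IRQs, saves state */
	spin_lock(&inner_lock);                  /* plain: IRQs are already off */
	/* ... manipulate the shared lists ... */
	spin_unlock(&inner_lock);                /* must not re-enable IRQs here */
	spin_unlock_irqrestore(&outer_lock, flags);
}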
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 62d9285300af..dce5dfe3a70e 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -316,8 +316,15 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 	int err = 0;
 	int nreq;
 	int i;
+	struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device);
 
 	spin_lock_irqsave(&srq->lock, flags);
+	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		err = -EIO;
+		*bad_wr = wr;
+		nreq = 0;
+		goto out;
+	}
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
@@ -362,6 +369,7 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 
 		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
 	}
+out:
 
 	spin_unlock_irqrestore(&srq->lock, flags);
 
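All three post paths (QP send, QP receive, SRQ receive) now share the same fail-fast contract on internal error: return -EIO, point *bad_wr at the first work request, and queue nothing. A hypothetical caller against the standard verbs API would see it as:

#include <rdma/ib_verbs.h>

static int ulp_post(struct ib_srq *srq, struct ib_recv_wr *wr)
{
	struct ib_recv_wr *bad_wr;
	int ret = ib_post_srq_recv(srq, wr, &bad_wr);

	if (ret == -EIO) {
		/* Device hit an internal error: bad_wr == wr and nothing
		 * was queued; stop posting and handle flush completions. */
		WARN_ON(bad_wr != wr);
	}
	return ret;
}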
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index c116cb02475c..e4ebff7e9d02 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -689,6 +689,8 @@ struct mlx4_cq {
 		void (*comp)(struct mlx4_cq *);
 		void *priv;
 	} tasklet_ctx;
+	int			reset_notify_added;
+	struct list_head	reset_notify;
 };
 
 struct mlx4_qp {