aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJames Smart <jsmart2021@gmail.com>2018-11-14 19:35:10 -0500
committerChristoph Hellwig <hch@lst.de>2018-11-15 05:37:55 -0500
commit4cff280a5fccf6513ed9e895bb3a4e7ad8b0cedc (patch)
treeb250b7a408e8ffb1a51346fca54301d76aaebf22
parent8dc765d438f1e42b3e8227b3b09fad7d73f4ec9a (diff)
nvme-fc: resolve io failures during connect
If an io error occurs on an io issued while connecting, recovery of the io falls flat as the state checking ends up nooping the error handler. Create an err_work work item that is scheduled upon an io error while connecting. The work thread terminates all io on all queues and marks the queues as not connected. The termination of the io will return back to the callee, which will then back out of the connection attempt and will reschedule, if possible, the connection attempt. The changes: - in case there are several commands hitting the error handler, a state flag is kept so that the error work is only scheduled once, on the first error. The subsequent errors can be ignored. - The calling sequence to stop keep alive and terminate the queues and their io is lifted from the reset routine. Made a small service routine used by both reset and err_work. - During debugging, found that the teardown path can reference an uninitialized pointer, resulting in a NULL pointer oops. The aen_ops weren't initialized yet. Add validation on their initialization before calling the teardown routine. Signed-off-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--drivers/nvme/host/fc.c73
1 files changed, 63 insertions, 10 deletions
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0b70c8bab045..54032c466636 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -152,6 +152,7 @@ struct nvme_fc_ctrl {
152 152
153 bool ioq_live; 153 bool ioq_live;
154 bool assoc_active; 154 bool assoc_active;
155 atomic_t err_work_active;
155 u64 association_id; 156 u64 association_id;
156 157
157 struct list_head ctrl_list; /* rport->ctrl_list */ 158 struct list_head ctrl_list; /* rport->ctrl_list */
@@ -160,6 +161,7 @@ struct nvme_fc_ctrl {
160 struct blk_mq_tag_set tag_set; 161 struct blk_mq_tag_set tag_set;
161 162
162 struct delayed_work connect_work; 163 struct delayed_work connect_work;
164 struct work_struct err_work;
163 165
164 struct kref ref; 166 struct kref ref;
165 u32 flags; 167 u32 flags;
@@ -1531,6 +1533,10 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
1531 struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; 1533 struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
1532 int i; 1534 int i;
1533 1535
1536 /* ensure we've initialized the ops once */
1537 if (!(aen_op->flags & FCOP_FLAGS_AEN))
1538 return;
1539
1534 for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) 1540 for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++)
1535 __nvme_fc_abort_op(ctrl, aen_op); 1541 __nvme_fc_abort_op(ctrl, aen_op);
1536} 1542}
@@ -2049,7 +2055,25 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
2049static void 2055static void
2050nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) 2056nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
2051{ 2057{
2052 /* only proceed if in LIVE state - e.g. on first error */ 2058 int active;
2059
2060 /*
2061 * if an error (io timeout, etc) while (re)connecting,
2062 * it's an error on creating the new association.
2063 * Start the error recovery thread if it hasn't already
2064 * been started. It is expected there could be multiple
2065 * ios hitting this path before things are cleaned up.
2066 */
2067 if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
2068 active = atomic_xchg(&ctrl->err_work_active, 1);
2069 if (!active && !schedule_work(&ctrl->err_work)) {
2070 atomic_set(&ctrl->err_work_active, 0);
2071 WARN_ON(1);
2072 }
2073 return;
2074 }
2075
2076 /* Otherwise, only proceed if in LIVE state - e.g. on first error */
2053 if (ctrl->ctrl.state != NVME_CTRL_LIVE) 2077 if (ctrl->ctrl.state != NVME_CTRL_LIVE)
2054 return; 2078 return;
2055 2079
@@ -2814,6 +2838,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
2814{ 2838{
2815 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); 2839 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
2816 2840
2841 cancel_work_sync(&ctrl->err_work);
2817 cancel_delayed_work_sync(&ctrl->connect_work); 2842 cancel_delayed_work_sync(&ctrl->connect_work);
2818 /* 2843 /*
2819 * kill the association on the link side. this will block 2844 * kill the association on the link side. this will block
@@ -2866,23 +2891,30 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
2866} 2891}
2867 2892
2868static void 2893static void
2869nvme_fc_reset_ctrl_work(struct work_struct *work) 2894__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
2870{ 2895{
2871 struct nvme_fc_ctrl *ctrl = 2896 nvme_stop_keep_alive(&ctrl->ctrl);
2872 container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
2873 int ret;
2874
2875 nvme_stop_ctrl(&ctrl->ctrl);
2876 2897
2877 /* will block will waiting for io to terminate */ 2898 /* will block will waiting for io to terminate */
2878 nvme_fc_delete_association(ctrl); 2899 nvme_fc_delete_association(ctrl);
2879 2900
2880 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 2901 if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
2902 !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
2881 dev_err(ctrl->ctrl.device, 2903 dev_err(ctrl->ctrl.device,
2882 "NVME-FC{%d}: error_recovery: Couldn't change state " 2904 "NVME-FC{%d}: error_recovery: Couldn't change state "
2883 "to CONNECTING\n", ctrl->cnum); 2905 "to CONNECTING\n", ctrl->cnum);
2884 return; 2906}
2885 } 2907
2908static void
2909nvme_fc_reset_ctrl_work(struct work_struct *work)
2910{
2911 struct nvme_fc_ctrl *ctrl =
2912 container_of(work, struct nvme_fc_ctrl, ctrl.reset_work);
2913 int ret;
2914
2915 __nvme_fc_terminate_io(ctrl);
2916
2917 nvme_stop_ctrl(&ctrl->ctrl);
2886 2918
2887 if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) 2919 if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
2888 ret = nvme_fc_create_association(ctrl); 2920 ret = nvme_fc_create_association(ctrl);
@@ -2897,6 +2929,24 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
2897 ctrl->cnum); 2929 ctrl->cnum);
2898} 2930}
2899 2931
2932static void
2933nvme_fc_connect_err_work(struct work_struct *work)
2934{
2935 struct nvme_fc_ctrl *ctrl =
2936 container_of(work, struct nvme_fc_ctrl, err_work);
2937
2938 __nvme_fc_terminate_io(ctrl);
2939
2940 atomic_set(&ctrl->err_work_active, 0);
2941
2942 /*
2943 * Rescheduling the connection after recovering
2944 * from the io error is left to the reconnect work
2945 * item, which is what should have stalled waiting on
2946 * the io that had the error that scheduled this work.
2947 */
2948}
2949
2900static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { 2950static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
2901 .name = "fc", 2951 .name = "fc",
2902 .module = THIS_MODULE, 2952 .module = THIS_MODULE,
@@ -3007,6 +3057,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
3007 ctrl->cnum = idx; 3057 ctrl->cnum = idx;
3008 ctrl->ioq_live = false; 3058 ctrl->ioq_live = false;
3009 ctrl->assoc_active = false; 3059 ctrl->assoc_active = false;
3060 atomic_set(&ctrl->err_work_active, 0);
3010 init_waitqueue_head(&ctrl->ioabort_wait); 3061 init_waitqueue_head(&ctrl->ioabort_wait);
3011 3062
3012 get_device(ctrl->dev); 3063 get_device(ctrl->dev);
@@ -3014,6 +3065,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
3014 3065
3015 INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); 3066 INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
3016 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); 3067 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
3068 INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work);
3017 spin_lock_init(&ctrl->lock); 3069 spin_lock_init(&ctrl->lock);
3018 3070
3019 /* io queue count */ 3071 /* io queue count */
@@ -3103,6 +3155,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
3103fail_ctrl: 3155fail_ctrl:
3104 nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); 3156 nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
3105 cancel_work_sync(&ctrl->ctrl.reset_work); 3157 cancel_work_sync(&ctrl->ctrl.reset_work);
3158 cancel_work_sync(&ctrl->err_work);
3106 cancel_delayed_work_sync(&ctrl->connect_work); 3159 cancel_delayed_work_sync(&ctrl->connect_work);
3107 3160
3108 ctrl->ctrl.opts = NULL; 3161 ctrl->ctrl.opts = NULL;