author     Linus Torvalds <torvalds@linux-foundation.org>   2014-12-13 17:22:26 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-12-13 17:22:26 -0500
commit     9ea18f8cab5f1c36cdd0f09717e35ceb48c36a87 (patch)
tree       0c8da7ac47cb59fe39f177ab0407f554aff77194 /drivers
parent     caf292ae5bb9d57198ce001d8b762f7abae3a94d (diff)
parent     849c6e7746e4f6317ace6aa7d2fcdcd844e99ddb (diff)
Merge branch 'for-3.19/drivers' of git://git.kernel.dk/linux-block
Pull block layer driver updates from Jens Axboe:

 - NVMe updates:
      - The blk-mq conversion from Matias (and others)
      - A stack of NVMe bug fixes from the nvme tree, mostly from Keith.
      - Various bug fixes from me, fixing issues in both the blk-mq
        conversion and generic bugs.
      - Abort and CPU online fix from Sam.
      - Hot add/remove fix from Indraneel.

 - A couple of drbd fixes from the drbd team (Andreas, Lars, Philipp)

 - With the generic IO stat accounting from 3.19/core, converting md,
   bcache, and rsxx to use those.  From Gu Zheng.

 - Boundary check for queue/irq mode for null_blk from Matias.  Fixes
   cases where invalid values could be given, causing the device to hang.

 - The xen blkfront pull request, with two bug fixes from Vitaly.

* 'for-3.19/drivers' of git://git.kernel.dk/linux-block: (56 commits)
  NVMe: fix race condition in nvme_submit_sync_cmd()
  NVMe: fix retry/error logic in nvme_queue_rq()
  NVMe: Fix FS mount issue (hot-remove followed by hot-add)
  NVMe: fix error return checking from blk_mq_alloc_request()
  NVMe: fix freeing of wrong request in abort path
  xen/blkfront: remove redundant flush_op
  xen/blkfront: improve protection against issuing unsupported REQ_FUA
  NVMe: Fix command setup on IO retry
  null_blk: boundary check queue_mode and irqmode
  block/rsxx: use generic io stats accounting functions to simplify io stat accounting
  md: use generic io stats accounting functions to simplify io stat accounting
  drbd: use generic io stats accounting functions to simplify io stat accounting
  md/bcache: use generic io stats accounting functions to simplify io stat accounting
  NVMe: Update module version major number
  NVMe: fail pci initialization if the device doesn't have any BARs
  NVMe: add ->exit_hctx() hook
  NVMe: make setup work for devices that don't do INTx
  NVMe: enable IO stats by default
  NVMe: nvme_submit_async_admin_req() must use atomic rq allocation
  NVMe: replace blk_put_request() with blk_mq_free_request()
  ...
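Several drivers in this pull (drbd, md, bcache, rsxx) replace open-coded part_stat_*() sequences with the generic IO accounting helpers that landed via the 3.19/core branch. Below is a minimal sketch of that pattern for a bio-based driver, assuming the 3.19-era generic_start_io_acct()/generic_end_io_acct() signatures; the example_* helpers are hypothetical, not part of any driver in this pull.

```c
#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

/*
 * Hypothetical bio-based driver helpers (sketch only): account a bio
 * against the whole-disk partition using the generic 3.19 helpers.
 */
static void example_start_acct(struct gendisk *disk, struct bio *bio,
			       unsigned long *start_jif)
{
	*start_jif = jiffies;
	generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
			      &disk->part0);
}

static void example_end_acct(struct gendisk *disk, struct bio *bio,
			     unsigned long start_jif)
{
	generic_end_io_acct(bio_data_dir(bio), &disk->part0, start_jif);
}
```

The drbd_req.c hunk further down shows the same two calls replacing the old part_stat_lock()/part_round_stats() sequence in _drbd_start_io_acct() and _drbd_end_io_acct().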
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/block/drbd/drbd_actlog.c       3
-rw-r--r--  drivers/block/drbd/drbd_int.h         39
-rw-r--r--  drivers/block/drbd/drbd_main.c        23
-rw-r--r--  drivers/block/drbd/drbd_nl.c          64
-rw-r--r--  drivers/block/drbd/drbd_receiver.c     2
-rw-r--r--  drivers/block/drbd/drbd_req.c         25
-rw-r--r--  drivers/block/drbd/drbd_state.c       42
-rw-r--r--  drivers/block/drbd/drbd_state.h        5
-rw-r--r--  drivers/block/drbd/drbd_worker.c       5
-rw-r--r--  drivers/block/null_blk.c              42
-rw-r--r--  drivers/block/nvme-core.c           1594
-rw-r--r--  drivers/block/nvme-scsi.c            162
-rw-r--r--  drivers/block/rsxx/dev.c              29
-rw-r--r--  drivers/block/xen-blkfront.c          65
-rw-r--r--  drivers/md/bcache/request.c           23
-rw-r--r--  drivers/md/dm.c                       13
-rw-r--r--  drivers/md/md.c                        6
17 files changed, 952 insertions(+), 1190 deletions(-)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index a2dfa169237d..1318e3217cb0 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -827,8 +827,7 @@ static int update_sync_bits(struct drbd_device *device,
827 * 827 *
828 */ 828 */
829int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 829int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
830 enum update_sync_bits_mode mode, 830 enum update_sync_bits_mode mode)
831 const char *file, const unsigned int line)
832{ 831{
833 /* Is called from worker and receiver context _only_ */ 832 /* Is called from worker and receiver context _only_ */
834 unsigned long sbnr, ebnr, lbnr; 833 unsigned long sbnr, ebnr, lbnr;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9b22f8f01b57..b905e9888b88 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1454,7 +1454,6 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1454 1454
1455 1455
1456/* drbd_nl.c */ 1456/* drbd_nl.c */
1457extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
1458extern void drbd_suspend_io(struct drbd_device *device); 1457extern void drbd_suspend_io(struct drbd_device *device);
1459extern void drbd_resume_io(struct drbd_device *device); 1458extern void drbd_resume_io(struct drbd_device *device);
1460extern char *ppsize(char *buf, unsigned long long size); 1459extern char *ppsize(char *buf, unsigned long long size);
@@ -1558,52 +1557,31 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
1558extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); 1557extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
1559extern int drbd_connected(struct drbd_peer_device *); 1558extern int drbd_connected(struct drbd_peer_device *);
1560 1559
1561/* Yes, there is kernel_setsockopt, but only since 2.6.18.
1562 * So we have our own copy of it here. */
1563static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
1564 char *optval, int optlen)
1565{
1566 mm_segment_t oldfs = get_fs();
1567 char __user *uoptval;
1568 int err;
1569
1570 uoptval = (char __user __force *)optval;
1571
1572 set_fs(KERNEL_DS);
1573 if (level == SOL_SOCKET)
1574 err = sock_setsockopt(sock, level, optname, uoptval, optlen);
1575 else
1576 err = sock->ops->setsockopt(sock, level, optname, uoptval,
1577 optlen);
1578 set_fs(oldfs);
1579 return err;
1580}
1581
1582static inline void drbd_tcp_cork(struct socket *sock) 1560static inline void drbd_tcp_cork(struct socket *sock)
1583{ 1561{
1584 int val = 1; 1562 int val = 1;
1585 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, 1563 (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
1586 (char*)&val, sizeof(val)); 1564 (char*)&val, sizeof(val));
1587} 1565}
1588 1566
1589static inline void drbd_tcp_uncork(struct socket *sock) 1567static inline void drbd_tcp_uncork(struct socket *sock)
1590{ 1568{
1591 int val = 0; 1569 int val = 0;
1592 (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, 1570 (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
1593 (char*)&val, sizeof(val)); 1571 (char*)&val, sizeof(val));
1594} 1572}
1595 1573
1596static inline void drbd_tcp_nodelay(struct socket *sock) 1574static inline void drbd_tcp_nodelay(struct socket *sock)
1597{ 1575{
1598 int val = 1; 1576 int val = 1;
1599 (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, 1577 (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
1600 (char*)&val, sizeof(val)); 1578 (char*)&val, sizeof(val));
1601} 1579}
1602 1580
1603static inline void drbd_tcp_quickack(struct socket *sock) 1581static inline void drbd_tcp_quickack(struct socket *sock)
1604{ 1582{
1605 int val = 2; 1583 int val = 2;
1606 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, 1584 (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
1607 (char*)&val, sizeof(val)); 1585 (char*)&val, sizeof(val));
1608} 1586}
1609 1587
@@ -1662,14 +1640,13 @@ extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long stil
1662 1640
1663enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; 1641enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
1664extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 1642extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
1665 enum update_sync_bits_mode mode, 1643 enum update_sync_bits_mode mode);
1666 const char *file, const unsigned int line);
1667#define drbd_set_in_sync(device, sector, size) \ 1644#define drbd_set_in_sync(device, sector, size) \
1668 __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) 1645 __drbd_change_sync(device, sector, size, SET_IN_SYNC)
1669#define drbd_set_out_of_sync(device, sector, size) \ 1646#define drbd_set_out_of_sync(device, sector, size) \
1670 __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) 1647 __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC)
1671#define drbd_rs_failed_io(device, sector, size) \ 1648#define drbd_rs_failed_io(device, sector, size) \
1672 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) 1649 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
1673extern void drbd_al_shrink(struct drbd_device *device); 1650extern void drbd_al_shrink(struct drbd_device *device);
1674extern int drbd_initialize_al(struct drbd_device *, void *); 1651extern int drbd_initialize_al(struct drbd_device *, void *);
1675 1652
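The drbd_int.h hunk above drops drbd's private set_fs()-based drbd_setsockopt() copy and calls kernel_setsockopt() directly, which (as the deleted comment notes) has been available since 2.6.18. A minimal sketch of corking a kernel-owned TCP socket that way, assuming the kernel_setsockopt() signature of this era; example_tcp_cork() is a hypothetical helper:

```c
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/tcp.h>
#include <linux/types.h>

/* Hypothetical helper (sketch only): cork or uncork a kernel-side TCP socket. */
static void example_tcp_cork(struct socket *sock, bool cork)
{
	int val = cork ? 1 : 0;

	/* kernel_setsockopt() performs the KERNEL_DS address-space switch
	 * internally, so no get_fs()/set_fs() is needed in the caller. */
	(void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
				 (char *)&val, sizeof(val));
}
```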
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 973c185c9cfe..1fc83427199c 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2532,10 +2532,6 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
2532 2532
2533 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) 2533 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
2534 return -ENOMEM; 2534 return -ENOMEM;
2535 /*
2536 retcode = ERR_NOMEM;
2537 drbd_msg_put_info("unable to allocate cpumask");
2538 */
2539 2535
2540 /* silently ignore cpu mask on UP kernel */ 2536 /* silently ignore cpu mask on UP kernel */
2541 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { 2537 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
@@ -2731,7 +2727,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2731 2727
2732 device = minor_to_device(minor); 2728 device = minor_to_device(minor);
2733 if (device) 2729 if (device)
2734 return ERR_MINOR_EXISTS; 2730 return ERR_MINOR_OR_VOLUME_EXISTS;
2735 2731
2736 /* GFP_KERNEL, we are outside of all write-out paths */ 2732 /* GFP_KERNEL, we are outside of all write-out paths */
2737 device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); 2733 device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
@@ -2793,20 +2789,16 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2793 2789
2794 id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); 2790 id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
2795 if (id < 0) { 2791 if (id < 0) {
2796 if (id == -ENOSPC) { 2792 if (id == -ENOSPC)
2797 err = ERR_MINOR_EXISTS; 2793 err = ERR_MINOR_OR_VOLUME_EXISTS;
2798 drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
2799 }
2800 goto out_no_minor_idr; 2794 goto out_no_minor_idr;
2801 } 2795 }
2802 kref_get(&device->kref); 2796 kref_get(&device->kref);
2803 2797
2804 id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); 2798 id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
2805 if (id < 0) { 2799 if (id < 0) {
2806 if (id == -ENOSPC) { 2800 if (id == -ENOSPC)
2807 err = ERR_MINOR_EXISTS; 2801 err = ERR_MINOR_OR_VOLUME_EXISTS;
2808 drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
2809 }
2810 goto out_idr_remove_minor; 2802 goto out_idr_remove_minor;
2811 } 2803 }
2812 kref_get(&device->kref); 2804 kref_get(&device->kref);
@@ -2825,10 +2817,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2825 2817
2826 id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); 2818 id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
2827 if (id < 0) { 2819 if (id < 0) {
2828 if (id == -ENOSPC) { 2820 if (id == -ENOSPC)
2829 err = ERR_INVALID_REQUEST; 2821 err = ERR_INVALID_REQUEST;
2830 drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
2831 }
2832 goto out_idr_remove_from_resource; 2822 goto out_idr_remove_from_resource;
2833 } 2823 }
2834 kref_get(&connection->kref); 2824 kref_get(&connection->kref);
@@ -2836,7 +2826,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2836 2826
2837 if (init_submitter(device)) { 2827 if (init_submitter(device)) {
2838 err = ERR_NOMEM; 2828 err = ERR_NOMEM;
2839 drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
2840 goto out_idr_remove_vol; 2829 goto out_idr_remove_vol;
2841 } 2830 }
2842 2831
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 1cd47df44bda..74df8cfad414 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -92,7 +92,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
92 92
93/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only 93/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
94 * reason it could fail was no space in skb, and there are 4k available. */ 94 * reason it could fail was no space in skb, and there are 4k available. */
95int drbd_msg_put_info(struct sk_buff *skb, const char *info) 95static int drbd_msg_put_info(struct sk_buff *skb, const char *info)
96{ 96{
97 struct nlattr *nla; 97 struct nlattr *nla;
98 int err = -EMSGSIZE; 98 int err = -EMSGSIZE;
@@ -588,7 +588,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
588 val.i = 0; val.role = new_role; 588 val.i = 0; val.role = new_role;
589 589
590 while (try++ < max_tries) { 590 while (try++ < max_tries) {
591 rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE); 591 rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE);
592 592
593 /* in case we first succeeded to outdate, 593 /* in case we first succeeded to outdate,
594 * but now suddenly could establish a connection */ 594 * but now suddenly could establish a connection */
@@ -2052,7 +2052,7 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c
2052 rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); 2052 rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
2053 rcu_read_unlock(); 2053 rcu_read_unlock();
2054 2054
2055 /* connection->volumes protected by genl_lock() here */ 2055 /* connection->peer_devices protected by genl_lock() here */
2056 idr_for_each_entry(&connection->peer_devices, peer_device, i) { 2056 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
2057 struct drbd_device *device = peer_device->device; 2057 struct drbd_device *device = peer_device->device;
2058 if (!device->bitmap) { 2058 if (!device->bitmap) {
@@ -3483,7 +3483,7 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3483 * that first_peer_device(device)->connection and device->vnr match the request. */ 3483 * that first_peer_device(device)->connection and device->vnr match the request. */
3484 if (adm_ctx.device) { 3484 if (adm_ctx.device) {
3485 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) 3485 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
3486 retcode = ERR_MINOR_EXISTS; 3486 retcode = ERR_MINOR_OR_VOLUME_EXISTS;
3487 /* else: still NO_ERROR */ 3487 /* else: still NO_ERROR */
3488 goto out; 3488 goto out;
3489 } 3489 }
@@ -3530,6 +3530,27 @@ out:
3530 return 0; 3530 return 0;
3531} 3531}
3532 3532
3533static int adm_del_resource(struct drbd_resource *resource)
3534{
3535 struct drbd_connection *connection;
3536
3537 for_each_connection(connection, resource) {
3538 if (connection->cstate > C_STANDALONE)
3539 return ERR_NET_CONFIGURED;
3540 }
3541 if (!idr_is_empty(&resource->devices))
3542 return ERR_RES_IN_USE;
3543
3544 list_del_rcu(&resource->resources);
3545 /* Make sure all threads have actually stopped: state handling only
3546 * does drbd_thread_stop_nowait(). */
3547 list_for_each_entry(connection, &resource->connections, connections)
3548 drbd_thread_stop(&connection->worker);
3549 synchronize_rcu();
3550 drbd_free_resource(resource);
3551 return NO_ERROR;
3552}
3553
3533int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) 3554int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3534{ 3555{
3535 struct drbd_config_context adm_ctx; 3556 struct drbd_config_context adm_ctx;
@@ -3575,14 +3596,6 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3575 } 3596 }
3576 } 3597 }
3577 3598
3578 /* If we reach this, all volumes (of this connection) are Secondary,
3579 * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
3580 * actually stopped, state handling only does drbd_thread_stop_nowait(). */
3581 for_each_connection(connection, resource)
3582 drbd_thread_stop(&connection->worker);
3583
3584 /* Now, nothing can fail anymore */
3585
3586 /* delete volumes */ 3599 /* delete volumes */
3587 idr_for_each_entry(&resource->devices, device, i) { 3600 idr_for_each_entry(&resource->devices, device, i) {
3588 retcode = adm_del_minor(device); 3601 retcode = adm_del_minor(device);
@@ -3593,10 +3606,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3593 } 3606 }
3594 } 3607 }
3595 3608
3596 list_del_rcu(&resource->resources); 3609 retcode = adm_del_resource(resource);
3597 synchronize_rcu();
3598 drbd_free_resource(resource);
3599 retcode = NO_ERROR;
3600out: 3610out:
3601 mutex_unlock(&resource->adm_mutex); 3611 mutex_unlock(&resource->adm_mutex);
3602finish: 3612finish:
@@ -3608,7 +3618,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3608{ 3618{
3609 struct drbd_config_context adm_ctx; 3619 struct drbd_config_context adm_ctx;
3610 struct drbd_resource *resource; 3620 struct drbd_resource *resource;
3611 struct drbd_connection *connection;
3612 enum drbd_ret_code retcode; 3621 enum drbd_ret_code retcode;
3613 3622
3614 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); 3623 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
@@ -3616,27 +3625,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3616 return retcode; 3625 return retcode;
3617 if (retcode != NO_ERROR) 3626 if (retcode != NO_ERROR)
3618 goto finish; 3627 goto finish;
3619
3620 resource = adm_ctx.resource; 3628 resource = adm_ctx.resource;
3621 mutex_lock(&resource->adm_mutex);
3622 for_each_connection(connection, resource) {
3623 if (connection->cstate > C_STANDALONE) {
3624 retcode = ERR_NET_CONFIGURED;
3625 goto out;
3626 }
3627 }
3628 if (!idr_is_empty(&resource->devices)) {
3629 retcode = ERR_RES_IN_USE;
3630 goto out;
3631 }
3632 3629
3633 list_del_rcu(&resource->resources); 3630 mutex_lock(&resource->adm_mutex);
3634 for_each_connection(connection, resource) 3631 retcode = adm_del_resource(resource);
3635 drbd_thread_stop(&connection->worker);
3636 synchronize_rcu();
3637 drbd_free_resource(resource);
3638 retcode = NO_ERROR;
3639out:
3640 mutex_unlock(&resource->adm_mutex); 3632 mutex_unlock(&resource->adm_mutex);
3641finish: 3633finish:
3642 drbd_adm_finish(&adm_ctx, info, retcode); 3634 drbd_adm_finish(&adm_ctx, info, retcode);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6960fb064731..d169b4a79267 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2482,7 +2482,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2482 atomic_read(&device->rs_sect_ev); 2482 atomic_read(&device->rs_sect_ev);
2483 2483
2484 if (atomic_read(&device->ap_actlog_cnt) 2484 if (atomic_read(&device->ap_actlog_cnt)
2485 || !device->rs_last_events || curr_events - device->rs_last_events > 64) { 2485 || curr_events - device->rs_last_events > 64) {
2486 unsigned long rs_left; 2486 unsigned long rs_left;
2487 int i; 2487 int i;
2488 2488
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 5a01c53dddeb..34f2f0ba409b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -36,29 +36,15 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector,
36/* Update disk stats at start of I/O request */ 36/* Update disk stats at start of I/O request */
37static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) 37static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req)
38{ 38{
39 const int rw = bio_data_dir(req->master_bio); 39 generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9,
40 int cpu; 40 &device->vdisk->part0);
41 cpu = part_stat_lock();
42 part_round_stats(cpu, &device->vdisk->part0);
43 part_stat_inc(cpu, &device->vdisk->part0, ios[rw]);
44 part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9);
45 (void) cpu; /* The macro invocations above want the cpu argument, I do not like
46 the compiler warning about cpu only assigned but never used... */
47 part_inc_in_flight(&device->vdisk->part0, rw);
48 part_stat_unlock();
49} 41}
50 42
51/* Update disk stats when completing request upwards */ 43/* Update disk stats when completing request upwards */
52static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) 44static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
53{ 45{
54 int rw = bio_data_dir(req->master_bio); 46 generic_end_io_acct(bio_data_dir(req->master_bio),
55 unsigned long duration = jiffies - req->start_jif; 47 &device->vdisk->part0, req->start_jif);
56 int cpu;
57 cpu = part_stat_lock();
58 part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
59 part_round_stats(cpu, &device->vdisk->part0);
60 part_dec_in_flight(&device->vdisk->part0, rw);
61 part_stat_unlock();
62} 48}
63 49
64static struct drbd_request *drbd_req_new(struct drbd_device *device, 50static struct drbd_request *drbd_req_new(struct drbd_device *device,
@@ -1545,6 +1531,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1545 struct request_queue * const b = 1531 struct request_queue * const b =
1546 device->ldev->backing_bdev->bd_disk->queue; 1532 device->ldev->backing_bdev->bd_disk->queue;
1547 if (b->merge_bvec_fn) { 1533 if (b->merge_bvec_fn) {
1534 bvm->bi_bdev = device->ldev->backing_bdev;
1548 backing_limit = b->merge_bvec_fn(b, bvm, bvec); 1535 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1549 limit = min(limit, backing_limit); 1536 limit = min(limit, backing_limit);
1550 } 1537 }
@@ -1628,7 +1615,7 @@ void request_timer_fn(unsigned long data)
1628 time_after(now, req_peer->pre_send_jif + ent) && 1615 time_after(now, req_peer->pre_send_jif + ent) &&
1629 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { 1616 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
1630 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); 1617 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
1631 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); 1618 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
1632 } 1619 }
1633 if (dt && oldest_submit_jif != now && 1620 if (dt && oldest_submit_jif != now &&
1634 time_after(now, oldest_submit_jif + dt) && 1621 time_after(now, oldest_submit_jif + dt) &&
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 84b11f887d73..2d7dd269b6a8 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -215,6 +215,18 @@ static bool no_peer_wf_report_params(struct drbd_connection *connection)
215 return rv; 215 return rv;
216} 216}
217 217
218static void wake_up_all_devices(struct drbd_connection *connection)
219{
220 struct drbd_peer_device *peer_device;
221 int vnr;
222
223 rcu_read_lock();
224 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
225 wake_up(&peer_device->device->state_wait);
226 rcu_read_unlock();
227
228}
229
218 230
219/** 231/**
220 * cl_wide_st_chg() - true if the state change is a cluster wide one 232 * cl_wide_st_chg() - true if the state change is a cluster wide one
@@ -410,6 +422,22 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
410 return rv; 422 return rv;
411} 423}
412 424
425enum drbd_state_rv
426_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
427 union drbd_state val, enum chg_state_flags f)
428{
429 enum drbd_state_rv rv;
430
431 BUG_ON(f & CS_SERIALIZE);
432
433 wait_event_cmd(device->state_wait,
434 (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
435 mutex_unlock(device->state_mutex),
436 mutex_lock(device->state_mutex));
437
438 return rv;
439}
440
413static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) 441static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
414{ 442{
415 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", 443 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
@@ -629,14 +657,11 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
629 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) 657 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
630 rv = SS_IN_TRANSIENT_STATE; 658 rv = SS_IN_TRANSIENT_STATE;
631 659
632 /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
633 rv = SS_IN_TRANSIENT_STATE; */
634
635 /* While establishing a connection only allow cstate to change. 660 /* While establishing a connection only allow cstate to change.
636 Delay/refuse role changes, detach attach etc... */ 661 Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
637 if (test_bit(STATE_SENT, &connection->flags) && 662 if (test_bit(STATE_SENT, &connection->flags) &&
638 !(os.conn == C_WF_REPORT_PARAMS || 663 !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
639 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) 664 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
640 rv = SS_IN_TRANSIENT_STATE; 665 rv = SS_IN_TRANSIENT_STATE;
641 666
642 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 667 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
@@ -1032,8 +1057,10 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1032 1057
1033 /* Wake up role changes, that were delayed because of connection establishing */ 1058 /* Wake up role changes, that were delayed because of connection establishing */
1034 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && 1059 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
1035 no_peer_wf_report_params(connection)) 1060 no_peer_wf_report_params(connection)) {
1036 clear_bit(STATE_SENT, &connection->flags); 1061 clear_bit(STATE_SENT, &connection->flags);
1062 wake_up_all_devices(connection);
1063 }
1037 1064
1038 wake_up(&device->misc_wait); 1065 wake_up(&device->misc_wait);
1039 wake_up(&device->state_wait); 1066 wake_up(&device->state_wait);
@@ -1072,7 +1099,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1072 1099
1073 set_ov_position(device, ns.conn); 1100 set_ov_position(device, ns.conn);
1074 device->rs_start = now; 1101 device->rs_start = now;
1075 device->rs_last_events = 0;
1076 device->rs_last_sect_ev = 0; 1102 device->rs_last_sect_ev = 0;
1077 device->ov_last_oos_size = 0; 1103 device->ov_last_oos_size = 0;
1078 device->ov_last_oos_start = 0; 1104 device->ov_last_oos_start = 0;
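The new _drbd_request_state_holding_state_mutex() above relies on wait_event_cmd() so that the state mutex, which the caller already holds, is released for the duration of each sleep and re-acquired before the condition is re-evaluated. A stripped-down sketch of that lock-juggling pattern with hypothetical names (the wait_event_cmd() macro itself is the stock one from <linux/wait.h>):

```c
#include <linux/mutex.h>
#include <linux/wait.h>

/*
 * Hypothetical helper (sketch only): wait until try_change() stops
 * reporting -EBUSY while the caller holds `lock`; the mutex is dropped
 * around each sleep so other state changes can make progress.
 */
static int example_wait_holding_mutex(wait_queue_head_t *wq,
				      struct mutex *lock,
				      int (*try_change)(void))
{
	int rv;

	wait_event_cmd(*wq,
		       (rv = try_change()) != -EBUSY,
		       mutex_unlock(lock),	/* runs before sleeping */
		       mutex_lock(lock));	/* runs after waking */
	return rv;
}
```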
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index cc41605ba21c..7f53c40823cd 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -117,6 +117,11 @@ extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
117 union drbd_state, 117 union drbd_state,
118 union drbd_state, 118 union drbd_state,
119 enum chg_state_flags); 119 enum chg_state_flags);
120
121extern enum drbd_state_rv
122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
123 union drbd_state, enum chg_state_flags);
124
120extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, 125extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
121 enum chg_state_flags, 126 enum chg_state_flags,
122 struct completion *done); 127 struct completion *done);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d2d1f97511bd..d0fae55d871d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1592,11 +1592,15 @@ void drbd_resync_after_changed(struct drbd_device *device)
1592 1592
1593void drbd_rs_controller_reset(struct drbd_device *device) 1593void drbd_rs_controller_reset(struct drbd_device *device)
1594{ 1594{
1595 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
1595 struct fifo_buffer *plan; 1596 struct fifo_buffer *plan;
1596 1597
1597 atomic_set(&device->rs_sect_in, 0); 1598 atomic_set(&device->rs_sect_in, 0);
1598 atomic_set(&device->rs_sect_ev, 0); 1599 atomic_set(&device->rs_sect_ev, 0);
1599 device->rs_in_flight = 0; 1600 device->rs_in_flight = 0;
1601 device->rs_last_events =
1602 (int)part_stat_read(&disk->part0, sectors[0]) +
1603 (int)part_stat_read(&disk->part0, sectors[1]);
1600 1604
1601 /* Updating the RCU protected object in place is necessary since 1605 /* Updating the RCU protected object in place is necessary since
1602 this function gets called from atomic context. 1606 this function gets called from atomic context.
@@ -1743,7 +1747,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1743 device->rs_failed = 0; 1747 device->rs_failed = 0;
1744 device->rs_paused = 0; 1748 device->rs_paused = 0;
1745 device->rs_same_csum = 0; 1749 device->rs_same_csum = 0;
1746 device->rs_last_events = 0;
1747 device->rs_last_sect_ev = 0; 1750 device->rs_last_sect_ev = 0;
1748 device->rs_total = tw; 1751 device->rs_total = tw;
1749 device->rs_start = now; 1752 device->rs_start = now;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index caa61212fdb5..ae9f615382f6 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -78,7 +78,33 @@ module_param(home_node, int, S_IRUGO);
78MODULE_PARM_DESC(home_node, "Home node for the device"); 78MODULE_PARM_DESC(home_node, "Home node for the device");
79 79
80static int queue_mode = NULL_Q_MQ; 80static int queue_mode = NULL_Q_MQ;
81module_param(queue_mode, int, S_IRUGO); 81
82static int null_param_store_val(const char *str, int *val, int min, int max)
83{
84 int ret, new_val;
85
86 ret = kstrtoint(str, 10, &new_val);
87 if (ret)
88 return -EINVAL;
89
90 if (new_val < min || new_val > max)
91 return -EINVAL;
92
93 *val = new_val;
94 return 0;
95}
96
97static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
98{
99 return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
100}
101
102static struct kernel_param_ops null_queue_mode_param_ops = {
103 .set = null_set_queue_mode,
104 .get = param_get_int,
105};
106
107device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
82MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); 108MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
83 109
84static int gb = 250; 110static int gb = 250;
@@ -94,7 +120,19 @@ module_param(nr_devices, int, S_IRUGO);
94MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 120MODULE_PARM_DESC(nr_devices, "Number of devices to register");
95 121
96static int irqmode = NULL_IRQ_SOFTIRQ; 122static int irqmode = NULL_IRQ_SOFTIRQ;
97module_param(irqmode, int, S_IRUGO); 123
124static int null_set_irqmode(const char *str, const struct kernel_param *kp)
125{
126 return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
127 NULL_IRQ_TIMER);
128}
129
130static struct kernel_param_ops null_irqmode_param_ops = {
131 .set = null_set_irqmode,
132 .get = param_get_int,
133};
134
135device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
98MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); 136MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
99 137
100static int completion_nsec = 10000; 138static int completion_nsec = 10000;
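The null_blk change above wraps queue_mode and irqmode in custom kernel_param_ops so out-of-range values are rejected at module-parameter parse time instead of hanging the device later. A condensed sketch of the same range-checked parameter pattern, using a hypothetical example_mode parameter and the stock moduleparam helpers:

```c
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/stat.h>

static int example_mode;	/* hypothetical parameter, valid range [0, 2] */

/* Reject anything outside the valid range when the parameter is written. */
static int example_set_mode(const char *str, const struct kernel_param *kp)
{
	int ret, val;

	ret = kstrtoint(str, 10, &val);
	if (ret || val < 0 || val > 2)
		return -EINVAL;

	*(int *)kp->arg = val;
	return 0;
}

static const struct kernel_param_ops example_mode_ops = {
	.set = example_set_mode,
	.get = param_get_int,
};

device_param_cb(example_mode, &example_mode_ops, &example_mode, S_IRUGO);
MODULE_PARM_DESC(example_mode, "Hypothetical mode selector (0-2)");
```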
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e2bb8afbeae5..b1d5d8797315 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -13,9 +13,9 @@
13 */ 13 */
14 14
15#include <linux/nvme.h> 15#include <linux/nvme.h>
16#include <linux/bio.h>
17#include <linux/bitops.h> 16#include <linux/bitops.h>
18#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/blk-mq.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -33,7 +33,6 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/pci.h> 35#include <linux/pci.h>
36#include <linux/percpu.h>
37#include <linux/poison.h> 36#include <linux/poison.h>
38#include <linux/ptrace.h> 37#include <linux/ptrace.h>
39#include <linux/sched.h> 38#include <linux/sched.h>
@@ -42,12 +41,12 @@
42#include <scsi/sg.h> 41#include <scsi/sg.h>
43#include <asm-generic/io-64-nonatomic-lo-hi.h> 42#include <asm-generic/io-64-nonatomic-lo-hi.h>
44 43
45#include <trace/events/block.h>
46
47#define NVME_Q_DEPTH 1024 44#define NVME_Q_DEPTH 1024
45#define NVME_AQ_DEPTH 64
48#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 46#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
49#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 47#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
50#define ADMIN_TIMEOUT (admin_timeout * HZ) 48#define ADMIN_TIMEOUT (admin_timeout * HZ)
49#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
51#define IOD_TIMEOUT (retry_time * HZ) 50#define IOD_TIMEOUT (retry_time * HZ)
52 51
53static unsigned char admin_timeout = 60; 52static unsigned char admin_timeout = 60;
@@ -62,6 +61,10 @@ static unsigned char retry_time = 30;
62module_param(retry_time, byte, 0644); 61module_param(retry_time, byte, 0644);
63MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); 62MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
64 63
64static unsigned char shutdown_timeout = 5;
65module_param(shutdown_timeout, byte, 0644);
66MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
67
65static int nvme_major; 68static int nvme_major;
66module_param(nvme_major, int, 0); 69module_param(nvme_major, int, 0);
67 70
@@ -76,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait;
76static struct notifier_block nvme_nb; 79static struct notifier_block nvme_nb;
77 80
78static void nvme_reset_failed_dev(struct work_struct *ws); 81static void nvme_reset_failed_dev(struct work_struct *ws);
82static int nvme_process_cq(struct nvme_queue *nvmeq);
79 83
80struct async_cmd_info { 84struct async_cmd_info {
81 struct kthread_work work; 85 struct kthread_work work;
82 struct kthread_worker *worker; 86 struct kthread_worker *worker;
87 struct request *req;
83 u32 result; 88 u32 result;
84 int status; 89 int status;
85 void *ctx; 90 void *ctx;
@@ -90,7 +95,7 @@ struct async_cmd_info {
90 * commands and one for I/O commands). 95 * commands and one for I/O commands).
91 */ 96 */
92struct nvme_queue { 97struct nvme_queue {
93 struct rcu_head r_head; 98 struct llist_node node;
94 struct device *q_dmadev; 99 struct device *q_dmadev;
95 struct nvme_dev *dev; 100 struct nvme_dev *dev;
96 char irqname[24]; /* nvme4294967295-65535\0 */ 101 char irqname[24]; /* nvme4294967295-65535\0 */
@@ -99,10 +104,6 @@ struct nvme_queue {
99 volatile struct nvme_completion *cqes; 104 volatile struct nvme_completion *cqes;
100 dma_addr_t sq_dma_addr; 105 dma_addr_t sq_dma_addr;
101 dma_addr_t cq_dma_addr; 106 dma_addr_t cq_dma_addr;
102 wait_queue_head_t sq_full;
103 wait_queue_t sq_cong_wait;
104 struct bio_list sq_cong;
105 struct list_head iod_bio;
106 u32 __iomem *q_db; 107 u32 __iomem *q_db;
107 u16 q_depth; 108 u16 q_depth;
108 u16 cq_vector; 109 u16 cq_vector;
@@ -112,10 +113,8 @@ struct nvme_queue {
112 u16 qid; 113 u16 qid;
113 u8 cq_phase; 114 u8 cq_phase;
114 u8 cqe_seen; 115 u8 cqe_seen;
115 u8 q_suspended;
116 cpumask_var_t cpu_mask;
117 struct async_cmd_info cmdinfo; 116 struct async_cmd_info cmdinfo;
118 unsigned long cmdid_data[]; 117 struct blk_mq_hw_ctx *hctx;
119}; 118};
120 119
121/* 120/*
@@ -143,62 +142,79 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
143struct nvme_cmd_info { 142struct nvme_cmd_info {
144 nvme_completion_fn fn; 143 nvme_completion_fn fn;
145 void *ctx; 144 void *ctx;
146 unsigned long timeout;
147 int aborted; 145 int aborted;
146 struct nvme_queue *nvmeq;
148}; 147};
149 148
150static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) 149static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
150 unsigned int hctx_idx)
151{ 151{
152 return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; 152 struct nvme_dev *dev = data;
153 struct nvme_queue *nvmeq = dev->queues[0];
154
155 WARN_ON(nvmeq->hctx);
156 nvmeq->hctx = hctx;
157 hctx->driver_data = nvmeq;
158 return 0;
153} 159}
154 160
155static unsigned nvme_queue_extra(int depth) 161static int nvme_admin_init_request(void *data, struct request *req,
162 unsigned int hctx_idx, unsigned int rq_idx,
163 unsigned int numa_node)
156{ 164{
157 return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); 165 struct nvme_dev *dev = data;
166 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
167 struct nvme_queue *nvmeq = dev->queues[0];
168
169 BUG_ON(!nvmeq);
170 cmd->nvmeq = nvmeq;
171 return 0;
158} 172}
159 173
160/** 174static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
161 * alloc_cmdid() - Allocate a Command ID
162 * @nvmeq: The queue that will be used for this command
163 * @ctx: A pointer that will be passed to the handler
164 * @handler: The function to call on completion
165 *
166 * Allocate a Command ID for a queue. The data passed in will
167 * be passed to the completion handler. This is implemented by using
168 * the bottom two bits of the ctx pointer to store the handler ID.
169 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
170 * We can change this if it becomes a problem.
171 *
172 * May be called with local interrupts disabled and the q_lock held,
173 * or with interrupts enabled and no locks held.
174 */
175static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
176 nvme_completion_fn handler, unsigned timeout)
177{ 175{
178 int depth = nvmeq->q_depth - 1; 176 struct nvme_queue *nvmeq = hctx->driver_data;
179 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
180 int cmdid;
181 177
182 do { 178 nvmeq->hctx = NULL;
183 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); 179}
184 if (cmdid >= depth) 180
185 return -EBUSY; 181static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
186 } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); 182 unsigned int hctx_idx)
183{
184 struct nvme_dev *dev = data;
185 struct nvme_queue *nvmeq = dev->queues[
186 (hctx_idx % dev->queue_count) + 1];
187
188 if (!nvmeq->hctx)
189 nvmeq->hctx = hctx;
187 190
188 info[cmdid].fn = handler; 191 /* nvmeq queues are shared between namespaces. We assume here that
189 info[cmdid].ctx = ctx; 192 * blk-mq map the tags so they match up with the nvme queue tags. */
190 info[cmdid].timeout = jiffies + timeout; 193 WARN_ON(nvmeq->hctx->tags != hctx->tags);
191 info[cmdid].aborted = 0; 194
192 return cmdid; 195 hctx->driver_data = nvmeq;
196 return 0;
193} 197}
194 198
195static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, 199static int nvme_init_request(void *data, struct request *req,
196 nvme_completion_fn handler, unsigned timeout) 200 unsigned int hctx_idx, unsigned int rq_idx,
201 unsigned int numa_node)
197{ 202{
198 int cmdid; 203 struct nvme_dev *dev = data;
199 wait_event_killable(nvmeq->sq_full, 204 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
200 (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); 205 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
201 return (cmdid < 0) ? -EINTR : cmdid; 206
207 BUG_ON(!nvmeq);
208 cmd->nvmeq = nvmeq;
209 return 0;
210}
211
212static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
213 nvme_completion_fn handler)
214{
215 cmd->fn = handler;
216 cmd->ctx = ctx;
217 cmd->aborted = 0;
202} 218}
203 219
204/* Special values must be less than 0x1000 */ 220/* Special values must be less than 0x1000 */
@@ -206,17 +222,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
206#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 222#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
207#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 223#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
208#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 224#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
209#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE)
210 225
211static void special_completion(struct nvme_queue *nvmeq, void *ctx, 226static void special_completion(struct nvme_queue *nvmeq, void *ctx,
212 struct nvme_completion *cqe) 227 struct nvme_completion *cqe)
213{ 228{
214 if (ctx == CMD_CTX_CANCELLED) 229 if (ctx == CMD_CTX_CANCELLED)
215 return; 230 return;
216 if (ctx == CMD_CTX_ABORT) {
217 ++nvmeq->dev->abort_limit;
218 return;
219 }
220 if (ctx == CMD_CTX_COMPLETED) { 231 if (ctx == CMD_CTX_COMPLETED) {
221 dev_warn(nvmeq->q_dmadev, 232 dev_warn(nvmeq->q_dmadev,
222 "completed id %d twice on queue %d\n", 233 "completed id %d twice on queue %d\n",
@@ -229,99 +240,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx,
229 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 240 cqe->command_id, le16_to_cpup(&cqe->sq_id));
230 return; 241 return;
231 } 242 }
232
233 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); 243 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
234} 244}
235 245
236static void async_completion(struct nvme_queue *nvmeq, void *ctx, 246static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
237 struct nvme_completion *cqe)
238{
239 struct async_cmd_info *cmdinfo = ctx;
240 cmdinfo->result = le32_to_cpup(&cqe->result);
241 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
242 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
243}
244
245/*
246 * Called with local interrupts disabled and the q_lock held. May not sleep.
247 */
248static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
249 nvme_completion_fn *fn)
250{ 247{
251 void *ctx; 248 void *ctx;
252 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
253 249
254 if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
255 if (fn)
256 *fn = special_completion;
257 return CMD_CTX_INVALID;
258 }
259 if (fn) 250 if (fn)
260 *fn = info[cmdid].fn; 251 *fn = cmd->fn;
261 ctx = info[cmdid].ctx; 252 ctx = cmd->ctx;
262 info[cmdid].fn = special_completion; 253 cmd->fn = special_completion;
263 info[cmdid].ctx = CMD_CTX_COMPLETED; 254 cmd->ctx = CMD_CTX_CANCELLED;
264 clear_bit(cmdid, nvmeq->cmdid_data);
265 wake_up(&nvmeq->sq_full);
266 return ctx; 255 return ctx;
267} 256}
268 257
269static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, 258static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
270 nvme_completion_fn *fn) 259 struct nvme_completion *cqe)
271{ 260{
272 void *ctx; 261 struct request *req = ctx;
273 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
274 if (fn)
275 *fn = info[cmdid].fn;
276 ctx = info[cmdid].ctx;
277 info[cmdid].fn = special_completion;
278 info[cmdid].ctx = CMD_CTX_CANCELLED;
279 return ctx;
280}
281 262
282static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) 263 u32 result = le32_to_cpup(&cqe->result);
283{ 264 u16 status = le16_to_cpup(&cqe->status) >> 1;
284 return rcu_dereference_raw(dev->queues[qid]); 265
266 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
267 ++nvmeq->dev->event_limit;
268 if (status == NVME_SC_SUCCESS)
269 dev_warn(nvmeq->q_dmadev,
270 "async event result %08x\n", result);
271
272 blk_mq_free_hctx_request(nvmeq->hctx, req);
285} 273}
286 274
287static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) 275static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
276 struct nvme_completion *cqe)
288{ 277{
289 struct nvme_queue *nvmeq; 278 struct request *req = ctx;
290 unsigned queue_id = get_cpu_var(*dev->io_queue);
291 279
292 rcu_read_lock(); 280 u16 status = le16_to_cpup(&cqe->status) >> 1;
293 nvmeq = rcu_dereference(dev->queues[queue_id]); 281 u32 result = le32_to_cpup(&cqe->result);
294 if (nvmeq)
295 return nvmeq;
296 282
297 rcu_read_unlock(); 283 blk_mq_free_hctx_request(nvmeq->hctx, req);
298 put_cpu_var(*dev->io_queue); 284
299 return NULL; 285 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
286 ++nvmeq->dev->abort_limit;
300} 287}
301 288
302static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) 289static void async_completion(struct nvme_queue *nvmeq, void *ctx,
290 struct nvme_completion *cqe)
303{ 291{
304 rcu_read_unlock(); 292 struct async_cmd_info *cmdinfo = ctx;
305 put_cpu_var(nvmeq->dev->io_queue); 293 cmdinfo->result = le32_to_cpup(&cqe->result);
294 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
295 queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
296 blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req);
306} 297}
307 298
308static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) 299static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
309 __acquires(RCU) 300 unsigned int tag)
310{ 301{
311 struct nvme_queue *nvmeq; 302 struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
303 struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
312 304
313 rcu_read_lock(); 305 return blk_mq_rq_to_pdu(req);
314 nvmeq = rcu_dereference(dev->queues[q_idx]);
315 if (nvmeq)
316 return nvmeq;
317
318 rcu_read_unlock();
319 return NULL;
320} 306}
321 307
322static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) 308/*
309 * Called with local interrupts disabled and the q_lock held. May not sleep.
310 */
311static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
312 nvme_completion_fn *fn)
323{ 313{
324 rcu_read_unlock(); 314 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
315 void *ctx;
316 if (tag >= nvmeq->q_depth) {
317 *fn = special_completion;
318 return CMD_CTX_INVALID;
319 }
320 if (fn)
321 *fn = cmd->fn;
322 ctx = cmd->ctx;
323 cmd->fn = special_completion;
324 cmd->ctx = CMD_CTX_COMPLETED;
325 return ctx;
325} 326}
326 327
327/** 328/**
@@ -331,26 +332,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
331 * 332 *
332 * Safe to use from interrupt context 333 * Safe to use from interrupt context
333 */ 334 */
334static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 335static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
335{ 336{
336 unsigned long flags; 337 u16 tail = nvmeq->sq_tail;
337 u16 tail; 338
338 spin_lock_irqsave(&nvmeq->q_lock, flags);
339 if (nvmeq->q_suspended) {
340 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
341 return -EBUSY;
342 }
343 tail = nvmeq->sq_tail;
344 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 339 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
345 if (++tail == nvmeq->q_depth) 340 if (++tail == nvmeq->q_depth)
346 tail = 0; 341 tail = 0;
347 writel(tail, nvmeq->q_db); 342 writel(tail, nvmeq->q_db);
348 nvmeq->sq_tail = tail; 343 nvmeq->sq_tail = tail;
349 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
350 344
351 return 0; 345 return 0;
352} 346}
353 347
348static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
349{
350 unsigned long flags;
351 int ret;
352 spin_lock_irqsave(&nvmeq->q_lock, flags);
353 ret = __nvme_submit_cmd(nvmeq, cmd);
354 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
355 return ret;
356}
357
354static __le64 **iod_list(struct nvme_iod *iod) 358static __le64 **iod_list(struct nvme_iod *iod)
355{ 359{
356 return ((void *)iod) + iod->offset; 360 return ((void *)iod) + iod->offset;
@@ -361,17 +365,17 @@ static __le64 **iod_list(struct nvme_iod *iod)
361 * as it only leads to a small amount of wasted memory for the lifetime of 365 * as it only leads to a small amount of wasted memory for the lifetime of
362 * the I/O. 366 * the I/O.
363 */ 367 */
364static int nvme_npages(unsigned size) 368static int nvme_npages(unsigned size, struct nvme_dev *dev)
365{ 369{
366 unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); 370 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
367 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 371 return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
368} 372}
369 373
370static struct nvme_iod * 374static struct nvme_iod *
371nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) 375nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
372{ 376{
373 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 377 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
374 sizeof(__le64 *) * nvme_npages(nbytes) + 378 sizeof(__le64 *) * nvme_npages(nbytes, dev) +
375 sizeof(struct scatterlist) * nseg, gfp); 379 sizeof(struct scatterlist) * nseg, gfp);
376 380
377 if (iod) { 381 if (iod) {
@@ -380,7 +384,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
380 iod->length = nbytes; 384 iod->length = nbytes;
381 iod->nents = 0; 385 iod->nents = 0;
382 iod->first_dma = 0ULL; 386 iod->first_dma = 0ULL;
383 iod->start_time = jiffies;
384 } 387 }
385 388
386 return iod; 389 return iod;
@@ -388,7 +391,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
388 391
389void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 392void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
390{ 393{
391 const int last_prp = PAGE_SIZE / 8 - 1; 394 const int last_prp = dev->page_size / 8 - 1;
392 int i; 395 int i;
393 __le64 **list = iod_list(iod); 396 __le64 **list = iod_list(iod);
394 dma_addr_t prp_dma = iod->first_dma; 397 dma_addr_t prp_dma = iod->first_dma;
@@ -404,65 +407,49 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
404 kfree(iod); 407 kfree(iod);
405} 408}
406 409
407static void nvme_start_io_acct(struct bio *bio) 410static int nvme_error_status(u16 status)
408{
409 struct gendisk *disk = bio->bi_bdev->bd_disk;
410 if (blk_queue_io_stat(disk->queue)) {
411 const int rw = bio_data_dir(bio);
412 int cpu = part_stat_lock();
413 part_round_stats(cpu, &disk->part0);
414 part_stat_inc(cpu, &disk->part0, ios[rw]);
415 part_stat_add(cpu, &disk->part0, sectors[rw],
416 bio_sectors(bio));
417 part_inc_in_flight(&disk->part0, rw);
418 part_stat_unlock();
419 }
420}
421
422static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
423{ 411{
424 struct gendisk *disk = bio->bi_bdev->bd_disk; 412 switch (status & 0x7ff) {
425 if (blk_queue_io_stat(disk->queue)) { 413 case NVME_SC_SUCCESS:
426 const int rw = bio_data_dir(bio); 414 return 0;
427 unsigned long duration = jiffies - start_time; 415 case NVME_SC_CAP_EXCEEDED:
428 int cpu = part_stat_lock(); 416 return -ENOSPC;
429 part_stat_add(cpu, &disk->part0, ticks[rw], duration); 417 default:
430 part_round_stats(cpu, &disk->part0); 418 return -EIO;
431 part_dec_in_flight(&disk->part0, rw);
432 part_stat_unlock();
433 } 419 }
434} 420}
435 421
436static void bio_completion(struct nvme_queue *nvmeq, void *ctx, 422static void req_completion(struct nvme_queue *nvmeq, void *ctx,
437 struct nvme_completion *cqe) 423 struct nvme_completion *cqe)
438{ 424{
439 struct nvme_iod *iod = ctx; 425 struct nvme_iod *iod = ctx;
440 struct bio *bio = iod->private; 426 struct request *req = iod->private;
427 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
428
441 u16 status = le16_to_cpup(&cqe->status) >> 1; 429 u16 status = le16_to_cpup(&cqe->status) >> 1;
442 int error = 0;
443 430
444 if (unlikely(status)) { 431 if (unlikely(status)) {
445 if (!(status & NVME_SC_DNR || 432 if (!(status & NVME_SC_DNR || blk_noretry_request(req))
446 bio->bi_rw & REQ_FAILFAST_MASK) && 433 && (jiffies - req->start_time) < req->timeout) {
447 (jiffies - iod->start_time) < IOD_TIMEOUT) { 434 blk_mq_requeue_request(req);
448 if (!waitqueue_active(&nvmeq->sq_full)) 435 blk_mq_kick_requeue_list(req->q);
449 add_wait_queue(&nvmeq->sq_full,
450 &nvmeq->sq_cong_wait);
451 list_add_tail(&iod->node, &nvmeq->iod_bio);
452 wake_up(&nvmeq->sq_full);
453 return; 436 return;
454 } 437 }
455 error = -EIO; 438 req->errors = nvme_error_status(status);
456 } 439 } else
457 if (iod->nents) { 440 req->errors = 0;
458 dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, 441
459 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 442 if (cmd_rq->aborted)
460 nvme_end_io_acct(bio, iod->start_time); 443 dev_warn(&nvmeq->dev->pci_dev->dev,
461 } 444 "completing aborted command with status:%04x\n",
445 status);
446
447 if (iod->nents)
448 dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
449 rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
462 nvme_free_iod(nvmeq->dev, iod); 450 nvme_free_iod(nvmeq->dev, iod);
463 451
464 trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); 452 blk_mq_complete_request(req);
465 bio_endio(bio, error);
466} 453}
467 454
468/* length is in bytes. gfp flags indicates whether we may sleep. */ 455/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -479,26 +466,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
479 __le64 **list = iod_list(iod); 466 __le64 **list = iod_list(iod);
480 dma_addr_t prp_dma; 467 dma_addr_t prp_dma;
481 int nprps, i; 468 int nprps, i;
469 u32 page_size = dev->page_size;
482 470
483 length -= (PAGE_SIZE - offset); 471 length -= (page_size - offset);
484 if (length <= 0) 472 if (length <= 0)
485 return total_len; 473 return total_len;
486 474
487 dma_len -= (PAGE_SIZE - offset); 475 dma_len -= (page_size - offset);
488 if (dma_len) { 476 if (dma_len) {
489 dma_addr += (PAGE_SIZE - offset); 477 dma_addr += (page_size - offset);
490 } else { 478 } else {
491 sg = sg_next(sg); 479 sg = sg_next(sg);
492 dma_addr = sg_dma_address(sg); 480 dma_addr = sg_dma_address(sg);
493 dma_len = sg_dma_len(sg); 481 dma_len = sg_dma_len(sg);
494 } 482 }
495 483
496 if (length <= PAGE_SIZE) { 484 if (length <= page_size) {
497 iod->first_dma = dma_addr; 485 iod->first_dma = dma_addr;
498 return total_len; 486 return total_len;
499 } 487 }
500 488
501 nprps = DIV_ROUND_UP(length, PAGE_SIZE); 489 nprps = DIV_ROUND_UP(length, page_size);
502 if (nprps <= (256 / 8)) { 490 if (nprps <= (256 / 8)) {
503 pool = dev->prp_small_pool; 491 pool = dev->prp_small_pool;
504 iod->npages = 0; 492 iod->npages = 0;
@@ -511,13 +499,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
511 if (!prp_list) { 499 if (!prp_list) {
512 iod->first_dma = dma_addr; 500 iod->first_dma = dma_addr;
513 iod->npages = -1; 501 iod->npages = -1;
514 return (total_len - length) + PAGE_SIZE; 502 return (total_len - length) + page_size;
515 } 503 }
516 list[0] = prp_list; 504 list[0] = prp_list;
517 iod->first_dma = prp_dma; 505 iod->first_dma = prp_dma;
518 i = 0; 506 i = 0;
519 for (;;) { 507 for (;;) {
520 if (i == PAGE_SIZE / 8) { 508 if (i == page_size >> 3) {
521 __le64 *old_prp_list = prp_list; 509 __le64 *old_prp_list = prp_list;
522 prp_list = dma_pool_alloc(pool, gfp, &prp_dma); 510 prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
523 if (!prp_list) 511 if (!prp_list)
@@ -528,9 +516,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
528 i = 1; 516 i = 1;
529 } 517 }
530 prp_list[i++] = cpu_to_le64(dma_addr); 518 prp_list[i++] = cpu_to_le64(dma_addr);
531 dma_len -= PAGE_SIZE; 519 dma_len -= page_size;
532 dma_addr += PAGE_SIZE; 520 dma_addr += page_size;
533 length -= PAGE_SIZE; 521 length -= page_size;
534 if (length <= 0) 522 if (length <= 0)
535 break; 523 break;
536 if (dma_len > 0) 524 if (dma_len > 0)
@@ -544,88 +532,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
544 return total_len; 532 return total_len;
545} 533}
546 534
547static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, 535/*
548 int len) 536 * We reuse the small pool to allocate the 16-byte range here as it is not
549{ 537 * worth having a special pool for these or additional cases to handle freeing
550 struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); 538 * the iod.
551 if (!split) 539 */
552 return -ENOMEM; 540static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
553 541 struct request *req, struct nvme_iod *iod)
554 trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
555 split->bi_iter.bi_sector);
556 bio_chain(split, bio);
557
558 if (!waitqueue_active(&nvmeq->sq_full))
559 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
560 bio_list_add(&nvmeq->sq_cong, split);
561 bio_list_add(&nvmeq->sq_cong, bio);
562 wake_up(&nvmeq->sq_full);
563
564 return 0;
565}
566
567/* NVMe scatterlists require no holes in the virtual address */
568#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
569 (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
570
571static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
572 struct bio *bio, enum dma_data_direction dma_dir, int psegs)
573{
574 struct bio_vec bvec, bvprv;
575 struct bvec_iter iter;
576 struct scatterlist *sg = NULL;
577 int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
578 int first = 1;
579
580 if (nvmeq->dev->stripe_size)
581 split_len = nvmeq->dev->stripe_size -
582 ((bio->bi_iter.bi_sector << 9) &
583 (nvmeq->dev->stripe_size - 1));
584
585 sg_init_table(iod->sg, psegs);
586 bio_for_each_segment(bvec, bio, iter) {
587 if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
588 sg->length += bvec.bv_len;
589 } else {
590 if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
591 return nvme_split_and_submit(bio, nvmeq,
592 length);
593
594 sg = sg ? sg + 1 : iod->sg;
595 sg_set_page(sg, bvec.bv_page,
596 bvec.bv_len, bvec.bv_offset);
597 nsegs++;
598 }
599
600 if (split_len - length < bvec.bv_len)
601 return nvme_split_and_submit(bio, nvmeq, split_len);
602 length += bvec.bv_len;
603 bvprv = bvec;
604 first = 0;
605 }
606 iod->nents = nsegs;
607 sg_mark_end(sg);
608 if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
609 return -ENOMEM;
610
611 BUG_ON(length != bio->bi_iter.bi_size);
612 return length;
613}
614
615static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
616 struct bio *bio, struct nvme_iod *iod, int cmdid)
617{ 542{
618 struct nvme_dsm_range *range = 543 struct nvme_dsm_range *range =
619 (struct nvme_dsm_range *)iod_list(iod)[0]; 544 (struct nvme_dsm_range *)iod_list(iod)[0];
620 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 545 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
621 546
622 range->cattr = cpu_to_le32(0); 547 range->cattr = cpu_to_le32(0);
623 range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); 548 range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
624 range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 549 range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
625 550
626 memset(cmnd, 0, sizeof(*cmnd)); 551 memset(cmnd, 0, sizeof(*cmnd));
627 cmnd->dsm.opcode = nvme_cmd_dsm; 552 cmnd->dsm.opcode = nvme_cmd_dsm;
628 cmnd->dsm.command_id = cmdid; 553 cmnd->dsm.command_id = req->tag;
629 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 554 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
630 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); 555 cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
631 cmnd->dsm.nr = 0; 556 cmnd->dsm.nr = 0;
@@ -634,11 +559,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
634 if (++nvmeq->sq_tail == nvmeq->q_depth) 559 if (++nvmeq->sq_tail == nvmeq->q_depth)
635 nvmeq->sq_tail = 0; 560 nvmeq->sq_tail = 0;
636 writel(nvmeq->sq_tail, nvmeq->q_db); 561 writel(nvmeq->sq_tail, nvmeq->q_db);
637
638 return 0;
639} 562}
640 563
641static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, 564static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
642 int cmdid) 565 int cmdid)
643{ 566{
644 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 567 struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
@@ -651,49 +574,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
651 if (++nvmeq->sq_tail == nvmeq->q_depth) 574 if (++nvmeq->sq_tail == nvmeq->q_depth)
652 nvmeq->sq_tail = 0; 575 nvmeq->sq_tail = 0;
653 writel(nvmeq->sq_tail, nvmeq->q_db); 576 writel(nvmeq->sq_tail, nvmeq->q_db);
654
655 return 0;
656} 577}
657 578
658static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) 579static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
580 struct nvme_ns *ns)
659{ 581{
660 struct bio *bio = iod->private; 582 struct request *req = iod->private;
661 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
662 struct nvme_command *cmnd; 583 struct nvme_command *cmnd;
663 int cmdid; 584 u16 control = 0;
664 u16 control; 585 u32 dsmgmt = 0;
665 u32 dsmgmt;
666
667 cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
668 if (unlikely(cmdid < 0))
669 return cmdid;
670 586
671 if (bio->bi_rw & REQ_DISCARD) 587 if (req->cmd_flags & REQ_FUA)
672 return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
673 if (bio->bi_rw & REQ_FLUSH)
674 return nvme_submit_flush(nvmeq, ns, cmdid);
675
676 control = 0;
677 if (bio->bi_rw & REQ_FUA)
678 control |= NVME_RW_FUA; 588 control |= NVME_RW_FUA;
679 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 589 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
680 control |= NVME_RW_LR; 590 control |= NVME_RW_LR;
681 591
682 dsmgmt = 0; 592 if (req->cmd_flags & REQ_RAHEAD)
683 if (bio->bi_rw & REQ_RAHEAD)
684 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 593 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
685 594
686 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; 595 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
687 memset(cmnd, 0, sizeof(*cmnd)); 596 memset(cmnd, 0, sizeof(*cmnd));
688 597
689 cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; 598 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
690 cmnd->rw.command_id = cmdid; 599 cmnd->rw.command_id = req->tag;
691 cmnd->rw.nsid = cpu_to_le32(ns->ns_id); 600 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
692 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 601 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
693 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); 602 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
694 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); 603 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
695 cmnd->rw.length = 604 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
696 cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
697 cmnd->rw.control = cpu_to_le16(control); 605 cmnd->rw.control = cpu_to_le16(control);
698 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 606 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
699 607
@@ -704,45 +612,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
704 return 0; 612 return 0;
705} 613}
706 614
707static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) 615static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
708{ 616 const struct blk_mq_queue_data *bd)
709 struct bio *split = bio_clone(bio, GFP_ATOMIC);
710 if (!split)
711 return -ENOMEM;
712
713 split->bi_iter.bi_size = 0;
714 split->bi_phys_segments = 0;
715 bio->bi_rw &= ~REQ_FLUSH;
716 bio_chain(split, bio);
717
718 if (!waitqueue_active(&nvmeq->sq_full))
719 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
720 bio_list_add(&nvmeq->sq_cong, split);
721 bio_list_add(&nvmeq->sq_cong, bio);
722 wake_up_process(nvme_thread);
723
724 return 0;
725}
726
727/*
728 * Called with local interrupts disabled and the q_lock held. May not sleep.
729 */
730static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
731 struct bio *bio)
732{ 617{
618 struct nvme_ns *ns = hctx->queue->queuedata;
619 struct nvme_queue *nvmeq = hctx->driver_data;
620 struct request *req = bd->rq;
621 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
733 struct nvme_iod *iod; 622 struct nvme_iod *iod;
734 int psegs = bio_phys_segments(ns->queue, bio); 623 int psegs = req->nr_phys_segments;
735 int result; 624 enum dma_data_direction dma_dir;
625 unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
626 sizeof(struct nvme_dsm_range);
736 627
737 if ((bio->bi_rw & REQ_FLUSH) && psegs) 628 iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
738 return nvme_split_flush_data(nvmeq, bio);
739
740 iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
741 if (!iod) 629 if (!iod)
742 return -ENOMEM; 630 return BLK_MQ_RQ_QUEUE_BUSY;
743 631
744 iod->private = bio; 632 iod->private = req;
745 if (bio->bi_rw & REQ_DISCARD) { 633
634 if (req->cmd_flags & REQ_DISCARD) {
746 void *range; 635 void *range;
747 /* 636 /*
748 * We reuse the small pool to allocate the 16-byte range here 637 * We reuse the small pool to allocate the 16-byte range here
@@ -752,35 +641,50 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
752 range = dma_pool_alloc(nvmeq->dev->prp_small_pool, 641 range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
753 GFP_ATOMIC, 642 GFP_ATOMIC,
754 &iod->first_dma); 643 &iod->first_dma);
755 if (!range) { 644 if (!range)
756 result = -ENOMEM; 645 goto retry_cmd;
757 goto free_iod;
758 }
759 iod_list(iod)[0] = (__le64 *)range; 646 iod_list(iod)[0] = (__le64 *)range;
760 iod->npages = 0; 647 iod->npages = 0;
761 } else if (psegs) { 648 } else if (psegs) {
762 result = nvme_map_bio(nvmeq, iod, bio, 649 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
763 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, 650
764 psegs); 651 sg_init_table(iod->sg, psegs);
765 if (result <= 0) 652 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
766 goto free_iod; 653 if (!iod->nents)
767 if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != 654 goto error_cmd;
768 result) { 655
769 result = -ENOMEM; 656 if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
770 goto free_iod; 657 goto retry_cmd;
658
659 if (blk_rq_bytes(req) !=
660 nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
661 dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
662 iod->nents, dma_dir);
663 goto retry_cmd;
771 } 664 }
772 nvme_start_io_acct(bio);
773 } 665 }
774 if (unlikely(nvme_submit_iod(nvmeq, iod))) {
775 if (!waitqueue_active(&nvmeq->sq_full))
776 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
777 list_add_tail(&iod->node, &nvmeq->iod_bio);
778 }
779 return 0;
780 666
781 free_iod: 667 blk_mq_start_request(req);
668
669 nvme_set_info(cmd, iod, req_completion);
670 spin_lock_irq(&nvmeq->q_lock);
671 if (req->cmd_flags & REQ_DISCARD)
672 nvme_submit_discard(nvmeq, ns, req, iod);
673 else if (req->cmd_flags & REQ_FLUSH)
674 nvme_submit_flush(nvmeq, ns, req->tag);
675 else
676 nvme_submit_iod(nvmeq, iod, ns);
677
678 nvme_process_cq(nvmeq);
679 spin_unlock_irq(&nvmeq->q_lock);
680 return BLK_MQ_RQ_QUEUE_OK;
681
682 error_cmd:
782 nvme_free_iod(nvmeq->dev, iod); 683 nvme_free_iod(nvmeq->dev, iod);
783 return result; 684 return BLK_MQ_RQ_QUEUE_ERROR;
685 retry_cmd:
686 nvme_free_iod(nvmeq->dev, iod);
687 return BLK_MQ_RQ_QUEUE_BUSY;
784} 688}
785 689
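/*
 * Editor's note, not part of the commit: the two error labels in the
 * converted nvme_queue_rq() above map to different blk-mq outcomes.
 * retry_cmd returns BLK_MQ_RQ_QUEUE_BUSY for transient failures (iod or
 * DSM-range allocation, DMA mapping, PRP setup), so the block layer
 * requeues the request and tries again later; error_cmd returns
 * BLK_MQ_RQ_QUEUE_ERROR, which completes the request with an error.
 */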
786static int nvme_process_cq(struct nvme_queue *nvmeq) 690static int nvme_process_cq(struct nvme_queue *nvmeq)
@@ -801,8 +705,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
801 head = 0; 705 head = 0;
802 phase = !phase; 706 phase = !phase;
803 } 707 }
804 708 ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
805 ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
806 fn(nvmeq, ctx, &cqe); 709 fn(nvmeq, ctx, &cqe);
807 } 710 }
808 711
@@ -823,29 +726,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
823 return 1; 726 return 1;
824} 727}
825 728
826static void nvme_make_request(struct request_queue *q, struct bio *bio) 729/* Admin queue isn't initialized as a request queue. If at some point this
730 * happens anyway, make sure to notify the user */
731static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
732 const struct blk_mq_queue_data *bd)
827{ 733{
828 struct nvme_ns *ns = q->queuedata; 734 WARN_ON_ONCE(1);
829 struct nvme_queue *nvmeq = get_nvmeq(ns->dev); 735 return BLK_MQ_RQ_QUEUE_ERROR;
830 int result = -EBUSY;
831
832 if (!nvmeq) {
833 bio_endio(bio, -EIO);
834 return;
835 }
836
837 spin_lock_irq(&nvmeq->q_lock);
838 if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
839 result = nvme_submit_bio_queue(nvmeq, ns, bio);
840 if (unlikely(result)) {
841 if (!waitqueue_active(&nvmeq->sq_full))
842 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
843 bio_list_add(&nvmeq->sq_cong, bio);
844 }
845
846 nvme_process_cq(nvmeq);
847 spin_unlock_irq(&nvmeq->q_lock);
848 put_nvmeq(nvmeq);
849} 736}
850 737
851static irqreturn_t nvme_irq(int irq, void *data) 738static irqreturn_t nvme_irq(int irq, void *data)
@@ -869,10 +756,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
869 return IRQ_WAKE_THREAD; 756 return IRQ_WAKE_THREAD;
870} 757}
871 758
872static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) 759static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info *
760 cmd_info)
873{ 761{
874 spin_lock_irq(&nvmeq->q_lock); 762 spin_lock_irq(&nvmeq->q_lock);
875 cancel_cmdid(nvmeq, cmdid, NULL); 763 cancel_cmd_info(cmd_info, NULL);
876 spin_unlock_irq(&nvmeq->q_lock); 764 spin_unlock_irq(&nvmeq->q_lock);
877} 765}
878 766
@@ -895,47 +783,40 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
895 * Returns 0 on success. If the result is negative, it's a Linux error code; 783 * Returns 0 on success. If the result is negative, it's a Linux error code;
896 * if the result is positive, it's an NVM Express status code 784 * if the result is positive, it's an NVM Express status code
897 */ 785 */
898static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, 786static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
899 struct nvme_command *cmd,
900 u32 *result, unsigned timeout) 787 u32 *result, unsigned timeout)
901{ 788{
902 int cmdid, ret; 789 int ret;
903 struct sync_cmd_info cmdinfo; 790 struct sync_cmd_info cmdinfo;
904 struct nvme_queue *nvmeq; 791 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
905 792 struct nvme_queue *nvmeq = cmd_rq->nvmeq;
906 nvmeq = lock_nvmeq(dev, q_idx);
907 if (!nvmeq)
908 return -ENODEV;
909 793
910 cmdinfo.task = current; 794 cmdinfo.task = current;
911 cmdinfo.status = -EINTR; 795 cmdinfo.status = -EINTR;
912 796
913 cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); 797 cmd->common.command_id = req->tag;
914 if (cmdid < 0) { 798
915 unlock_nvmeq(nvmeq); 799 nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
916 return cmdid;
917 }
918 cmd->common.command_id = cmdid;
919 800
920 set_current_state(TASK_KILLABLE); 801 set_current_state(TASK_KILLABLE);
921 ret = nvme_submit_cmd(nvmeq, cmd); 802 ret = nvme_submit_cmd(nvmeq, cmd);
922 if (ret) { 803 if (ret) {
923 free_cmdid(nvmeq, cmdid, NULL); 804 nvme_finish_cmd(nvmeq, req->tag, NULL);
924 unlock_nvmeq(nvmeq);
925 set_current_state(TASK_RUNNING); 805 set_current_state(TASK_RUNNING);
926 return ret;
927 } 806 }
928 unlock_nvmeq(nvmeq); 807 ret = schedule_timeout(timeout);
929 schedule_timeout(timeout); 808
930 809 /*
931 if (cmdinfo.status == -EINTR) { 810 * Ensure that sync_completion has either run, or that it will
932 nvmeq = lock_nvmeq(dev, q_idx); 811 * never run.
933 if (nvmeq) { 812 */
934 nvme_abort_command(nvmeq, cmdid); 813 nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
935 unlock_nvmeq(nvmeq); 814
936 } 815 /*
816 * We never got the completion
817 */
818 if (cmdinfo.status == -EINTR)
937 return -EINTR; 819 return -EINTR;
938 }
939 820
940 if (result) 821 if (result)
941 *result = cmdinfo.result; 822 *result = cmdinfo.result;
@@ -943,59 +824,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
943 return cmdinfo.status; 824 return cmdinfo.status;
944} 825}
945 826
946static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, 827static int nvme_submit_async_admin_req(struct nvme_dev *dev)
828{
829 struct nvme_queue *nvmeq = dev->queues[0];
830 struct nvme_command c;
831 struct nvme_cmd_info *cmd_info;
832 struct request *req;
833
834 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false);
835 if (IS_ERR(req))
836 return PTR_ERR(req);
837
838 cmd_info = blk_mq_rq_to_pdu(req);
839 nvme_set_info(cmd_info, req, async_req_completion);
840
841 memset(&c, 0, sizeof(c));
842 c.common.opcode = nvme_admin_async_event;
843 c.common.command_id = req->tag;
844
845 return __nvme_submit_cmd(nvmeq, &c);
846}
847
848static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
947 struct nvme_command *cmd, 849 struct nvme_command *cmd,
948 struct async_cmd_info *cmdinfo, unsigned timeout) 850 struct async_cmd_info *cmdinfo, unsigned timeout)
949{ 851{
950 int cmdid; 852 struct nvme_queue *nvmeq = dev->queues[0];
853 struct request *req;
854 struct nvme_cmd_info *cmd_rq;
855
856 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
857 if (IS_ERR(req))
858 return PTR_ERR(req);
951 859
952 cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); 860 req->timeout = timeout;
953 if (cmdid < 0) 861 cmd_rq = blk_mq_rq_to_pdu(req);
954 return cmdid; 862 cmdinfo->req = req;
863 nvme_set_info(cmd_rq, cmdinfo, async_completion);
955 cmdinfo->status = -EINTR; 864 cmdinfo->status = -EINTR;
956 cmd->common.command_id = cmdid; 865
866 cmd->common.command_id = req->tag;
867
957 return nvme_submit_cmd(nvmeq, cmd); 868 return nvme_submit_cmd(nvmeq, cmd);
958} 869}
959 870
960int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 871static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
961 u32 *result) 872 u32 *result, unsigned timeout)
962{ 873{
963 return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); 874 int res;
875 struct request *req;
876
877 req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
878 if (IS_ERR(req))
879 return PTR_ERR(req);
880 res = nvme_submit_sync_cmd(req, cmd, result, timeout);
881 blk_mq_free_request(req);
882 return res;
964} 883}
965 884
966int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, 885int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
967 u32 *result) 886 u32 *result)
968{ 887{
969 return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, 888 return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
970 NVME_IO_TIMEOUT);
971} 889}
972 890
973static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, 891int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
974 struct nvme_command *cmd, struct async_cmd_info *cmdinfo) 892 struct nvme_command *cmd, u32 *result)
975{ 893{
976 return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, 894 int res;
977 ADMIN_TIMEOUT); 895 struct request *req;
896
897 req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
898 false);
899 if (IS_ERR(req))
900 return PTR_ERR(req);
901 res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
902 blk_mq_free_request(req);
903 return res;
978} 904}
979 905
980static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 906static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
981{ 907{
982 int status;
983 struct nvme_command c; 908 struct nvme_command c;
984 909
985 memset(&c, 0, sizeof(c)); 910 memset(&c, 0, sizeof(c));
986 c.delete_queue.opcode = opcode; 911 c.delete_queue.opcode = opcode;
987 c.delete_queue.qid = cpu_to_le16(id); 912 c.delete_queue.qid = cpu_to_le16(id);
988 913
989 status = nvme_submit_admin_cmd(dev, &c, NULL); 914 return nvme_submit_admin_cmd(dev, &c, NULL);
990 if (status)
991 return -EIO;
992 return 0;
993} 915}
994 916
995static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 917static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
996 struct nvme_queue *nvmeq) 918 struct nvme_queue *nvmeq)
997{ 919{
998 int status;
999 struct nvme_command c; 920 struct nvme_command c;
1000 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 921 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
1001 922
@@ -1007,16 +928,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1007 c.create_cq.cq_flags = cpu_to_le16(flags); 928 c.create_cq.cq_flags = cpu_to_le16(flags);
1008 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 929 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
1009 930
1010 status = nvme_submit_admin_cmd(dev, &c, NULL); 931 return nvme_submit_admin_cmd(dev, &c, NULL);
1011 if (status)
1012 return -EIO;
1013 return 0;
1014} 932}
1015 933
1016static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 934static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1017 struct nvme_queue *nvmeq) 935 struct nvme_queue *nvmeq)
1018{ 936{
1019 int status;
1020 struct nvme_command c; 937 struct nvme_command c;
1021 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 938 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
1022 939
@@ -1028,10 +945,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
1028 c.create_sq.sq_flags = cpu_to_le16(flags); 945 c.create_sq.sq_flags = cpu_to_le16(flags);
1029 c.create_sq.cqid = cpu_to_le16(qid); 946 c.create_sq.cqid = cpu_to_le16(qid);
1030 947
1031 status = nvme_submit_admin_cmd(dev, &c, NULL); 948 return nvme_submit_admin_cmd(dev, &c, NULL);
1032 if (status)
1033 return -EIO;
1034 return 0;
1035} 949}
1036 950
1037static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 951static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1087,28 +1001,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1087} 1001}
1088 1002
1089/** 1003/**
1090 * nvme_abort_cmd - Attempt aborting a command 1004 * nvme_abort_req - Attempt aborting a request
1091 * @cmdid: Command id of a timed out IO
1092 * @queue: The queue with timed out IO
1093 * 1005 *
1094 * Schedule controller reset if the command was already aborted once before and 1006 * Schedule controller reset if the command was already aborted once before and
1095 * still hasn't been returned to the driver, or if this is the admin queue. 1007 * still hasn't been returned to the driver, or if this is the admin queue.
1096 */ 1008 */
1097static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) 1009static void nvme_abort_req(struct request *req)
1098{ 1010{
1099 int a_cmdid; 1011 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
1100 struct nvme_command cmd; 1012 struct nvme_queue *nvmeq = cmd_rq->nvmeq;
1101 struct nvme_dev *dev = nvmeq->dev; 1013 struct nvme_dev *dev = nvmeq->dev;
1102 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 1014 struct request *abort_req;
1103 struct nvme_queue *adminq; 1015 struct nvme_cmd_info *abort_cmd;
1016 struct nvme_command cmd;
1104 1017
1105 if (!nvmeq->qid || info[cmdid].aborted) { 1018 if (!nvmeq->qid || cmd_rq->aborted) {
1106 if (work_busy(&dev->reset_work)) 1019 if (work_busy(&dev->reset_work))
1107 return; 1020 return;
1108 list_del_init(&dev->node); 1021 list_del_init(&dev->node);
1109 dev_warn(&dev->pci_dev->dev, 1022 dev_warn(&dev->pci_dev->dev,
1110 "I/O %d QID %d timeout, reset controller\n", cmdid, 1023 "I/O %d QID %d timeout, reset controller\n",
1111 nvmeq->qid); 1024 req->tag, nvmeq->qid);
1112 dev->reset_workfn = nvme_reset_failed_dev; 1025 dev->reset_workfn = nvme_reset_failed_dev;
1113 queue_work(nvme_workq, &dev->reset_work); 1026 queue_work(nvme_workq, &dev->reset_work);
1114 return; 1027 return;
@@ -1117,120 +1030,110 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
1117 if (!dev->abort_limit) 1030 if (!dev->abort_limit)
1118 return; 1031 return;
1119 1032
1120 adminq = rcu_dereference(dev->queues[0]); 1033 abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
1121 a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, 1034 false);
1122 ADMIN_TIMEOUT); 1035 if (IS_ERR(abort_req))
1123 if (a_cmdid < 0)
1124 return; 1036 return;
1125 1037
1038 abort_cmd = blk_mq_rq_to_pdu(abort_req);
1039 nvme_set_info(abort_cmd, abort_req, abort_completion);
1040
1126 memset(&cmd, 0, sizeof(cmd)); 1041 memset(&cmd, 0, sizeof(cmd));
1127 cmd.abort.opcode = nvme_admin_abort_cmd; 1042 cmd.abort.opcode = nvme_admin_abort_cmd;
1128 cmd.abort.cid = cmdid; 1043 cmd.abort.cid = req->tag;
1129 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 1044 cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
1130 cmd.abort.command_id = a_cmdid; 1045 cmd.abort.command_id = abort_req->tag;
1131 1046
1132 --dev->abort_limit; 1047 --dev->abort_limit;
1133 info[cmdid].aborted = 1; 1048 cmd_rq->aborted = 1;
1134 info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
1135 1049
1136 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, 1050 dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
1137 nvmeq->qid); 1051 nvmeq->qid);
1138 nvme_submit_cmd(adminq, &cmd); 1052 if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
1053 dev_warn(nvmeq->q_dmadev,
1054 "Could not abort I/O %d QID %d",
1055 req->tag, nvmeq->qid);
1056 blk_mq_free_request(abort_req);
1057 }
1139} 1058}
1140 1059
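/*
 * Editor's note, not part of the commit: the Abort command built above
 * carries two identifiers -- abort.cid and abort.sqid name the timed-out
 * I/O (its tag and submission queue), while abort.command_id is the tag of
 * the admin request that carries the abort itself. dev->abort_limit
 * mirrors the controller's advertised Abort Command Limit, so no further
 * aborts are issued once that budget is exhausted.
 */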
1141/** 1060static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
1142 * nvme_cancel_ios - Cancel outstanding I/Os 1061 struct request *req, void *data, bool reserved)
1143 * @queue: The queue to cancel I/Os on
1144 * @timeout: True to only cancel I/Os which have timed out
1145 */
1146static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
1147{ 1062{
1148 int depth = nvmeq->q_depth - 1; 1063 struct nvme_queue *nvmeq = data;
1149 struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); 1064 void *ctx;
1150 unsigned long now = jiffies; 1065 nvme_completion_fn fn;
1151 int cmdid; 1066 struct nvme_cmd_info *cmd;
1067 static struct nvme_completion cqe = {
1068 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
1069 };
1152 1070
1153 for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { 1071 cmd = blk_mq_rq_to_pdu(req);
1154 void *ctx;
1155 nvme_completion_fn fn;
1156 static struct nvme_completion cqe = {
1157 .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
1158 };
1159 1072
1160 if (timeout && !time_after(now, info[cmdid].timeout)) 1073 if (cmd->ctx == CMD_CTX_CANCELLED)
1161 continue; 1074 return;
1162 if (info[cmdid].ctx == CMD_CTX_CANCELLED) 1075
1163 continue; 1076 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
1164 if (timeout && nvmeq->dev->initialized) { 1077 req->tag, nvmeq->qid);
1165 nvme_abort_cmd(cmdid, nvmeq); 1078 ctx = cancel_cmd_info(cmd, &fn);
1166 continue; 1079 fn(nvmeq, ctx, &cqe);
1167 }
1168 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
1169 nvmeq->qid);
1170 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
1171 fn(nvmeq, ctx, &cqe);
1172 }
1173} 1080}
1174 1081
1175static void nvme_free_queue(struct rcu_head *r) 1082static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1176{ 1083{
1177 struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); 1084 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
1085 struct nvme_queue *nvmeq = cmd->nvmeq;
1178 1086
1179 spin_lock_irq(&nvmeq->q_lock); 1087 dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
1180 while (bio_list_peek(&nvmeq->sq_cong)) { 1088 nvmeq->qid);
1181 struct bio *bio = bio_list_pop(&nvmeq->sq_cong); 1089 if (nvmeq->dev->initialized)
1182 bio_endio(bio, -EIO); 1090 nvme_abort_req(req);
1183 } 1091
1184 while (!list_empty(&nvmeq->iod_bio)) { 1092 /*
1185 static struct nvme_completion cqe = { 1093 * The aborted req will be completed on receiving the abort req.
1186 .status = cpu_to_le16( 1094 * We enable the timer again. If hit twice, it'll cause a device reset,
1187 (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), 1095 * as the device then is in a faulty state.
1188 }; 1096 */
1189 struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, 1097 return BLK_EH_RESET_TIMER;
1190 struct nvme_iod, 1098}
1191 node);
1192 list_del(&iod->node);
1193 bio_completion(nvmeq, iod, &cqe);
1194 }
1195 spin_unlock_irq(&nvmeq->q_lock);
1196 1099
1100static void nvme_free_queue(struct nvme_queue *nvmeq)
1101{
1197 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1102 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1198 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1103 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1199 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1104 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1200 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1105 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1201 if (nvmeq->qid)
1202 free_cpumask_var(nvmeq->cpu_mask);
1203 kfree(nvmeq); 1106 kfree(nvmeq);
1204} 1107}
1205 1108
1206static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1109static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1207{ 1110{
1111 LLIST_HEAD(q_list);
1112 struct nvme_queue *nvmeq, *next;
1113 struct llist_node *entry;
1208 int i; 1114 int i;
1209 1115
1210 for (i = dev->queue_count - 1; i >= lowest; i--) { 1116 for (i = dev->queue_count - 1; i >= lowest; i--) {
1211 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 1117 struct nvme_queue *nvmeq = dev->queues[i];
1212 rcu_assign_pointer(dev->queues[i], NULL); 1118 llist_add(&nvmeq->node, &q_list);
1213 call_rcu(&nvmeq->r_head, nvme_free_queue);
1214 dev->queue_count--; 1119 dev->queue_count--;
1120 dev->queues[i] = NULL;
1215 } 1121 }
1122 synchronize_rcu();
1123 entry = llist_del_all(&q_list);
1124 llist_for_each_entry_safe(nvmeq, next, entry, node)
1125 nvme_free_queue(nvmeq);
1216} 1126}
1217 1127
1218/** 1128/**
1219 * nvme_suspend_queue - put queue into suspended state 1129 * nvme_suspend_queue - put queue into suspended state
1220 * @nvmeq - queue to suspend 1130 * @nvmeq - queue to suspend
1221 *
1222 * Returns 1 if already suspended, 0 otherwise.
1223 */ 1131 */
1224static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1132static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1225{ 1133{
1226 int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; 1134 int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
1227 1135
1228 spin_lock_irq(&nvmeq->q_lock); 1136 spin_lock_irq(&nvmeq->q_lock);
1229 if (nvmeq->q_suspended) {
1230 spin_unlock_irq(&nvmeq->q_lock);
1231 return 1;
1232 }
1233 nvmeq->q_suspended = 1;
1234 nvmeq->dev->online_queues--; 1137 nvmeq->dev->online_queues--;
1235 spin_unlock_irq(&nvmeq->q_lock); 1138 spin_unlock_irq(&nvmeq->q_lock);
1236 1139
@@ -1242,15 +1145,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1242 1145
1243static void nvme_clear_queue(struct nvme_queue *nvmeq) 1146static void nvme_clear_queue(struct nvme_queue *nvmeq)
1244{ 1147{
1148 struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
1149
1245 spin_lock_irq(&nvmeq->q_lock); 1150 spin_lock_irq(&nvmeq->q_lock);
1246 nvme_process_cq(nvmeq); 1151 nvme_process_cq(nvmeq);
1247 nvme_cancel_ios(nvmeq, false); 1152 if (hctx && hctx->tags)
1153 blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
1248 spin_unlock_irq(&nvmeq->q_lock); 1154 spin_unlock_irq(&nvmeq->q_lock);
1249} 1155}
1250 1156
1251static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1157static void nvme_disable_queue(struct nvme_dev *dev, int qid)
1252{ 1158{
1253 struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); 1159 struct nvme_queue *nvmeq = dev->queues[qid];
1254 1160
1255 if (!nvmeq) 1161 if (!nvmeq)
1256 return; 1162 return;
@@ -1270,25 +1176,20 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1270 int depth, int vector) 1176 int depth, int vector)
1271{ 1177{
1272 struct device *dmadev = &dev->pci_dev->dev; 1178 struct device *dmadev = &dev->pci_dev->dev;
1273 unsigned extra = nvme_queue_extra(depth); 1179 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
1274 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
1275 if (!nvmeq) 1180 if (!nvmeq)
1276 return NULL; 1181 return NULL;
1277 1182
1278 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), 1183 nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
1279 &nvmeq->cq_dma_addr, GFP_KERNEL); 1184 &nvmeq->cq_dma_addr, GFP_KERNEL);
1280 if (!nvmeq->cqes) 1185 if (!nvmeq->cqes)
1281 goto free_nvmeq; 1186 goto free_nvmeq;
1282 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
1283 1187
1284 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 1188 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
1285 &nvmeq->sq_dma_addr, GFP_KERNEL); 1189 &nvmeq->sq_dma_addr, GFP_KERNEL);
1286 if (!nvmeq->sq_cmds) 1190 if (!nvmeq->sq_cmds)
1287 goto free_cqdma; 1191 goto free_cqdma;
1288 1192
1289 if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
1290 goto free_sqdma;
1291
1292 nvmeq->q_dmadev = dmadev; 1193 nvmeq->q_dmadev = dmadev;
1293 nvmeq->dev = dev; 1194 nvmeq->dev = dev;
1294 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1195 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
@@ -1296,23 +1197,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1296 spin_lock_init(&nvmeq->q_lock); 1197 spin_lock_init(&nvmeq->q_lock);
1297 nvmeq->cq_head = 0; 1198 nvmeq->cq_head = 0;
1298 nvmeq->cq_phase = 1; 1199 nvmeq->cq_phase = 1;
1299 init_waitqueue_head(&nvmeq->sq_full);
1300 init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
1301 bio_list_init(&nvmeq->sq_cong);
1302 INIT_LIST_HEAD(&nvmeq->iod_bio);
1303 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1200 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1304 nvmeq->q_depth = depth; 1201 nvmeq->q_depth = depth;
1305 nvmeq->cq_vector = vector; 1202 nvmeq->cq_vector = vector;
1306 nvmeq->qid = qid; 1203 nvmeq->qid = qid;
1307 nvmeq->q_suspended = 1;
1308 dev->queue_count++; 1204 dev->queue_count++;
1309 rcu_assign_pointer(dev->queues[qid], nvmeq); 1205 dev->queues[qid] = nvmeq;
1310 1206
1311 return nvmeq; 1207 return nvmeq;
1312 1208
1313 free_sqdma:
1314 dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
1315 nvmeq->sq_dma_addr);
1316 free_cqdma: 1209 free_cqdma:
1317 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1210 dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1318 nvmeq->cq_dma_addr); 1211 nvmeq->cq_dma_addr);
@@ -1335,17 +1228,15 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1335static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1228static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
1336{ 1229{
1337 struct nvme_dev *dev = nvmeq->dev; 1230 struct nvme_dev *dev = nvmeq->dev;
1338 unsigned extra = nvme_queue_extra(nvmeq->q_depth);
1339 1231
1232 spin_lock_irq(&nvmeq->q_lock);
1340 nvmeq->sq_tail = 0; 1233 nvmeq->sq_tail = 0;
1341 nvmeq->cq_head = 0; 1234 nvmeq->cq_head = 0;
1342 nvmeq->cq_phase = 1; 1235 nvmeq->cq_phase = 1;
1343 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1236 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1344 memset(nvmeq->cmdid_data, 0, extra);
1345 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1237 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1346 nvme_cancel_ios(nvmeq, false);
1347 nvmeq->q_suspended = 0;
1348 dev->online_queues++; 1238 dev->online_queues++;
1239 spin_unlock_irq(&nvmeq->q_lock);
1349} 1240}
1350 1241
1351static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1242static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
@@ -1365,10 +1256,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1365 if (result < 0) 1256 if (result < 0)
1366 goto release_sq; 1257 goto release_sq;
1367 1258
1368 spin_lock_irq(&nvmeq->q_lock);
1369 nvme_init_queue(nvmeq, qid); 1259 nvme_init_queue(nvmeq, qid);
1370 spin_unlock_irq(&nvmeq->q_lock);
1371
1372 return result; 1260 return result;
1373 1261
1374 release_sq: 1262 release_sq:
@@ -1408,27 +1296,32 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
1408 */ 1296 */
1409static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) 1297static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
1410{ 1298{
1411 u32 cc = readl(&dev->bar->cc); 1299 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1300 dev->ctrl_config &= ~NVME_CC_ENABLE;
1301 writel(dev->ctrl_config, &dev->bar->cc);
1412 1302
1413 if (cc & NVME_CC_ENABLE)
1414 writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
1415 return nvme_wait_ready(dev, cap, false); 1303 return nvme_wait_ready(dev, cap, false);
1416} 1304}
1417 1305
1418static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) 1306static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
1419{ 1307{
1308 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1309 dev->ctrl_config |= NVME_CC_ENABLE;
1310 writel(dev->ctrl_config, &dev->bar->cc);
1311
1420 return nvme_wait_ready(dev, cap, true); 1312 return nvme_wait_ready(dev, cap, true);
1421} 1313}
1422 1314
1423static int nvme_shutdown_ctrl(struct nvme_dev *dev) 1315static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1424{ 1316{
1425 unsigned long timeout; 1317 unsigned long timeout;
1426 u32 cc;
1427 1318
1428 cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; 1319 dev->ctrl_config &= ~NVME_CC_SHN_MASK;
1429 writel(cc, &dev->bar->cc); 1320 dev->ctrl_config |= NVME_CC_SHN_NORMAL;
1430 1321
1431 timeout = 2 * HZ + jiffies; 1322 writel(dev->ctrl_config, &dev->bar->cc);
1323
1324 timeout = SHUTDOWN_TIMEOUT + jiffies;
1432 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != 1325 while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
1433 NVME_CSTS_SHST_CMPLT) { 1326 NVME_CSTS_SHST_CMPLT) {
1434 msleep(100); 1327 msleep(100);
@@ -1444,20 +1337,86 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1444 return 0; 1337 return 0;
1445} 1338}
1446 1339
1340static struct blk_mq_ops nvme_mq_admin_ops = {
1341 .queue_rq = nvme_admin_queue_rq,
1342 .map_queue = blk_mq_map_queue,
1343 .init_hctx = nvme_admin_init_hctx,
1344 .exit_hctx = nvme_exit_hctx,
1345 .init_request = nvme_admin_init_request,
1346 .timeout = nvme_timeout,
1347};
1348
1349static struct blk_mq_ops nvme_mq_ops = {
1350 .queue_rq = nvme_queue_rq,
1351 .map_queue = blk_mq_map_queue,
1352 .init_hctx = nvme_init_hctx,
1353 .exit_hctx = nvme_exit_hctx,
1354 .init_request = nvme_init_request,
1355 .timeout = nvme_timeout,
1356};
1357
1358static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1359{
1360 if (!dev->admin_q) {
1361 dev->admin_tagset.ops = &nvme_mq_admin_ops;
1362 dev->admin_tagset.nr_hw_queues = 1;
1363 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
1364 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1365 dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
1366 dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
1367 dev->admin_tagset.driver_data = dev;
1368
1369 if (blk_mq_alloc_tag_set(&dev->admin_tagset))
1370 return -ENOMEM;
1371
1372 dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
1373 if (!dev->admin_q) {
1374 blk_mq_free_tag_set(&dev->admin_tagset);
1375 return -ENOMEM;
1376 }
1377 }
1378
1379 return 0;
1380}
1381
1382static void nvme_free_admin_tags(struct nvme_dev *dev)
1383{
1384 if (dev->admin_q)
1385 blk_mq_free_tag_set(&dev->admin_tagset);
1386}
1387
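/*
 * Editor's note -- minimal sketch, not from the commit, of how the
 * cmd_size reserved in the admin tag set is consumed: every request from
 * dev->admin_q carries a struct nvme_cmd_info directly behind it, reached
 * with blk_mq_rq_to_pdu(). example_admin_request() is hypothetical; the
 * blk-mq calls are the same ones the driver uses above.
 */
static int example_admin_request(struct nvme_dev *dev)
{
	struct request *req;
	struct nvme_cmd_info *cmd;

	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	cmd = blk_mq_rq_to_pdu(req);	/* per-command driver context */
	(void)cmd;			/* fill ctx/fn here before submitting */

	blk_mq_free_request(req);	/* normally freed on completion */
	return 0;
}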
1447static int nvme_configure_admin_queue(struct nvme_dev *dev) 1388static int nvme_configure_admin_queue(struct nvme_dev *dev)
1448{ 1389{
1449 int result; 1390 int result;
1450 u32 aqa; 1391 u32 aqa;
1451 u64 cap = readq(&dev->bar->cap); 1392 u64 cap = readq(&dev->bar->cap);
1452 struct nvme_queue *nvmeq; 1393 struct nvme_queue *nvmeq;
1394 unsigned page_shift = PAGE_SHIFT;
1395 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
1396 unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
1397
1398 if (page_shift < dev_page_min) {
1399 dev_err(&dev->pci_dev->dev,
1400 "Minimum device page size (%u) too large for "
1401 "host (%u)\n", 1 << dev_page_min,
1402 1 << page_shift);
1403 return -ENODEV;
1404 }
1405 if (page_shift > dev_page_max) {
1406 dev_info(&dev->pci_dev->dev,
1407 "Device maximum page size (%u) smaller than "
1408 "host (%u); enabling work-around\n",
1409 1 << dev_page_max, 1 << page_shift);
1410 page_shift = dev_page_max;
1411 }
1453 1412
1454 result = nvme_disable_ctrl(dev, cap); 1413 result = nvme_disable_ctrl(dev, cap);
1455 if (result < 0) 1414 if (result < 0)
1456 return result; 1415 return result;
1457 1416
1458 nvmeq = raw_nvmeq(dev, 0); 1417 nvmeq = dev->queues[0];
1459 if (!nvmeq) { 1418 if (!nvmeq) {
1460 nvmeq = nvme_alloc_queue(dev, 0, 64, 0); 1419 nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0);
1461 if (!nvmeq) 1420 if (!nvmeq)
1462 return -ENOMEM; 1421 return -ENOMEM;
1463 } 1422 }
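/*
 * Editor's note -- illustrative numbers, not part of the commit, for the
 * page-size negotiation added above:
 *
 *   CAP.MPSMIN = 0  ->  dev_page_min = 12  ->  smallest device page 4 KiB
 *   CAP.MPSMAX = 0  ->  dev_page_max = 12  ->  largest device page 4 KiB
 *
 * On an x86-64 host (PAGE_SHIFT = 12) page_shift stays 12 and
 * dev->page_size becomes 4096. A host with 64 KiB pages (page_shift = 16)
 * would be clamped down to dev_page_max and take the logged work-around
 * path, while a device whose minimum page exceeds the host page size is
 * rejected with -ENODEV.
 */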
@@ -1465,27 +1424,35 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1465 aqa = nvmeq->q_depth - 1; 1424 aqa = nvmeq->q_depth - 1;
1466 aqa |= aqa << 16; 1425 aqa |= aqa << 16;
1467 1426
1468 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; 1427 dev->page_size = 1 << page_shift;
1469 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 1428
1429 dev->ctrl_config = NVME_CC_CSS_NVM;
1430 dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1470 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; 1431 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1471 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 1432 dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1472 1433
1473 writel(aqa, &dev->bar->aqa); 1434 writel(aqa, &dev->bar->aqa);
1474 writeq(nvmeq->sq_dma_addr, &dev->bar->asq); 1435 writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1475 writeq(nvmeq->cq_dma_addr, &dev->bar->acq); 1436 writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1476 writel(dev->ctrl_config, &dev->bar->cc);
1477 1437
1478 result = nvme_enable_ctrl(dev, cap); 1438 result = nvme_enable_ctrl(dev, cap);
1479 if (result) 1439 if (result)
1480 return result; 1440 goto free_nvmeq;
1441
1442 result = nvme_alloc_admin_tags(dev);
1443 if (result)
1444 goto free_nvmeq;
1481 1445
1482 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1446 result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1483 if (result) 1447 if (result)
1484 return result; 1448 goto free_tags;
1485 1449
1486 spin_lock_irq(&nvmeq->q_lock); 1450 return result;
1487 nvme_init_queue(nvmeq, 0); 1451
1488 spin_unlock_irq(&nvmeq->q_lock); 1452 free_tags:
1453 nvme_free_admin_tags(dev);
1454 free_nvmeq:
1455 nvme_free_queues(dev, 0);
1489 return result; 1456 return result;
1490} 1457}
1491 1458
@@ -1516,7 +1483,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1516 } 1483 }
1517 1484
1518 err = -ENOMEM; 1485 err = -ENOMEM;
1519 iod = nvme_alloc_iod(count, length, GFP_KERNEL); 1486 iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
1520 if (!iod) 1487 if (!iod)
1521 goto put_pages; 1488 goto put_pages;
1522 1489
@@ -1644,7 +1611,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1644 if (length != (io.nblocks + 1) << ns->lba_shift) 1611 if (length != (io.nblocks + 1) << ns->lba_shift)
1645 status = -ENOMEM; 1612 status = -ENOMEM;
1646 else 1613 else
1647 status = nvme_submit_io_cmd(dev, &c, NULL); 1614 status = nvme_submit_io_cmd(dev, ns, &c, NULL);
1648 1615
1649 if (meta_len) { 1616 if (meta_len) {
1650 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { 1617 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1676,10 +1643,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1676 return status; 1643 return status;
1677} 1644}
1678 1645
1679static int nvme_user_admin_cmd(struct nvme_dev *dev, 1646static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
1680 struct nvme_admin_cmd __user *ucmd) 1647 struct nvme_passthru_cmd __user *ucmd)
1681{ 1648{
1682 struct nvme_admin_cmd cmd; 1649 struct nvme_passthru_cmd cmd;
1683 struct nvme_command c; 1650 struct nvme_command c;
1684 int status, length; 1651 int status, length;
1685 struct nvme_iod *uninitialized_var(iod); 1652 struct nvme_iod *uninitialized_var(iod);
@@ -1716,10 +1683,23 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
1716 1683
1717 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : 1684 timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
1718 ADMIN_TIMEOUT; 1685 ADMIN_TIMEOUT;
1686
1719 if (length != cmd.data_len) 1687 if (length != cmd.data_len)
1720 status = -ENOMEM; 1688 status = -ENOMEM;
1721 else 1689 else if (ns) {
1722 status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); 1690 struct request *req;
1691
1692 req = blk_mq_alloc_request(ns->queue, WRITE,
1693 (GFP_KERNEL|__GFP_WAIT), false);
1694 if (IS_ERR(req))
1695 status = PTR_ERR(req);
1696 else {
1697 status = nvme_submit_sync_cmd(req, &c, &cmd.result,
1698 timeout);
1699 blk_mq_free_request(req);
1700 }
1701 } else
1702 status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
1723 1703
1724 if (cmd.data_len) { 1704 if (cmd.data_len) {
1725 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); 1705 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1743,7 +1723,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1743 force_successful_syscall_return(); 1723 force_successful_syscall_return();
1744 return ns->ns_id; 1724 return ns->ns_id;
1745 case NVME_IOCTL_ADMIN_CMD: 1725 case NVME_IOCTL_ADMIN_CMD:
1746 return nvme_user_admin_cmd(ns->dev, (void __user *)arg); 1726 return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
1727 case NVME_IOCTL_IO_CMD:
1728 return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
1747 case NVME_IOCTL_SUBMIT_IO: 1729 case NVME_IOCTL_SUBMIT_IO:
1748 return nvme_submit_io(ns, (void __user *)arg); 1730 return nvme_submit_io(ns, (void __user *)arg);
1749 case SG_GET_VERSION_NUM: 1731 case SG_GET_VERSION_NUM:
@@ -1759,11 +1741,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1759static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1741static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1760 unsigned int cmd, unsigned long arg) 1742 unsigned int cmd, unsigned long arg)
1761{ 1743{
1762 struct nvme_ns *ns = bdev->bd_disk->private_data;
1763
1764 switch (cmd) { 1744 switch (cmd) {
1765 case SG_IO: 1745 case SG_IO:
1766 return nvme_sg_io32(ns, arg); 1746 return -ENOIOCTLCMD;
1767 } 1747 }
1768 return nvme_ioctl(bdev, mode, cmd, arg); 1748 return nvme_ioctl(bdev, mode, cmd, arg);
1769} 1749}
@@ -1773,11 +1753,18 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1773 1753
1774static int nvme_open(struct block_device *bdev, fmode_t mode) 1754static int nvme_open(struct block_device *bdev, fmode_t mode)
1775{ 1755{
1776 struct nvme_ns *ns = bdev->bd_disk->private_data; 1756 int ret = 0;
1777 struct nvme_dev *dev = ns->dev; 1757 struct nvme_ns *ns;
1778 1758
1779 kref_get(&dev->kref); 1759 spin_lock(&dev_list_lock);
1780 return 0; 1760 ns = bdev->bd_disk->private_data;
1761 if (!ns)
1762 ret = -ENXIO;
1763 else if (!kref_get_unless_zero(&ns->dev->kref))
1764 ret = -ENXIO;
1765 spin_unlock(&dev_list_lock);
1766
1767 return ret;
1781} 1768}
1782 1769
1783static void nvme_free_dev(struct kref *kref); 1770static void nvme_free_dev(struct kref *kref);
@@ -1799,6 +1786,35 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
1799 return 0; 1786 return 0;
1800} 1787}
1801 1788
1789static int nvme_revalidate_disk(struct gendisk *disk)
1790{
1791 struct nvme_ns *ns = disk->private_data;
1792 struct nvme_dev *dev = ns->dev;
1793 struct nvme_id_ns *id;
1794 dma_addr_t dma_addr;
1795 int lbaf;
1796
1797 id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
1798 GFP_KERNEL);
1799 if (!id) {
 1800 		dev_warn(&dev->pci_dev->dev, "%s: Memory allocation failure\n",
1801 __func__);
1802 return 0;
1803 }
1804
1805 if (nvme_identify(dev, ns->ns_id, 0, dma_addr))
1806 goto free;
1807
1808 lbaf = id->flbas & 0xf;
1809 ns->lba_shift = id->lbaf[lbaf].ds;
1810
1811 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1812 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1813 free:
1814 dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
1815 return 0;
1816}
1817
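/*
 * Editor's note, not part of the commit: in nvme_revalidate_disk() above,
 * id->lbaf[lbaf].ds is log2 of the LBA data size, so 1 << ns->lba_shift is
 * the block size in bytes and (nsze << (lba_shift - 9)) converts the
 * namespace size into 512-byte sectors for set_capacity(). For example,
 * ds = 12 gives 4096-byte blocks, and a 1,000,000-block namespace is
 * reported as 8,000,000 sectors.
 */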
1802static const struct block_device_operations nvme_fops = { 1818static const struct block_device_operations nvme_fops = {
1803 .owner = THIS_MODULE, 1819 .owner = THIS_MODULE,
1804 .ioctl = nvme_ioctl, 1820 .ioctl = nvme_ioctl,
@@ -1806,43 +1822,9 @@ static const struct block_device_operations nvme_fops = {
1806 .open = nvme_open, 1822 .open = nvme_open,
1807 .release = nvme_release, 1823 .release = nvme_release,
1808 .getgeo = nvme_getgeo, 1824 .getgeo = nvme_getgeo,
1825 .revalidate_disk= nvme_revalidate_disk,
1809}; 1826};
1810 1827
1811static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
1812{
1813 struct nvme_iod *iod, *next;
1814
1815 list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
1816 if (unlikely(nvme_submit_iod(nvmeq, iod)))
1817 break;
1818 list_del(&iod->node);
1819 if (bio_list_empty(&nvmeq->sq_cong) &&
1820 list_empty(&nvmeq->iod_bio))
1821 remove_wait_queue(&nvmeq->sq_full,
1822 &nvmeq->sq_cong_wait);
1823 }
1824}
1825
1826static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1827{
1828 while (bio_list_peek(&nvmeq->sq_cong)) {
1829 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1830 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
1831
1832 if (bio_list_empty(&nvmeq->sq_cong) &&
1833 list_empty(&nvmeq->iod_bio))
1834 remove_wait_queue(&nvmeq->sq_full,
1835 &nvmeq->sq_cong_wait);
1836 if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
1837 if (!waitqueue_active(&nvmeq->sq_full))
1838 add_wait_queue(&nvmeq->sq_full,
1839 &nvmeq->sq_cong_wait);
1840 bio_list_add_head(&nvmeq->sq_cong, bio);
1841 break;
1842 }
1843 }
1844}
1845
1846static int nvme_kthread(void *data) 1828static int nvme_kthread(void *data)
1847{ 1829{
1848 struct nvme_dev *dev, *next; 1830 struct nvme_dev *dev, *next;
@@ -1858,28 +1840,26 @@ static int nvme_kthread(void *data)
1858 continue; 1840 continue;
1859 list_del_init(&dev->node); 1841 list_del_init(&dev->node);
1860 dev_warn(&dev->pci_dev->dev, 1842 dev_warn(&dev->pci_dev->dev,
1861 "Failed status, reset controller\n"); 1843 "Failed status: %x, reset controller\n",
1844 readl(&dev->bar->csts));
1862 dev->reset_workfn = nvme_reset_failed_dev; 1845 dev->reset_workfn = nvme_reset_failed_dev;
1863 queue_work(nvme_workq, &dev->reset_work); 1846 queue_work(nvme_workq, &dev->reset_work);
1864 continue; 1847 continue;
1865 } 1848 }
1866 rcu_read_lock();
1867 for (i = 0; i < dev->queue_count; i++) { 1849 for (i = 0; i < dev->queue_count; i++) {
1868 struct nvme_queue *nvmeq = 1850 struct nvme_queue *nvmeq = dev->queues[i];
1869 rcu_dereference(dev->queues[i]);
1870 if (!nvmeq) 1851 if (!nvmeq)
1871 continue; 1852 continue;
1872 spin_lock_irq(&nvmeq->q_lock); 1853 spin_lock_irq(&nvmeq->q_lock);
1873 if (nvmeq->q_suspended)
1874 goto unlock;
1875 nvme_process_cq(nvmeq); 1854 nvme_process_cq(nvmeq);
1876 nvme_cancel_ios(nvmeq, true); 1855
1877 nvme_resubmit_bios(nvmeq); 1856 while ((i == 0) && (dev->event_limit > 0)) {
1878 nvme_resubmit_iods(nvmeq); 1857 if (nvme_submit_async_admin_req(dev))
1879 unlock: 1858 break;
1859 dev->event_limit--;
1860 }
1880 spin_unlock_irq(&nvmeq->q_lock); 1861 spin_unlock_irq(&nvmeq->q_lock);
1881 } 1862 }
1882 rcu_read_unlock();
1883 } 1863 }
1884 spin_unlock(&dev_list_lock); 1864 spin_unlock(&dev_list_lock);
1885 schedule_timeout(round_jiffies_relative(HZ)); 1865 schedule_timeout(round_jiffies_relative(HZ));
@@ -1902,28 +1882,28 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
1902{ 1882{
1903 struct nvme_ns *ns; 1883 struct nvme_ns *ns;
1904 struct gendisk *disk; 1884 struct gendisk *disk;
1885 int node = dev_to_node(&dev->pci_dev->dev);
1905 int lbaf; 1886 int lbaf;
1906 1887
1907 if (rt->attributes & NVME_LBART_ATTRIB_HIDE) 1888 if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
1908 return NULL; 1889 return NULL;
1909 1890
1910 ns = kzalloc(sizeof(*ns), GFP_KERNEL); 1891 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
1911 if (!ns) 1892 if (!ns)
1912 return NULL; 1893 return NULL;
1913 ns->queue = blk_alloc_queue(GFP_KERNEL); 1894 ns->queue = blk_mq_init_queue(&dev->tagset);
1914 if (!ns->queue) 1895 if (IS_ERR(ns->queue))
1915 goto out_free_ns; 1896 goto out_free_ns;
1916 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
1917 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 1897 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1918 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1898 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1919 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); 1899 queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
1920 blk_queue_make_request(ns->queue, nvme_make_request);
1921 ns->dev = dev; 1900 ns->dev = dev;
1922 ns->queue->queuedata = ns; 1901 ns->queue->queuedata = ns;
1923 1902
1924 disk = alloc_disk(0); 1903 disk = alloc_disk_node(0, node);
1925 if (!disk) 1904 if (!disk)
1926 goto out_free_queue; 1905 goto out_free_queue;
1906
1927 ns->ns_id = nsid; 1907 ns->ns_id = nsid;
1928 ns->disk = disk; 1908 ns->disk = disk;
1929 lbaf = id->flbas & 0xf; 1909 lbaf = id->flbas & 0xf;
@@ -1932,6 +1912,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
1932 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1912 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1933 if (dev->max_hw_sectors) 1913 if (dev->max_hw_sectors)
1934 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 1914 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
1915 if (dev->stripe_size)
1916 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
1935 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 1917 if (dev->vwc & NVME_CTRL_VWC_PRESENT)
1936 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 1918 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
1937 1919
@@ -1957,143 +1939,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
1957 return NULL; 1939 return NULL;
1958} 1940}
1959 1941
1960static int nvme_find_closest_node(int node)
1961{
1962 int n, val, min_val = INT_MAX, best_node = node;
1963
1964 for_each_online_node(n) {
1965 if (n == node)
1966 continue;
1967 val = node_distance(node, n);
1968 if (val < min_val) {
1969 min_val = val;
1970 best_node = n;
1971 }
1972 }
1973 return best_node;
1974}
1975
1976static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
1977 int count)
1978{
1979 int cpu;
1980 for_each_cpu(cpu, qmask) {
1981 if (cpumask_weight(nvmeq->cpu_mask) >= count)
1982 break;
1983 if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
1984 *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
1985 }
1986}
1987
1988static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
1989 const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
1990{
1991 int next_cpu;
1992 for_each_cpu(next_cpu, new_mask) {
1993 cpumask_or(mask, mask, get_cpu_mask(next_cpu));
1994 cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
1995 cpumask_and(mask, mask, unassigned_cpus);
1996 nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
1997 }
1998}
1999
2000static void nvme_create_io_queues(struct nvme_dev *dev) 1942static void nvme_create_io_queues(struct nvme_dev *dev)
2001{ 1943{
2002 unsigned i, max; 1944 unsigned i;
2003 1945
2004 max = min(dev->max_qid, num_online_cpus()); 1946 for (i = dev->queue_count; i <= dev->max_qid; i++)
2005 for (i = dev->queue_count; i <= max; i++)
2006 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) 1947 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
2007 break; 1948 break;
2008 1949
2009 max = min(dev->queue_count - 1, num_online_cpus()); 1950 for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
2010 for (i = dev->online_queues; i <= max; i++) 1951 if (nvme_create_queue(dev->queues[i], i))
2011 if (nvme_create_queue(raw_nvmeq(dev, i), i))
2012 break; 1952 break;
2013} 1953}
2014 1954
2015/*
2016 * If there are fewer queues than online cpus, this will try to optimally
2017 * assign a queue to multiple cpus by grouping cpus that are "close" together:
2018 * thread siblings, core, socket, closest node, then whatever else is
2019 * available.
2020 */
2021static void nvme_assign_io_queues(struct nvme_dev *dev)
2022{
2023 unsigned cpu, cpus_per_queue, queues, remainder, i;
2024 cpumask_var_t unassigned_cpus;
2025
2026 nvme_create_io_queues(dev);
2027
2028 queues = min(dev->online_queues - 1, num_online_cpus());
2029 if (!queues)
2030 return;
2031
2032 cpus_per_queue = num_online_cpus() / queues;
2033 remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
2034
2035 if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
2036 return;
2037
2038 cpumask_copy(unassigned_cpus, cpu_online_mask);
2039 cpu = cpumask_first(unassigned_cpus);
2040 for (i = 1; i <= queues; i++) {
2041 struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
2042 cpumask_t mask;
2043
2044 cpumask_clear(nvmeq->cpu_mask);
2045 if (!cpumask_weight(unassigned_cpus)) {
2046 unlock_nvmeq(nvmeq);
2047 break;
2048 }
2049
2050 mask = *get_cpu_mask(cpu);
2051 nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
2052 if (cpus_weight(mask) < cpus_per_queue)
2053 nvme_add_cpus(&mask, unassigned_cpus,
2054 topology_thread_cpumask(cpu),
2055 nvmeq, cpus_per_queue);
2056 if (cpus_weight(mask) < cpus_per_queue)
2057 nvme_add_cpus(&mask, unassigned_cpus,
2058 topology_core_cpumask(cpu),
2059 nvmeq, cpus_per_queue);
2060 if (cpus_weight(mask) < cpus_per_queue)
2061 nvme_add_cpus(&mask, unassigned_cpus,
2062 cpumask_of_node(cpu_to_node(cpu)),
2063 nvmeq, cpus_per_queue);
2064 if (cpus_weight(mask) < cpus_per_queue)
2065 nvme_add_cpus(&mask, unassigned_cpus,
2066 cpumask_of_node(
2067 nvme_find_closest_node(
2068 cpu_to_node(cpu))),
2069 nvmeq, cpus_per_queue);
2070 if (cpus_weight(mask) < cpus_per_queue)
2071 nvme_add_cpus(&mask, unassigned_cpus,
2072 unassigned_cpus,
2073 nvmeq, cpus_per_queue);
2074
2075 WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
2076 "nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
2077 dev->instance, i);
2078
2079 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2080 nvmeq->cpu_mask);
2081 cpumask_andnot(unassigned_cpus, unassigned_cpus,
2082 nvmeq->cpu_mask);
2083 cpu = cpumask_next(cpu, unassigned_cpus);
2084 if (remainder && !--remainder)
2085 cpus_per_queue++;
2086 unlock_nvmeq(nvmeq);
2087 }
2088 WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
2089 dev->instance);
2090 i = 0;
2091 cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
2092 for_each_cpu(cpu, unassigned_cpus)
2093 *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
2094 free_cpumask_var(unassigned_cpus);
2095}
2096
2097static int set_queue_count(struct nvme_dev *dev, int count) 1955static int set_queue_count(struct nvme_dev *dev, int count)
2098{ 1956{
2099 int status; 1957 int status;
@@ -2107,7 +1965,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
2107 if (status > 0) { 1965 if (status > 0) {
2108 dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", 1966 dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
2109 status); 1967 status);
2110 return -EBUSY; 1968 return 0;
2111 } 1969 }
2112 return min(result & 0xffff, result >> 16) + 1; 1970 return min(result & 0xffff, result >> 16) + 1;
2113} 1971}
@@ -2117,39 +1975,15 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
2117 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 1975 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
2118} 1976}
2119 1977
2120static void nvme_cpu_workfn(struct work_struct *work)
2121{
2122 struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);
2123 if (dev->initialized)
2124 nvme_assign_io_queues(dev);
2125}
2126
2127static int nvme_cpu_notify(struct notifier_block *self,
2128 unsigned long action, void *hcpu)
2129{
2130 struct nvme_dev *dev;
2131
2132 switch (action) {
2133 case CPU_ONLINE:
2134 case CPU_DEAD:
2135 spin_lock(&dev_list_lock);
2136 list_for_each_entry(dev, &dev_list, node)
2137 schedule_work(&dev->cpu_work);
2138 spin_unlock(&dev_list_lock);
2139 break;
2140 }
2141 return NOTIFY_OK;
2142}
2143
2144static int nvme_setup_io_queues(struct nvme_dev *dev) 1978static int nvme_setup_io_queues(struct nvme_dev *dev)
2145{ 1979{
2146 struct nvme_queue *adminq = raw_nvmeq(dev, 0); 1980 struct nvme_queue *adminq = dev->queues[0];
2147 struct pci_dev *pdev = dev->pci_dev; 1981 struct pci_dev *pdev = dev->pci_dev;
2148 int result, i, vecs, nr_io_queues, size; 1982 int result, i, vecs, nr_io_queues, size;
2149 1983
2150 nr_io_queues = num_possible_cpus(); 1984 nr_io_queues = num_possible_cpus();
2151 result = set_queue_count(dev, nr_io_queues); 1985 result = set_queue_count(dev, nr_io_queues);
2152 if (result < 0) 1986 if (result <= 0)
2153 return result; 1987 return result;
2154 if (result < nr_io_queues) 1988 if (result < nr_io_queues)
2155 nr_io_queues = result; 1989 nr_io_queues = result;
@@ -2172,6 +2006,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2172 /* Deregister the admin queue's interrupt */ 2006 /* Deregister the admin queue's interrupt */
2173 free_irq(dev->entry[0].vector, adminq); 2007 free_irq(dev->entry[0].vector, adminq);
2174 2008
2009 /*
 2010	 * If we enabled MSI-X early because the device has no INTx, disable
 2011	 * it again before setting up the full range we need.
2012 */
2013 if (!pdev->irq)
2014 pci_disable_msix(pdev);
2015
2175 for (i = 0; i < nr_io_queues; i++) 2016 for (i = 0; i < nr_io_queues; i++)
2176 dev->entry[i].entry = i; 2017 dev->entry[i].entry = i;
2177 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 2018 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
@@ -2195,14 +2036,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2195 dev->max_qid = nr_io_queues; 2036 dev->max_qid = nr_io_queues;
2196 2037
2197 result = queue_request_irq(dev, adminq, adminq->irqname); 2038 result = queue_request_irq(dev, adminq, adminq->irqname);
2198 if (result) { 2039 if (result)
2199 adminq->q_suspended = 1;
2200 goto free_queues; 2040 goto free_queues;
2201 }
2202 2041
2203 /* Free previously allocated queues that are no longer usable */ 2042 /* Free previously allocated queues that are no longer usable */
2204 nvme_free_queues(dev, nr_io_queues + 1); 2043 nvme_free_queues(dev, nr_io_queues + 1);
2205 nvme_assign_io_queues(dev); 2044 nvme_create_io_queues(dev);
2206 2045
2207 return 0; 2046 return 0;
2208 2047
@@ -2245,14 +2084,37 @@ static int nvme_dev_add(struct nvme_dev *dev)
2245 dev->oncs = le16_to_cpup(&ctrl->oncs); 2084 dev->oncs = le16_to_cpup(&ctrl->oncs);
2246 dev->abort_limit = ctrl->acl + 1; 2085 dev->abort_limit = ctrl->acl + 1;
2247 dev->vwc = ctrl->vwc; 2086 dev->vwc = ctrl->vwc;
2087 dev->event_limit = min(ctrl->aerl + 1, 8);
2248 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 2088 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
2249 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 2089 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
2250 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 2090 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
2251 if (ctrl->mdts) 2091 if (ctrl->mdts)
2252 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2092 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
2253 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2093 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
2254 (pdev->device == 0x0953) && ctrl->vs[3]) 2094 (pdev->device == 0x0953) && ctrl->vs[3]) {
2095 unsigned int max_hw_sectors;
2096
2255 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2097 dev->stripe_size = 1 << (ctrl->vs[3] + shift);
2098 max_hw_sectors = dev->stripe_size >> (shift - 9);
2099 if (dev->max_hw_sectors) {
2100 dev->max_hw_sectors = min(max_hw_sectors,
2101 dev->max_hw_sectors);
2102 } else
2103 dev->max_hw_sectors = max_hw_sectors;
2104 }
2105
2106 dev->tagset.ops = &nvme_mq_ops;
2107 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2108 dev->tagset.timeout = NVME_IO_TIMEOUT;
2109 dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
2110 dev->tagset.queue_depth =
2111 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
2112 dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
2113 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2114 dev->tagset.driver_data = dev;
2115
2116 if (blk_mq_alloc_tag_set(&dev->tagset))
2117 goto out;
2256 2118
2257 id_ns = mem; 2119 id_ns = mem;
2258 for (i = 1; i <= nn; i++) { 2120 for (i = 1; i <= nn; i++) {
@@ -2293,6 +2155,9 @@ static int nvme_dev_map(struct nvme_dev *dev)
2293 dev->entry[0].vector = pdev->irq; 2155 dev->entry[0].vector = pdev->irq;
2294 pci_set_master(pdev); 2156 pci_set_master(pdev);
2295 bars = pci_select_bars(pdev, IORESOURCE_MEM); 2157 bars = pci_select_bars(pdev, IORESOURCE_MEM);
2158 if (!bars)
2159 goto disable_pci;
2160
2296 if (pci_request_selected_regions(pdev, bars, "nvme")) 2161 if (pci_request_selected_regions(pdev, bars, "nvme"))
2297 goto disable_pci; 2162 goto disable_pci;
2298 2163
@@ -2303,10 +2168,22 @@ static int nvme_dev_map(struct nvme_dev *dev)
2303 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2168 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
2304 if (!dev->bar) 2169 if (!dev->bar)
2305 goto disable; 2170 goto disable;
2171
2306 if (readl(&dev->bar->csts) == -1) { 2172 if (readl(&dev->bar->csts) == -1) {
2307 result = -ENODEV; 2173 result = -ENODEV;
2308 goto unmap; 2174 goto unmap;
2309 } 2175 }
2176
2177 /*
 2178	 * Some devices don't advertise INTx interrupts; pre-enable a single
 2179	 * MSI-X vector for setup. We'll adjust this later.
2180 */
2181 if (!pdev->irq) {
2182 result = pci_enable_msix(pdev, dev->entry, 1);
2183 if (result < 0)
2184 goto unmap;
2185 }
2186
2310 cap = readq(&dev->bar->cap); 2187 cap = readq(&dev->bar->cap);
2311 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 2188 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
2312 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 2189 dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
@@ -2402,7 +2279,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2402 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2279 c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2403 2280
2404 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2281 init_kthread_work(&nvmeq->cmdinfo.work, fn);
2405 return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); 2282 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
2283 ADMIN_TIMEOUT);
2406} 2284}
2407 2285
2408static void nvme_del_cq_work_handler(struct kthread_work *work) 2286static void nvme_del_cq_work_handler(struct kthread_work *work)
@@ -2465,7 +2343,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
2465 atomic_set(&dq.refcount, 0); 2343 atomic_set(&dq.refcount, 0);
2466 dq.worker = &worker; 2344 dq.worker = &worker;
2467 for (i = dev->queue_count - 1; i > 0; i--) { 2345 for (i = dev->queue_count - 1; i > 0; i--) {
2468 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2346 struct nvme_queue *nvmeq = dev->queues[i];
2469 2347
2470 if (nvme_suspend_queue(nvmeq)) 2348 if (nvme_suspend_queue(nvmeq))
2471 continue; 2349 continue;
@@ -2501,13 +2379,16 @@ static void nvme_dev_list_remove(struct nvme_dev *dev)
2501static void nvme_dev_shutdown(struct nvme_dev *dev) 2379static void nvme_dev_shutdown(struct nvme_dev *dev)
2502{ 2380{
2503 int i; 2381 int i;
2382 u32 csts = -1;
2504 2383
2505 dev->initialized = 0; 2384 dev->initialized = 0;
2506 nvme_dev_list_remove(dev); 2385 nvme_dev_list_remove(dev);
2507 2386
2508 if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { 2387 if (dev->bar)
2388 csts = readl(&dev->bar->csts);
2389 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
2509 for (i = dev->queue_count - 1; i >= 0; i--) { 2390 for (i = dev->queue_count - 1; i >= 0; i--) {
2510 struct nvme_queue *nvmeq = raw_nvmeq(dev, i); 2391 struct nvme_queue *nvmeq = dev->queues[i];
2511 nvme_suspend_queue(nvmeq); 2392 nvme_suspend_queue(nvmeq);
2512 nvme_clear_queue(nvmeq); 2393 nvme_clear_queue(nvmeq);
2513 } 2394 }
@@ -2519,6 +2400,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
2519 nvme_dev_unmap(dev); 2400 nvme_dev_unmap(dev);
2520} 2401}
2521 2402
2403static void nvme_dev_remove_admin(struct nvme_dev *dev)
2404{
2405 if (dev->admin_q && !blk_queue_dying(dev->admin_q))
2406 blk_cleanup_queue(dev->admin_q);
2407}
2408
2522static void nvme_dev_remove(struct nvme_dev *dev) 2409static void nvme_dev_remove(struct nvme_dev *dev)
2523{ 2410{
2524 struct nvme_ns *ns; 2411 struct nvme_ns *ns;
@@ -2590,6 +2477,11 @@ static void nvme_free_namespaces(struct nvme_dev *dev)
2590 2477
2591 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2478 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
2592 list_del(&ns->list); 2479 list_del(&ns->list);
2480
2481 spin_lock(&dev_list_lock);
2482 ns->disk->private_data = NULL;
2483 spin_unlock(&dev_list_lock);
2484
2593 put_disk(ns->disk); 2485 put_disk(ns->disk);
2594 kfree(ns); 2486 kfree(ns);
2595 } 2487 }
@@ -2599,8 +2491,10 @@ static void nvme_free_dev(struct kref *kref)
2599{ 2491{
2600 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2492 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
2601 2493
2494 pci_dev_put(dev->pci_dev);
2602 nvme_free_namespaces(dev); 2495 nvme_free_namespaces(dev);
2603 free_percpu(dev->io_queue); 2496 nvme_release_instance(dev);
2497 blk_mq_free_tag_set(&dev->tagset);
2604 kfree(dev->queues); 2498 kfree(dev->queues);
2605 kfree(dev->entry); 2499 kfree(dev->entry);
2606 kfree(dev); 2500 kfree(dev);
@@ -2625,9 +2519,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f)
2625static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2519static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
2626{ 2520{
2627 struct nvme_dev *dev = f->private_data; 2521 struct nvme_dev *dev = f->private_data;
2522 struct nvme_ns *ns;
2523
2628 switch (cmd) { 2524 switch (cmd) {
2629 case NVME_IOCTL_ADMIN_CMD: 2525 case NVME_IOCTL_ADMIN_CMD:
2630 return nvme_user_admin_cmd(dev, (void __user *)arg); 2526 return nvme_user_cmd(dev, NULL, (void __user *)arg);
2527 case NVME_IOCTL_IO_CMD:
2528 if (list_empty(&dev->namespaces))
2529 return -ENOTTY;
2530 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
2531 return nvme_user_cmd(dev, ns, (void __user *)arg);
2631 default: 2532 default:
2632 return -ENOTTY; 2533 return -ENOTTY;
2633 } 2534 }
@@ -2641,6 +2542,22 @@ static const struct file_operations nvme_dev_fops = {
2641 .compat_ioctl = nvme_dev_ioctl, 2542 .compat_ioctl = nvme_dev_ioctl,
2642}; 2543};
2643 2544
2545static void nvme_set_irq_hints(struct nvme_dev *dev)
2546{
2547 struct nvme_queue *nvmeq;
2548 int i;
2549
2550 for (i = 0; i < dev->online_queues; i++) {
2551 nvmeq = dev->queues[i];
2552
2553 if (!nvmeq->hctx)
2554 continue;
2555
2556 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
2557 nvmeq->hctx->cpumask);
2558 }
2559}
2560
2644static int nvme_dev_start(struct nvme_dev *dev) 2561static int nvme_dev_start(struct nvme_dev *dev)
2645{ 2562{
2646 int result; 2563 int result;
@@ -2664,7 +2581,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
2664 2581
2665 if (start_thread) { 2582 if (start_thread) {
2666 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 2583 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
2667 wake_up(&nvme_kthread_wait); 2584 wake_up_all(&nvme_kthread_wait);
2668 } else 2585 } else
2669 wait_event_killable(nvme_kthread_wait, nvme_thread); 2586 wait_event_killable(nvme_kthread_wait, nvme_thread);
2670 2587
@@ -2673,10 +2590,14 @@ static int nvme_dev_start(struct nvme_dev *dev)
2673 goto disable; 2590 goto disable;
2674 } 2591 }
2675 2592
2593 nvme_init_queue(dev->queues[0], 0);
2594
2676 result = nvme_setup_io_queues(dev); 2595 result = nvme_setup_io_queues(dev);
2677 if (result && result != -EBUSY) 2596 if (result)
2678 goto disable; 2597 goto disable;
2679 2598
2599 nvme_set_irq_hints(dev);
2600
2680 return result; 2601 return result;
2681 2602
2682 disable: 2603 disable:
@@ -2693,7 +2614,7 @@ static int nvme_remove_dead_ctrl(void *arg)
2693 struct pci_dev *pdev = dev->pci_dev; 2614 struct pci_dev *pdev = dev->pci_dev;
2694 2615
2695 if (pci_get_drvdata(pdev)) 2616 if (pci_get_drvdata(pdev))
2696 pci_stop_and_remove_bus_device(pdev); 2617 pci_stop_and_remove_bus_device_locked(pdev);
2697 kref_put(&dev->kref, nvme_free_dev); 2618 kref_put(&dev->kref, nvme_free_dev);
2698 return 0; 2619 return 0;
2699} 2620}
@@ -2702,8 +2623,8 @@ static void nvme_remove_disks(struct work_struct *ws)
2702{ 2623{
2703 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 2624 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
2704 2625
2705 nvme_dev_remove(dev);
2706 nvme_free_queues(dev, 1); 2626 nvme_free_queues(dev, 1);
2627 nvme_dev_remove(dev);
2707} 2628}
2708 2629
2709static int nvme_dev_resume(struct nvme_dev *dev) 2630static int nvme_dev_resume(struct nvme_dev *dev)
@@ -2711,9 +2632,9 @@ static int nvme_dev_resume(struct nvme_dev *dev)
2711 int ret; 2632 int ret;
2712 2633
2713 ret = nvme_dev_start(dev); 2634 ret = nvme_dev_start(dev);
2714 if (ret && ret != -EBUSY) 2635 if (ret)
2715 return ret; 2636 return ret;
2716 if (ret == -EBUSY) { 2637 if (dev->online_queues < 2) {
2717 spin_lock(&dev_list_lock); 2638 spin_lock(&dev_list_lock);
2718 dev->reset_workfn = nvme_remove_disks; 2639 dev->reset_workfn = nvme_remove_disks;
2719 queue_work(nvme_workq, &dev->reset_work); 2640 queue_work(nvme_workq, &dev->reset_work);
@@ -2727,7 +2648,7 @@ static void nvme_dev_reset(struct nvme_dev *dev)
2727{ 2648{
2728 nvme_dev_shutdown(dev); 2649 nvme_dev_shutdown(dev);
2729 if (nvme_dev_resume(dev)) { 2650 if (nvme_dev_resume(dev)) {
2730 dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); 2651 dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
2731 kref_get(&dev->kref); 2652 kref_get(&dev->kref);
2732 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 2653 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
2733 dev->instance))) { 2654 dev->instance))) {
@@ -2752,33 +2673,33 @@ static void nvme_reset_workfn(struct work_struct *work)
2752 2673
2753static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2674static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2754{ 2675{
2755 int result = -ENOMEM; 2676 int node, result = -ENOMEM;
2756 struct nvme_dev *dev; 2677 struct nvme_dev *dev;
2757 2678
2758 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2679 node = dev_to_node(&pdev->dev);
2680 if (node == NUMA_NO_NODE)
2681 set_dev_node(&pdev->dev, 0);
2682
2683 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
2759 if (!dev) 2684 if (!dev)
2760 return -ENOMEM; 2685 return -ENOMEM;
2761 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), 2686 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
2762 GFP_KERNEL); 2687 GFP_KERNEL, node);
2763 if (!dev->entry) 2688 if (!dev->entry)
2764 goto free; 2689 goto free;
2765 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), 2690 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
2766 GFP_KERNEL); 2691 GFP_KERNEL, node);
2767 if (!dev->queues) 2692 if (!dev->queues)
2768 goto free; 2693 goto free;
2769 dev->io_queue = alloc_percpu(unsigned short);
2770 if (!dev->io_queue)
2771 goto free;
2772 2694
2773 INIT_LIST_HEAD(&dev->namespaces); 2695 INIT_LIST_HEAD(&dev->namespaces);
2774 dev->reset_workfn = nvme_reset_failed_dev; 2696 dev->reset_workfn = nvme_reset_failed_dev;
2775 INIT_WORK(&dev->reset_work, nvme_reset_workfn); 2697 INIT_WORK(&dev->reset_work, nvme_reset_workfn);
2776 INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); 2698 dev->pci_dev = pci_dev_get(pdev);
2777 dev->pci_dev = pdev;
2778 pci_set_drvdata(pdev, dev); 2699 pci_set_drvdata(pdev, dev);
2779 result = nvme_set_instance(dev); 2700 result = nvme_set_instance(dev);
2780 if (result) 2701 if (result)
2781 goto free; 2702 goto put_pci;
2782 2703
2783 result = nvme_setup_prp_pools(dev); 2704 result = nvme_setup_prp_pools(dev);
2784 if (result) 2705 if (result)
@@ -2786,17 +2707,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2786 2707
2787 kref_init(&dev->kref); 2708 kref_init(&dev->kref);
2788 result = nvme_dev_start(dev); 2709 result = nvme_dev_start(dev);
2789 if (result) { 2710 if (result)
2790 if (result == -EBUSY)
2791 goto create_cdev;
2792 goto release_pools; 2711 goto release_pools;
2793 }
2794 2712
2795 result = nvme_dev_add(dev); 2713 if (dev->online_queues > 1)
2714 result = nvme_dev_add(dev);
2796 if (result) 2715 if (result)
2797 goto shutdown; 2716 goto shutdown;
2798 2717
2799 create_cdev:
2800 scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); 2718 scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
2801 dev->miscdev.minor = MISC_DYNAMIC_MINOR; 2719 dev->miscdev.minor = MISC_DYNAMIC_MINOR;
2802 dev->miscdev.parent = &pdev->dev; 2720 dev->miscdev.parent = &pdev->dev;
@@ -2806,11 +2724,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2806 if (result) 2724 if (result)
2807 goto remove; 2725 goto remove;
2808 2726
2727 nvme_set_irq_hints(dev);
2728
2809 dev->initialized = 1; 2729 dev->initialized = 1;
2810 return 0; 2730 return 0;
2811 2731
2812 remove: 2732 remove:
2813 nvme_dev_remove(dev); 2733 nvme_dev_remove(dev);
2734 nvme_dev_remove_admin(dev);
2814 nvme_free_namespaces(dev); 2735 nvme_free_namespaces(dev);
2815 shutdown: 2736 shutdown:
2816 nvme_dev_shutdown(dev); 2737 nvme_dev_shutdown(dev);
@@ -2819,8 +2740,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2819 nvme_release_prp_pools(dev); 2740 nvme_release_prp_pools(dev);
2820 release: 2741 release:
2821 nvme_release_instance(dev); 2742 nvme_release_instance(dev);
2743 put_pci:
2744 pci_dev_put(dev->pci_dev);
2822 free: 2745 free:
2823 free_percpu(dev->io_queue);
2824 kfree(dev->queues); 2746 kfree(dev->queues);
2825 kfree(dev->entry); 2747 kfree(dev->entry);
2826 kfree(dev); 2748 kfree(dev);
@@ -2829,12 +2751,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2829 2751
2830static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) 2752static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
2831{ 2753{
2832 struct nvme_dev *dev = pci_get_drvdata(pdev); 2754 struct nvme_dev *dev = pci_get_drvdata(pdev);
2833 2755
2834 if (prepare) 2756 if (prepare)
2835 nvme_dev_shutdown(dev); 2757 nvme_dev_shutdown(dev);
2836 else 2758 else
2837 nvme_dev_resume(dev); 2759 nvme_dev_resume(dev);
2838} 2760}
2839 2761
2840static void nvme_shutdown(struct pci_dev *pdev) 2762static void nvme_shutdown(struct pci_dev *pdev)
@@ -2853,13 +2775,12 @@ static void nvme_remove(struct pci_dev *pdev)
2853 2775
2854 pci_set_drvdata(pdev, NULL); 2776 pci_set_drvdata(pdev, NULL);
2855 flush_work(&dev->reset_work); 2777 flush_work(&dev->reset_work);
2856 flush_work(&dev->cpu_work);
2857 misc_deregister(&dev->miscdev); 2778 misc_deregister(&dev->miscdev);
2858 nvme_dev_remove(dev); 2779 nvme_dev_remove(dev);
2859 nvme_dev_shutdown(dev); 2780 nvme_dev_shutdown(dev);
2781 nvme_dev_remove_admin(dev);
2860 nvme_free_queues(dev, 0); 2782 nvme_free_queues(dev, 0);
2861 rcu_barrier(); 2783 nvme_free_admin_tags(dev);
2862 nvme_release_instance(dev);
2863 nvme_release_prp_pools(dev); 2784 nvme_release_prp_pools(dev);
2864 kref_put(&dev->kref, nvme_free_dev); 2785 kref_put(&dev->kref, nvme_free_dev);
2865} 2786}
@@ -2942,18 +2863,11 @@ static int __init nvme_init(void)
2942 else if (result > 0) 2863 else if (result > 0)
2943 nvme_major = result; 2864 nvme_major = result;
2944 2865
2945 nvme_nb.notifier_call = &nvme_cpu_notify;
2946 result = register_hotcpu_notifier(&nvme_nb);
2947 if (result)
2948 goto unregister_blkdev;
2949
2950 result = pci_register_driver(&nvme_driver); 2866 result = pci_register_driver(&nvme_driver);
2951 if (result) 2867 if (result)
2952 goto unregister_hotcpu; 2868 goto unregister_blkdev;
2953 return 0; 2869 return 0;
2954 2870
2955 unregister_hotcpu:
2956 unregister_hotcpu_notifier(&nvme_nb);
2957 unregister_blkdev: 2871 unregister_blkdev:
2958 unregister_blkdev(nvme_major, "nvme"); 2872 unregister_blkdev(nvme_major, "nvme");
2959 kill_workq: 2873 kill_workq:
@@ -2973,6 +2887,6 @@ static void __exit nvme_exit(void)
2973 2887
2974MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 2888MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
2975MODULE_LICENSE("GPL"); 2889MODULE_LICENSE("GPL");
2976MODULE_VERSION("0.9"); 2890MODULE_VERSION("1.0");
2977module_init(nvme_init); 2891module_init(nvme_init);
2978module_exit(nvme_exit); 2892module_exit(nvme_exit);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 0b4b2775600e..5e78568026c3 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2105 2105
2106 nvme_offset += unit_num_blocks; 2106 nvme_offset += unit_num_blocks;
2107 2107
2108 nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); 2108 nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
2109 if (nvme_sc != NVME_SC_SUCCESS) { 2109 if (nvme_sc != NVME_SC_SUCCESS) {
2110 nvme_unmap_user_pages(dev, 2110 nvme_unmap_user_pages(dev,
2111 (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, 2111 (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2658 c.common.opcode = nvme_cmd_flush; 2658 c.common.opcode = nvme_cmd_flush;
2659 c.common.nsid = cpu_to_le32(ns->ns_id); 2659 c.common.nsid = cpu_to_le32(ns->ns_id);
2660 2660
2661 nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); 2661 nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
2662 res = nvme_trans_status_code(hdr, nvme_sc); 2662 res = nvme_trans_status_code(hdr, nvme_sc);
2663 if (res) 2663 if (res)
2664 goto out; 2664 goto out;
@@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
2686 c.common.opcode = nvme_cmd_flush; 2686 c.common.opcode = nvme_cmd_flush;
2687 c.common.nsid = cpu_to_le32(ns->ns_id); 2687 c.common.nsid = cpu_to_le32(ns->ns_id);
2688 2688
2689 nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); 2689 nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
2690 2690
2691 res = nvme_trans_status_code(hdr, nvme_sc); 2691 res = nvme_trans_status_code(hdr, nvme_sc);
2692 if (res) 2692 if (res)
@@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
2894 c.dsm.nr = cpu_to_le32(ndesc - 1); 2894 c.dsm.nr = cpu_to_le32(ndesc - 1);
2895 c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 2895 c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
2896 2896
2897 nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); 2897 nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
2898 res = nvme_trans_status_code(hdr, nvme_sc); 2898 res = nvme_trans_status_code(hdr, nvme_sc);
2899 2899
2900 dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), 2900 dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
@@ -2915,6 +2915,14 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
2915 if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) 2915 if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
2916 return -EFAULT; 2916 return -EFAULT;
2917 2917
2918 /*
2919 * Prime the hdr with good status for scsi commands that don't require
2920 * an nvme command for translation.
2921 */
2922 retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
2923 if (retcode)
2924 return retcode;
2925
2918 opcode = cmd[0]; 2926 opcode = cmd[0];
2919 2927
2920 switch (opcode) { 2928 switch (opcode) {
@@ -3016,152 +3024,6 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
3016 return retcode; 3024 return retcode;
3017} 3025}
3018 3026
3019#ifdef CONFIG_COMPAT
3020typedef struct sg_io_hdr32 {
3021 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
3022 compat_int_t dxfer_direction; /* [i] data transfer direction */
3023 unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */
3024 unsigned char mx_sb_len; /* [i] max length to write to sbp */
3025 unsigned short iovec_count; /* [i] 0 implies no scatter gather */
3026 compat_uint_t dxfer_len; /* [i] byte count of data transfer */
3027 compat_uint_t dxferp; /* [i], [*io] points to data transfer memory
3028 or scatter gather list */
3029 compat_uptr_t cmdp; /* [i], [*i] points to command to perform */
3030 compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */
3031 compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */
3032 compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */
3033 compat_int_t pack_id; /* [i->o] unused internally (normally) */
3034 compat_uptr_t usr_ptr; /* [i->o] unused internally */
3035 unsigned char status; /* [o] scsi status */
3036 unsigned char masked_status; /* [o] shifted, masked scsi status */
3037 unsigned char msg_status; /* [o] messaging level data (optional) */
3038 unsigned char sb_len_wr; /* [o] byte count actually written to sbp */
3039 unsigned short host_status; /* [o] errors from host adapter */
3040 unsigned short driver_status; /* [o] errors from software driver */
3041 compat_int_t resid; /* [o] dxfer_len - actual_transferred */
3042 compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */
3043 compat_uint_t info; /* [o] auxiliary information */
3044} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */
3045
3046typedef struct sg_iovec32 {
3047 compat_uint_t iov_base;
3048 compat_uint_t iov_len;
3049} sg_iovec32_t;
3050
3051static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
3052{
3053 sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
3054 sg_iovec32_t __user *iov32 = dxferp;
3055 int i;
3056
3057 for (i = 0; i < iovec_count; i++) {
3058 u32 base, len;
3059
3060 if (get_user(base, &iov32[i].iov_base) ||
3061 get_user(len, &iov32[i].iov_len) ||
3062 put_user(compat_ptr(base), &iov[i].iov_base) ||
3063 put_user(len, &iov[i].iov_len))
3064 return -EFAULT;
3065 }
3066
3067 if (put_user(iov, &sgio->dxferp))
3068 return -EFAULT;
3069 return 0;
3070}
3071
3072int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg)
3073{
3074 sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg;
3075 sg_io_hdr_t __user *sgio;
3076 u16 iovec_count;
3077 u32 data;
3078 void __user *dxferp;
3079 int err;
3080 int interface_id;
3081
3082 if (get_user(interface_id, &sgio32->interface_id))
3083 return -EFAULT;
3084 if (interface_id != 'S')
3085 return -EINVAL;
3086
3087 if (get_user(iovec_count, &sgio32->iovec_count))
3088 return -EFAULT;
3089
3090 {
3091 void __user *top = compat_alloc_user_space(0);
3092 void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
3093 (iovec_count * sizeof(sg_iovec_t)));
3094 if (new > top)
3095 return -EINVAL;
3096
3097 sgio = new;
3098 }
3099
3100 /* Ok, now construct. */
3101 if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
3102 (2 * sizeof(int)) +
3103 (2 * sizeof(unsigned char)) +
3104 (1 * sizeof(unsigned short)) +
3105 (1 * sizeof(unsigned int))))
3106 return -EFAULT;
3107
3108 if (get_user(data, &sgio32->dxferp))
3109 return -EFAULT;
3110 dxferp = compat_ptr(data);
3111 if (iovec_count) {
3112 if (sg_build_iovec(sgio, dxferp, iovec_count))
3113 return -EFAULT;
3114 } else {
3115 if (put_user(dxferp, &sgio->dxferp))
3116 return -EFAULT;
3117 }
3118
3119 {
3120 unsigned char __user *cmdp;
3121 unsigned char __user *sbp;
3122
3123 if (get_user(data, &sgio32->cmdp))
3124 return -EFAULT;
3125 cmdp = compat_ptr(data);
3126
3127 if (get_user(data, &sgio32->sbp))
3128 return -EFAULT;
3129 sbp = compat_ptr(data);
3130
3131 if (put_user(cmdp, &sgio->cmdp) ||
3132 put_user(sbp, &sgio->sbp))
3133 return -EFAULT;
3134 }
3135
3136 if (copy_in_user(&sgio->timeout, &sgio32->timeout,
3137 3 * sizeof(int)))
3138 return -EFAULT;
3139
3140 if (get_user(data, &sgio32->usr_ptr))
3141 return -EFAULT;
3142 if (put_user(compat_ptr(data), &sgio->usr_ptr))
3143 return -EFAULT;
3144
3145 err = nvme_sg_io(ns, sgio);
3146 if (err >= 0) {
3147 void __user *datap;
3148
3149 if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
3150 sizeof(int)) ||
3151 get_user(datap, &sgio->usr_ptr) ||
3152 put_user((u32)(unsigned long)datap,
3153 &sgio32->usr_ptr) ||
3154 copy_in_user(&sgio32->status, &sgio->status,
3155 (4 * sizeof(unsigned char)) +
3156 (2 * sizeof(unsigned short)) +
3157 (3 * sizeof(int))))
3158 err = -EFAULT;
3159 }
3160
3161 return err;
3162}
3163#endif
3164
3165int nvme_sg_get_version_num(int __user *ip) 3027int nvme_sg_get_version_num(int __user *ip)
3166{ 3028{
3167 return put_user(sg_version_num, ip); 3029 return put_user(sg_version_num, ip);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 40ee7705df63..ac8c62cb4875 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -112,37 +112,16 @@ static const struct block_device_operations rsxx_fops = {
112 112
113static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) 113static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
114{ 114{
115 struct hd_struct *part0 = &card->gendisk->part0; 115 generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
116 int rw = bio_data_dir(bio); 116 &card->gendisk->part0);
117 int cpu;
118
119 cpu = part_stat_lock();
120
121 part_round_stats(cpu, part0);
122 part_inc_in_flight(part0, rw);
123
124 part_stat_unlock();
125} 117}
126 118
127static void disk_stats_complete(struct rsxx_cardinfo *card, 119static void disk_stats_complete(struct rsxx_cardinfo *card,
128 struct bio *bio, 120 struct bio *bio,
129 unsigned long start_time) 121 unsigned long start_time)
130{ 122{
131 struct hd_struct *part0 = &card->gendisk->part0; 123 generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0,
132 unsigned long duration = jiffies - start_time; 124 start_time);
133 int rw = bio_data_dir(bio);
134 int cpu;
135
136 cpu = part_stat_lock();
137
138 part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio));
139 part_stat_inc(cpu, part0, ios[rw]);
140 part_stat_add(cpu, part0, ticks[rw], duration);
141
142 part_round_stats(cpu, part0);
143 part_dec_in_flight(part0, rw);
144
145 part_stat_unlock();
146} 125}
147 126
148static void bio_dma_done_cb(struct rsxx_cardinfo *card, 127static void bio_dma_done_cb(struct rsxx_cardinfo *card,
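The rsxx hunk above, like the bcache, dm and md hunks further down, replaces open-coded part_stat bookkeeping with the generic helpers added in 3.19/core. A minimal sketch of the resulting pattern for a bio-based driver follows; the wrapper names example_acct_start()/example_acct_end() are illustrative, while the two generic_*_io_acct() calls and their arguments are exactly what the converted drivers use.

static void example_acct_start(struct gendisk *disk, struct bio *bio)
{
	/* Accounts ios[rw] and sectors[rw], and bumps in_flight on part0. */
	generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
			      &disk->part0);
}

static void example_acct_end(struct gendisk *disk, struct bio *bio,
			     unsigned long start_time)
{
	/* Accounts ticks[rw] and drops in_flight; start_time is in jiffies. */
	generic_end_io_acct(bio_data_dir(bio), &disk->part0, start_time);
}

The per-cpu part_stat_lock()/part_stat_unlock() handling now lives inside the helpers, which is why each converted driver also drops its local cpu variable.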
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5ac312f6e0be..2236c6f31608 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -126,7 +126,6 @@ struct blkfront_info
126 unsigned int persistent_gnts_c; 126 unsigned int persistent_gnts_c;
127 unsigned long shadow_free; 127 unsigned long shadow_free;
128 unsigned int feature_flush; 128 unsigned int feature_flush;
129 unsigned int flush_op;
130 unsigned int feature_discard:1; 129 unsigned int feature_discard:1;
131 unsigned int feature_secdiscard:1; 130 unsigned int feature_secdiscard:1;
132 unsigned int discard_granularity; 131 unsigned int discard_granularity;
@@ -479,7 +478,19 @@ static int blkif_queue_request(struct request *req)
479 * way. (It's also a FLUSH+FUA, since it is 478 * way. (It's also a FLUSH+FUA, since it is
480 * guaranteed ordered WRT previous writes.) 479 * guaranteed ordered WRT previous writes.)
481 */ 480 */
482 ring_req->operation = info->flush_op; 481 switch (info->feature_flush &
482 ((REQ_FLUSH|REQ_FUA))) {
483 case REQ_FLUSH|REQ_FUA:
484 ring_req->operation =
485 BLKIF_OP_WRITE_BARRIER;
486 break;
487 case REQ_FLUSH:
488 ring_req->operation =
489 BLKIF_OP_FLUSH_DISKCACHE;
490 break;
491 default:
492 ring_req->operation = 0;
493 }
483 } 494 }
484 ring_req->u.rw.nr_segments = nseg; 495 ring_req->u.rw.nr_segments = nseg;
485 } 496 }
@@ -582,12 +593,14 @@ static inline void flush_requests(struct blkfront_info *info)
582 notify_remote_via_irq(info->irq); 593 notify_remote_via_irq(info->irq);
583} 594}
584 595
585static inline bool blkif_request_flush_valid(struct request *req, 596static inline bool blkif_request_flush_invalid(struct request *req,
586 struct blkfront_info *info) 597 struct blkfront_info *info)
587{ 598{
588 return ((req->cmd_type != REQ_TYPE_FS) || 599 return ((req->cmd_type != REQ_TYPE_FS) ||
589 ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && 600 ((req->cmd_flags & REQ_FLUSH) &&
590 !info->flush_op)); 601 !(info->feature_flush & REQ_FLUSH)) ||
602 ((req->cmd_flags & REQ_FUA) &&
603 !(info->feature_flush & REQ_FUA)));
591} 604}
592 605
593/* 606/*
@@ -612,8 +625,8 @@ static void do_blkif_request(struct request_queue *rq)
612 625
613 blk_start_request(req); 626 blk_start_request(req);
614 627
615 if (blkif_request_flush_valid(req, info)) { 628 if (blkif_request_flush_invalid(req, info)) {
616 __blk_end_request_all(req, -EIO); 629 __blk_end_request_all(req, -EOPNOTSUPP);
617 continue; 630 continue;
618 } 631 }
619 632
@@ -683,20 +696,26 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
683 return 0; 696 return 0;
684} 697}
685 698
699static const char *flush_info(unsigned int feature_flush)
700{
701 switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) {
702 case REQ_FLUSH|REQ_FUA:
703 return "barrier: enabled;";
704 case REQ_FLUSH:
705 return "flush diskcache: enabled;";
706 default:
707 return "barrier or flush: disabled;";
708 }
709}
686 710
687static void xlvbd_flush(struct blkfront_info *info) 711static void xlvbd_flush(struct blkfront_info *info)
688{ 712{
689 blk_queue_flush(info->rq, info->feature_flush); 713 blk_queue_flush(info->rq, info->feature_flush);
690 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", 714 pr_info("blkfront: %s: %s %s %s %s %s\n",
691 info->gd->disk_name, 715 info->gd->disk_name, flush_info(info->feature_flush),
692 info->flush_op == BLKIF_OP_WRITE_BARRIER ? 716 "persistent grants:", info->feature_persistent ?
693 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 717 "enabled;" : "disabled;", "indirect descriptors:",
694 "flush diskcache" : "barrier or flush"), 718 info->max_indirect_segments ? "enabled;" : "disabled;");
695 info->feature_flush ? "enabled;" : "disabled;",
696 "persistent grants:",
697 info->feature_persistent ? "enabled;" : "disabled;",
698 "indirect descriptors:",
699 info->max_indirect_segments ? "enabled;" : "disabled;");
700} 719}
701 720
702static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 721static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -1188,7 +1207,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1188 if (error == -EOPNOTSUPP) 1207 if (error == -EOPNOTSUPP)
1189 error = 0; 1208 error = 0;
1190 info->feature_flush = 0; 1209 info->feature_flush = 0;
1191 info->flush_op = 0;
1192 xlvbd_flush(info); 1210 xlvbd_flush(info);
1193 } 1211 }
1194 /* fall through */ 1212 /* fall through */
@@ -1808,7 +1826,6 @@ static void blkfront_connect(struct blkfront_info *info)
1808 physical_sector_size = sector_size; 1826 physical_sector_size = sector_size;
1809 1827
1810 info->feature_flush = 0; 1828 info->feature_flush = 0;
1811 info->flush_op = 0;
1812 1829
1813 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1830 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1814 "feature-barrier", "%d", &barrier, 1831 "feature-barrier", "%d", &barrier,
@@ -1821,10 +1838,8 @@ static void blkfront_connect(struct blkfront_info *info)
1821 * 1838 *
1822 * If there are barriers, then we use flush. 1839 * If there are barriers, then we use flush.
1823 */ 1840 */
1824 if (!err && barrier) { 1841 if (!err && barrier)
1825 info->feature_flush = REQ_FLUSH | REQ_FUA; 1842 info->feature_flush = REQ_FLUSH | REQ_FUA;
1826 info->flush_op = BLKIF_OP_WRITE_BARRIER;
1827 }
1828 /* 1843 /*
1829 * And if there is "feature-flush-cache" use that above 1844 * And if there is "feature-flush-cache" use that above
1830 * barriers. 1845 * barriers.
@@ -1833,10 +1848,8 @@ static void blkfront_connect(struct blkfront_info *info)
1833 "feature-flush-cache", "%d", &flush, 1848 "feature-flush-cache", "%d", &flush,
1834 NULL); 1849 NULL);
1835 1850
1836 if (!err && flush) { 1851 if (!err && flush)
1837 info->feature_flush = REQ_FLUSH; 1852 info->feature_flush = REQ_FLUSH;
1838 info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1839 }
1840 1853
1841 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1854 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1842 "feature-discard", "%d", &discard, 1855 "feature-discard", "%d", &discard,
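With flush_op removed, the xen-blkfront hunks above derive the ring operation from feature_flush at submission time. The helper below is only a condensed restatement of the open-coded switch in blkif_queue_request(); the name blkif_ring_flush_op() is not part of the patch.

static unsigned int blkif_ring_flush_op(struct blkfront_info *info)
{
	switch (info->feature_flush & (REQ_FLUSH | REQ_FUA)) {
	case REQ_FLUSH | REQ_FUA:
		return BLKIF_OP_WRITE_BARRIER;		/* backend offers barriers */
	case REQ_FLUSH:
		return BLKIF_OP_FLUSH_DISKCACHE;	/* flush-only backend */
	default:
		return 0;				/* no flush support */
	}
}

In the patch this switch only runs for requests carrying REQ_FLUSH or REQ_FUA, and blkif_request_flush_invalid() now rejects the combinations the backend cannot honour with -EOPNOTSUPP rather than the old -EIO.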
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 62e6e98186b5..ab43faddb447 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -601,13 +601,8 @@ static void request_endio(struct bio *bio, int error)
601static void bio_complete(struct search *s) 601static void bio_complete(struct search *s)
602{ 602{
603 if (s->orig_bio) { 603 if (s->orig_bio) {
604 int cpu, rw = bio_data_dir(s->orig_bio); 604 generic_end_io_acct(bio_data_dir(s->orig_bio),
605 unsigned long duration = jiffies - s->start_time; 605 &s->d->disk->part0, s->start_time);
606
607 cpu = part_stat_lock();
608 part_round_stats(cpu, &s->d->disk->part0);
609 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
610 part_stat_unlock();
611 606
612 trace_bcache_request_end(s->d, s->orig_bio); 607 trace_bcache_request_end(s->d, s->orig_bio);
613 bio_endio(s->orig_bio, s->iop.error); 608 bio_endio(s->orig_bio, s->iop.error);
@@ -959,12 +954,9 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
959 struct search *s; 954 struct search *s;
960 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 955 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
961 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 956 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
962 int cpu, rw = bio_data_dir(bio); 957 int rw = bio_data_dir(bio);
963 958
964 cpu = part_stat_lock(); 959 generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
965 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
966 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
967 part_stat_unlock();
968 960
969 bio->bi_bdev = dc->bdev; 961 bio->bi_bdev = dc->bdev;
970 bio->bi_iter.bi_sector += dc->sb.data_offset; 962 bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1074,12 +1066,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1074 struct search *s; 1066 struct search *s;
1075 struct closure *cl; 1067 struct closure *cl;
1076 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 1068 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1077 int cpu, rw = bio_data_dir(bio); 1069 int rw = bio_data_dir(bio);
1078 1070
1079 cpu = part_stat_lock(); 1071 generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
1080 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1081 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1082 part_stat_unlock();
1083 1072
1084 s = search_alloc(bio, d); 1073 s = search_alloc(bio, d);
1085 cl = &s->cl; 1074 cl = &s->cl;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8f37ed215b19..4c06585bf165 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -605,13 +605,10 @@ static void end_io_acct(struct dm_io *io)
605 struct mapped_device *md = io->md; 605 struct mapped_device *md = io->md;
606 struct bio *bio = io->bio; 606 struct bio *bio = io->bio;
607 unsigned long duration = jiffies - io->start_time; 607 unsigned long duration = jiffies - io->start_time;
608 int pending, cpu; 608 int pending;
609 int rw = bio_data_dir(bio); 609 int rw = bio_data_dir(bio);
610 610
611 cpu = part_stat_lock(); 611 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
612 part_round_stats(cpu, &dm_disk(md)->part0);
613 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
614 part_stat_unlock();
615 612
616 if (unlikely(dm_stats_used(&md->stats))) 613 if (unlikely(dm_stats_used(&md->stats)))
617 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 614 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
@@ -1651,16 +1648,12 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1651{ 1648{
1652 int rw = bio_data_dir(bio); 1649 int rw = bio_data_dir(bio);
1653 struct mapped_device *md = q->queuedata; 1650 struct mapped_device *md = q->queuedata;
1654 int cpu;
1655 int srcu_idx; 1651 int srcu_idx;
1656 struct dm_table *map; 1652 struct dm_table *map;
1657 1653
1658 map = dm_get_live_table(md, &srcu_idx); 1654 map = dm_get_live_table(md, &srcu_idx);
1659 1655
1660 cpu = part_stat_lock(); 1656 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1661 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1662 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1663 part_stat_unlock();
1664 1657
1665 /* if we're suspended, we have to queue this io for later */ 1658 /* if we're suspended, we have to queue this io for later */
1666 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1659 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9233c71138f1..056ccd28c037 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -247,7 +247,6 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
247{ 247{
248 const int rw = bio_data_dir(bio); 248 const int rw = bio_data_dir(bio);
249 struct mddev *mddev = q->queuedata; 249 struct mddev *mddev = q->queuedata;
250 int cpu;
251 unsigned int sectors; 250 unsigned int sectors;
252 251
253 if (mddev == NULL || mddev->pers == NULL 252 if (mddev == NULL || mddev->pers == NULL
@@ -284,10 +283,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
284 sectors = bio_sectors(bio); 283 sectors = bio_sectors(bio);
285 mddev->pers->make_request(mddev, bio); 284 mddev->pers->make_request(mddev, bio);
286 285
287 cpu = part_stat_lock(); 286 generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
288 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
289 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
290 part_stat_unlock();
291 287
292 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 288 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
293 wake_up(&mddev->sb_wait); 289 wake_up(&mddev->sb_wait);