From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91..bd4369c83dfb 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -3,6 +3,7 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include +#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +#ifdef CONFIG_CGROUPS +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif #endif /* _LINUX_KTHREAD_H */ -- cgit v1.2.2 From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- include/linux/blk-cgroup.h | 12 ------------ 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c99516..9c75f58f6a50 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -522,13 +522,11 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9d92153dd856..0cfa8d2ef4d6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return task_get_css(task, io_cgrp_id); -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -735,12 +729,6 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return NULL; -} - #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.2 From 902ec5b6de0678ac2effba0faaafdd1c3e7fcf2e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:06 -0700 Subject: block: make blkcg aware of kthread stored original cgroup info bio_blkcg is the only API to get cgroup info for a bio right now. If bio_blkcg finds current task is a kthread and has original blkcg associated, it will use the css instead of associating the bio to current task. This makes it possible that kthread dispatches bios on behalf of other threads. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0cfa8d2ef4d6..f57e54d64529 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, io_cgrp_id)); -} - static inline struct blkcg *bio_blkcg(struct bio *bio) { + struct cgroup_subsys_state *css; + if (bio && bio->bi_css) return css_to_blkcg(bio->bi_css); - return task_blkcg(current); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } /** -- cgit v1.2.2 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index bd4369c83dfb..fb201842c635 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else -- cgit v1.2.2 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the vm takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf..ae2d25f01b98 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry); + bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.2 From 9ba4b2dfafaa711b41cc2102b0e9a529f3981218 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Sep 2017 08:58:25 -0600 Subject: fs: kill 'nr_pages' argument from wakeup_flusher_threads() Everybody is passing in 0 now, let's get rid of the argument. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d5815794416c..1f9c6db5e29a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,7 +189,7 @@ bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void wakeup_flusher_threads(enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.2 From 47410d88f665486bf91f02242ab5d5692b8887ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:25:03 -0600 Subject: writeback: remove 'range_cyclic' argument for wb_start_writeback() All the callers pass in 'true' for range_cyclic, so kill the argument. Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 854e1bdd0b2a..0f63493de9e7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -39,7 +39,7 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) } void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason); + enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.2 From 595043e5f9ef1d8263bd9fda215cade489227491 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:26:59 -0600 Subject: writeback: provide a wakeup_flusher_threads_bdi() Similar to wakeup_flusher_threads(), except that we only wake up the flusher threads on the specified backing device. No functional changes in this patch. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1f9c6db5e29a..9c0091678af4 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -190,6 +190,8 @@ bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.2 From 9dfb176fae57a1dea68531fd25e867037e4d9bac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:55 -0600 Subject: writeback: make wb_start_writeback() static We don't have any callers outside of fs-writeback.c anymore, make it private. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0f63493de9e7..157e950a70dc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,8 +38,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); } -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.2 From aac8d41cd438f25bf3110fc6b98f1d16d7dbc169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:55 -0600 Subject: writeback: only allow one inflight and pending full flush When someone calls wakeup_flusher_threads() or wakeup_flusher_threads_bdi(), they schedule writeback of all dirty pages in the system (or on that bdi). If we are tight on memory, we can get tons of these queued from kswapd/vmscan. This causes (at least) two problems: 1) We consume a ton of memory just allocating writeback work items. We've seen as much as 600 million of these writeback work items pending. That's a lot of memory to pointlessly hold hostage, while the box is under memory pressure. 2) We spend so much time processing these work items, that we introduce a softlockup in writeback processing. This is because each of the writeback work items don't end up doing any work (it's hard when you have millions of identical ones coming in to the flush machinery), so we just sit in a tight loop pulling work items and deleting/freeing them. Fix this by adding a 'start_all' bit to the writeback structure, and set that when someone attempts to flush all dirty pages. The bit is cleared when we start writeback on that work item. If the bit is already set when we attempt to queue !nr_pages writeback, then we simply ignore it. This provides us one full flush in flight, with one pending as well, and makes for more efficient handling of this type of writeback. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 866c433e7d32..420de5c7c7f9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -24,6 +24,7 @@ enum wb_state { WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ + WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_congested_state { -- cgit v1.2.2 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. uevent is posted against the fc_udev device. patch set tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d5..4ea03b9a5c8c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -446,6 +446,8 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); +void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); + /* -- cgit v1.2.2 From 85009b4f5f0399669a44f07cb9a5622c0e71d419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:09:06 -0600 Subject: writeback: eliminate work item allocation in bd_start_writeback() Handle start-all writeback like we do periodic or kupdate style writeback - by marking the bdi_writeback as needing a full flush, and simply waking the thread. This eliminates the need to allocate and queue a specific work item just for this purpose. After this change, we truly only ever have one of them running at any point in time. We mark the need to start all flushes, and the writeback thread will clear it once it has processed the request. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 23 +++++++++++++++++++++++ include/linux/writeback.h | 22 ---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 420de5c7c7f9..b7c7be6f5986 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -44,6 +44,28 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_VMSCAN, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; + /* * For cgroup writeback, multiple wb's may map to the same blkcg. Those * wb's can operate mostly independently but should share the congested @@ -116,6 +138,7 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c0091678af4..dd1d2c23f743 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,28 +41,6 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; -/* - * why some writeback work was initiated - */ -enum wb_reason { - WB_REASON_BACKGROUND, - WB_REASON_VMSCAN, - WB_REASON_SYNC, - WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, - WB_REASON_FREE_MORE_MEM, - WB_REASON_FS_FREE_SPACE, - /* - * There is no bdi forker thread any more and works are done - * by emergency worker, however, this is TPs userland visible - * and we'll be exposing exactly the same information, - * so it has a mismatch name. - */ - WB_REASON_FORKER_THREAD, - - WB_REASON_MAX, -}; - /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised -- cgit v1.2.2 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52..9fb71fc7d0e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.2 From 775d3a35dc3e13de55ec0e061c59e36faa7dd7f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Oct 2017 08:15:15 -0600 Subject: backing-dev: kill unused pdflush_proc_obsolete() After commit b35bd0d9f8a8, pdflush_proc_obsolete() is no longer used. Kill the function and declaration. Reported-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 157e950a70dc..872afa41abc2 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -172,8 +172,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { -- cgit v1.2.2 From 8264c3214f28b52b399d9e03bfa7feec275a0d71 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Mon, 9 Oct 2017 13:34:41 +0300 Subject: writeback: merge try_to_writeback_inodes_sb_nr() into caller Since commit 925a6efb8ff0c ("Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc") this function hasn't been used outside so stop exporting it. In addition we merge it into try_to_writeback_inodes_sb() which is the only caller. Also change return type of try_to_writeback_inodes_sb to void as the only user ext4 doesn't care. Reviewed-by: Jan Kara Signed-off-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/writeback.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dd1d2c23f743..e15ec14085ad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -163,9 +163,7 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, -- cgit v1.2.2 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except sum it to global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef..3385c89f402e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.2 From 7f66721a7d5bf6251f9c49aada9200c61ddcecc5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 12 Oct 2017 19:58:10 +0300 Subject: fs/block_dev: remove vfs_msg() interface Replaced by pr_err usage in commit ef51042472f5 ("block, dax: move "select DAX" from BLOCK to FS_DAX") Signed-off-by: Rakesh Pandit Acked-by: Ross Zwisler Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9fb71fc7d0e8..72637028f3c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -917,17 +917,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif -#ifdef CONFIG_PRINTK -#define vfs_msg(sb, level, fmt, ...) \ - __vfs_msg(sb, level, fmt, ##__VA_ARGS__) -#else -#define vfs_msg(sb, level, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __vfs_msg(sb, "", " "); \ -} while (0) -#endif - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.2 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If target type module e.g. pblk here is unloaded (rmmod) while module is in use (after creating target) system crashes. We fix this by using module API refcnt. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7dfa56ebbc6d..7b80ac911d26 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -460,6 +460,7 @@ struct nvm_tgt_type { /* For internal use */ struct list_head list; + struct module *owner; }; extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); -- cgit v1.2.2 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway and if we ever have a usecase we can introduce this again. This makes the logic easier and removes unnecessary checks. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7b80ac911d26..32ec35b5e18b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -481,7 +481,7 @@ extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int, int); + const struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.2 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32ec35b5e18b..4f0e4a0fd204 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -463,8 +463,6 @@ struct nvm_tgt_type { struct module *owner; }; -extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); - extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -480,9 +478,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int); -extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); @@ -491,8 +486,6 @@ extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern int nvm_dev_factory(struct nvm_dev *, int flags); - extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ -- cgit v1.2.2 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows to reuse the standard synchronous path trough blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk). Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4f0e4a0fd204..b7f111ff4d3b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,6 +56,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -69,6 +70,7 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; + nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -477,6 +479,7 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.2 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..6bc29f73a9aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.2 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa..cfd64e500d82 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.2 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e..6df8b14f1f6a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.2 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit to the io size on the initiator, capping the io to 1MB-4KB. Remove the define from the transport. I/O size will solely be limited by the LLDD sg limits. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4ea03b9a5c8c..2be4db353937 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -102,8 +102,6 @@ enum nvmefc_fcp_datadir { }; -#define NVME_FC_MAX_SEGMENTS 256 - /** * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport * to LLDD in order to perform a NVME FCP IO operation. -- cgit v1.2.2 From 7930d0a00ff5dbcc80f793d1a7a6b8de4e591f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:27 +0800 Subject: sbitmap: introduce __sbitmap_for_each_set() For blk-mq, we need to be able to iterate software queues starting from any queue in a round robin fashion, so introduce this helper. Reviewed-by: Omar Sandoval Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc45..0dcc60e820de 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** - * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int start, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index; + unsigned int nr; + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = &sb->map[i]; - unsigned int off, nr; + if (start >= sb->depth) + start = 0; + index = SB_NR_TO_INDEX(sb, start); + nr = SB_NR_TO_BIT(sb, start); - if (!word->word) - continue; + while (scanned < sb->depth) { + struct sbitmap_word *word = &sb->map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); - nr = 0; - off = i << sb->shift; + scanned += depth; + if (!word->word) + goto next; + + /* + * On the first iteration of the outer loop, we need to add the + * bit offset back to the size of the word for find_next_bit(). + * On all other iterations, nr is zero, so this is a noop. + */ + depth += nr; while (1) { - nr = find_next_bit(&word->word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(&word->word, depth, nr); + if (nr >= depth) break; - - if (!fn(sb, off + nr, data)) + if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } +next: + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.2 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..901457df3d64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.2 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64..e5e6becd57d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.2 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2be4db353937..496ff759f84c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -40,6 +40,8 @@ * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. * * Initialization values for dynamic port fields: * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must @@ -50,6 +52,7 @@ struct nvme_fc_port_info { u64 port_name; u32 port_role; u32 port_id; + u32 dev_loss_tmo; }; @@ -200,6 +203,9 @@ enum nvme_fc_obj_state { * The length of the buffer corresponds to the local_priv_sz * value specified in the nvme_fc_port_template supplied by * the LLDD. + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. To modify, lldd must call + * nvme_fc_set_remoteport_devloss(). * * Fields with dynamic values. Values may change base on link state. LLDD * may reference fields directly to change them. Initialized by the @@ -257,10 +263,9 @@ struct nvme_fc_remote_port { u32 port_role; u64 node_name; u64 port_name; - struct nvme_fc_local_port *localport; - void *private; + u32 dev_loss_tmo; /* dynamic fields */ u32 port_id; @@ -446,6 +451,8 @@ int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); +int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, + u32 dev_loss_tmo); /* -- cgit v1.2.2 From 8977f563845bdd61fc61c4dfb399270b7d5667c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:48 +0300 Subject: block: move REQ_NOWAIT This flag should be before the operation-specific REQ_NOUNMAP bit. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3385c89f402e..c4c6c8bced2e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -224,11 +224,11 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -245,9 +245,9 @@ enum req_flag_bits { #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.2 From 96222bcc732d0504363dc772637c50e53b4bd41e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:49 +0300 Subject: block: add REQ_DRV bit Set aside a bit in the request/bio flags for driver use. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c4c6c8bced2e..1b04085255fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -229,6 +229,9 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + /* for driver use */ + __REQ_DRV, + __REQ_NR_BITS, /* stops here */ }; @@ -249,6 +252,8 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_DRV (1ULL << __REQ_DRV) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.2 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9..eda3d25c0f68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.2 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helpers allows to bounce steal the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f68..fddda6a1f9b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions. * -- cgit v1.2.2 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..5c0ed5db33c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.2 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2..93aae3476f58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.2 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b5..225617dd0a3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; make_request_fn *make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit v1.2.2 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd..674641527da7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.2 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to do for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and santize are the only commands in this patch with known effects. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1..fd1d4508a612 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -267,6 +267,7 @@ enum { NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, }; struct nvme_lbaf { @@ -395,6 +396,21 @@ struct nvme_fw_slot_info_log { __u8 rsvd64[448]; }; +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -681,6 +697,7 @@ enum nvme_admin_opcode { nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, }; enum { @@ -712,6 +729,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), -- cgit v1.2.2 From 83f5f7ed72f316eda4c4c3833beebfe6578926f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 11:15:37 -0700 Subject: block: kill bio_kmap/kunmap_irq() There are no users of it anymore. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c75f58f6a50..1d7e63d7505f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -573,17 +573,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #endif -static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - -#define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) -#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.2 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f..d4eec19a6d3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.2 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 225617dd0a3f..1437ef4d8037 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.2 From 38dabe210fbab4e7e8a03670ab3ba42f247ea08f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:10 -0700 Subject: nvme: centralize AEN defines All the transports were unnecessarilly duplicating the AEN request accounting. This patch defines everything in one place. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fd1d4508a612..89ffa7eed2fd 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,6 +90,14 @@ enum { }; #define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ -- cgit v1.2.2 From e3d7874dcf175cca2dca7795d6453f637ad8ba9b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:14 -0700 Subject: nvme: send uevent for some asynchronous events This will give udev a chance to observe and handle asynchronous event notifications and clear the log to unmask future events of the same type. The driver will create a change uevent of the asyncronuos event result before submitting the next AEN request to the device if a completed AEN event is of type error, smart, command set or vendor specific, Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 89ffa7eed2fd..aea87f0d917b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -428,6 +428,10 @@ enum { }; enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, NVME_AER_NOTICE_NS_CHANGED = 0x0002, NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; -- cgit v1.2.2 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.2 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL into __GFP_DIRECT_RECLAIM. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d8037..1af5ddd0f631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.2 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352c..82b56609736a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.2 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the next patch to let the block layer core know whether or not a SCSI request queue has been quiesced. A quiesced SCSI queue namely only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f631..2147e2381a22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.2 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. * The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why while the hibernation image is being written or restored that SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests to be submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$( sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22..402c9d536ae1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.2 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736a..b326208277ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb..13ccfc9b210a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae1..e80ea1d31343 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.2 From 67f2519fe2903c4041c0e94394d14d372fe51399 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Tue, 24 Oct 2017 11:21:48 -0600 Subject: fs: guard_bio_eod() needs to consider partitions guard_bio_eod() needs to look at the partition capacity, not just the capacity of the whole device, when determining if truncation is necessary. [ 60.268688] attempt to access beyond end of device [ 60.268690] unknown-block(9,1): rw=0, want=67103509, limit=67103506 [ 60.268693] buffer_io_error: 2 callbacks suppressed [ 60.268696] Buffer I/O error on dev md1p7, logical block 4524305, async page read Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Cc: stable@vger.kernel.org # v4.13 Reviewed-by: Christoph Hellwig Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 93aae3476f58..ca10cc292187 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -243,6 +243,7 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } +extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) -- cgit v1.2.2 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This results in a lot of extra and pointless queue runs, potentially, on flush requests and (much worse) on tag starvation situations. This is observable just looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee..eb1e2cdffb31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.2