From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 19 Oct 2010 09:05:00 +0200 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. When reloading partition tables, quiesce IO to ensure that no request references to the partition struct exists. When it is safe to free the partition table, the IO for that device is restarted again. Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2c958f4fce1e..df1ee866d715 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -121,6 +121,8 @@ extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); +extern void elv_quiesce_start(struct request_queue *); +extern void elv_quiesce_end(struct request_queue *); /* * io scheduler registration -- cgit v1.2.2 From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 24 Oct 2010 22:06:02 +0200 Subject: Revert "block: fix accounting bug on cross partition merges" This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327. Conflicts: include/linux/genhd.h It has numerous issues with the cleanup path and non-elevator devices. Revert it for now so we can come up with a clean version without rushing things. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 80a0ece8f7e4..4fd978e7eb83 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -122,8 +122,6 @@ extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *, struct request *, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); -extern void elv_quiesce_start(struct request_queue *); -extern void elv_quiesce_end(struct request_queue *); /* * io scheduler registration -- cgit v1.2.2 From 909ea96468096b07fbb41aaf69be060d92bd9271 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 8 Dec 2010 16:22:55 +0100 Subject: core: Replace __get_cpu_var with __this_cpu_read if not used for an address. __get_cpu_var() can be replaced with this_cpu_read and will then use a single read instruction with implied address calculation to access the correct per cpu instance. However, the address of a per cpu variable passed to __this_cpu_read() cannot be determined (since it's an implied address conversion through segment prefixes). Therefore apply this only to uses of __get_cpu_var where the address of the variable is not used. Cc: Pekka Enberg Cc: Hugh Dickins Cc: Thomas Gleixner Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/elevator.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4fd978e7eb83..4d857973d2c9 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -195,15 +195,9 @@ enum { /* * io context count accounting */ -#define elv_ioc_count_mod(name, __val) \ - do { \ - preempt_disable(); \ - __get_cpu_var(name) += (__val); \ - preempt_enable(); \ - } while (0) - -#define elv_ioc_count_inc(name) elv_ioc_count_mod(name, 1) -#define elv_ioc_count_dec(name) elv_ioc_count_mod(name, -1) +#define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val) +#define elv_ioc_count_inc(name) this_cpu_inc(name) +#define elv_ioc_count_dec(name) this_cpu_dec(name) #define elv_ioc_count_read(name) \ ({ \ -- cgit v1.2.2 From ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:54 +0100 Subject: block: reimplement FLUSH/FUA to support merge The current FLUSH/FUA support has evolved from the implementation which had to perform queue draining. As such, sequencing is done queue-wide one flush request after another. However, with the draining requirement gone, there's no reason to keep the queue-wide sequential approach. This patch reimplements FLUSH/FUA support such that each FLUSH/FUA request is sequenced individually. The actual FLUSH execution is double buffered and whenever a request wants to execute one for either PRE or POSTFLUSH, it queues on the pending queue. Once certain conditions are met, a flush request is issued and on its completion all pending requests proceed to the next sequence. This allows arbitrary merging of different type of flushes. How they are merged can be primarily controlled and tuned by adjusting the above said 'conditions' used to determine when to issue the next flush. This is inspired by Darrick's patches to merge multiple zero-data flushes which helps workloads with highly concurrent fsync requests. * As flush requests are never put on the IO scheduler, request fields used for flush share space with rq->rb_node. rq->completion_data is moved out of the union. This increases the request size by one pointer. As rq->elevator_private* are used only by the iosched too, it is possible to reduce the request size further. However, to do that, we need to modify request allocation path such that iosched data is not allocated for flush requests. * FLUSH/FUA processing happens on insertion now instead of dispatch. - Comments updated as per Vivek and Mike. Signed-off-by: Tejun Heo Cc: "Darrick J. Wong" Cc: Shaohua Li Cc: Christoph Hellwig Cc: Vivek Goyal Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4fd978e7eb83..86120c916fcc 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 +#define ELEVATOR_INSERT_FLUSH 5 /* * return values from elevator_may_queue_fn -- cgit v1.2.2 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 39b68edb388d..8857cf9adbb7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -105,6 +105,7 @@ extern void elv_add_request(struct request_queue *, struct request *, int, int); extern void __elv_add_request(struct request_queue *, struct request *, int, int); extern void elv_insert(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); +extern int elv_try_merge(struct request *, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); -- cgit v1.2.2 From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- include/linux/elevator.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 8857cf9adbb7..ec6f72b84477 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -20,7 +20,6 @@ typedef void (elevator_bio_merged_fn) (struct request_queue *, typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_queue_empty_fn) (struct request_queue *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); typedef int (elevator_may_queue_fn) (struct request_queue *, int); @@ -46,7 +45,6 @@ struct elevator_ops elevator_activate_req_fn *elevator_activate_req_fn; elevator_deactivate_req_fn *elevator_deactivate_req_fn; - elevator_queue_empty_fn *elevator_queue_empty_fn; elevator_completed_req_fn *elevator_completed_req_fn; elevator_request_list_fn *elevator_former_req_fn; @@ -101,8 +99,8 @@ struct elevator_queue */ extern void elv_dispatch_sort(struct request_queue *, struct request *); extern void elv_dispatch_add_tail(struct request_queue *, struct request *); -extern void elv_add_request(struct request_queue *, struct request *, int, int); -extern void __elv_add_request(struct request_queue *, struct request *, int, int); +extern void elv_add_request(struct request_queue *, struct request *, int); +extern void __elv_add_request(struct request_queue *, struct request *, int); extern void elv_insert(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern int elv_try_merge(struct request *, struct bio *); @@ -112,7 +110,6 @@ extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); extern void elv_requeue_request(struct request_queue *, struct request *); -extern int elv_queue_empty(struct request_queue *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); -- cgit v1.2.2 From 5e84ea3a9c662dc2d7a48703a4468fad954a3b7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Mar 2011 10:14:27 +0100 Subject: block: attempt to merge with existing requests on plug flush One of the disadvantages of on-stack plugging is that we potentially lose out on merging since all pending IO isn't always visible to everybody. When we flush the on-stack plugs, right now we don't do any checks to see if potential merge candidates could be utilized. Correct this by adding a new insert variant, ELEVATOR_INSERT_SORT_MERGE. It works just ELEVATOR_INSERT_SORT, but first checks whether we can merge with an existing request before doing the insertion (if we fail merging). This fixes a regression with multiple processes issuing IO that can be merged. Thanks to Shaohua Li for testing and fixing an accounting bug. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ec6f72b84477..d93efcc44570 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -166,6 +166,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 #define ELEVATOR_INSERT_FLUSH 5 +#define ELEVATOR_INSERT_SORT_MERGE 6 /* * return values from elevator_may_queue_fn -- cgit v1.2.2 From b710a480554f2be682bac3cb59b0e085ba3d644b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2011 09:52:30 +0200 Subject: block: get rid of elv_insert() interface Merge it with __elv_add_request(), it's pretty pointless to have a function with only two callers. The main interface is elv_add_request()/__elv_add_request(). Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/elevator.h') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index d93efcc44570..21a8ebf2dc3a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -101,7 +101,6 @@ extern void elv_dispatch_sort(struct request_queue *, struct request *); extern void elv_dispatch_add_tail(struct request_queue *, struct request *); extern void elv_add_request(struct request_queue *, struct request *, int); extern void __elv_add_request(struct request_queue *, struct request *, int); -extern void elv_insert(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern int elv_try_merge(struct request *, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, -- cgit v1.2.2