aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeil Brown <neilb@suse.de>2007-05-01 03:53:42 -0400
committerJens Axboe <axboe@nelson.home.kernel.dk>2007-05-11 07:28:37 -0400
commitd89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 (patch)
tree8e87952d6e016192220aa090a8121e357a951d8f
parent129a84de2347002f09721cda3155ccfd19fade40 (diff)
When stacked block devices are in-use (e.g. md or dm), the recursive calls
to generic_make_request can use up a lot of space, and we would rather they didn't. As generic_make_request is a void function, and as it is generally not expected that it will have any effect immediately, it is safe to delay any call to generic_make_request until there is sufficient stack space available. As ->bi_next is reserved for the driver to use, it can have no valid value when generic_make_request is called, and as __make_request implicitly assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be certain that all callers set it to NULL. We can therefore safely use bi_next to link pending requests together, providing we clear it before making the real call. So, we choose to allow each thread to only be active in one generic_make_request at a time. If a subsequent (recursive) call is made, the bio is linked into a per-thread list, and is handled when the active call completes. As the list of pending bios is per-thread, there are no locking issues to worry about. I say above that it is "safe to delay any call...". There are, however, some behaviours of a make_request_fn which would make it unsafe. These include any behaviour that assumes anything will have changed after a recursive call to generic_make_request. These could include: - waiting for that call to finish and call it's bi_end_io function. md use to sometimes do this (marking the superblock dirty before completing a write) but doesn't any more - inspecting the bio for fields that generic_make_request might change, such as bi_sector or bi_bdev. It is hard to see a good reason for this, and I don't think anyone actually does it. - inspecing the queue to see if, e.g. it is 'full' yet. Again, I think this is very unlikely to be useful, or to be done. Signed-off-by: Neil Brown <neilb@suse.de> Cc: Jens Axboe <axboe@kernel.dk> Cc: <dm-devel@redhat.com> Alasdair G Kergon <agk@redhat.com> said: I can see nothing wrong with this in principle. For device-mapper at the moment though it's essential that, while the bio mappings may now get delayed, they still get processed in exactly the same order as they were passed to generic_make_request(). My main concern is whether the timing changes implicit in this patch will make the rare data-corrupting races in the existing snapshot code more likely. (I'm working on a fix for these races, but the unfinished patch is already several hundred lines long.) It would be helpful if some people on this mailing list would test this patch in various scenarios and report back. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r--block/ll_rw_blk.c53
-rw-r--r--include/linux/sched.h4
2 files changed, 56 insertions, 1 deletions
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 17e188973428..74a567afb830 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3116,7 +3116,7 @@ static inline int should_fail_request(struct bio *bio)
3116 * bi_sector for remaps as it sees fit. So the values of these fields 3116 * bi_sector for remaps as it sees fit. So the values of these fields
3117 * should NOT be depended on after the call to generic_make_request. 3117 * should NOT be depended on after the call to generic_make_request.
3118 */ 3118 */
3119void generic_make_request(struct bio *bio) 3119static inline void __generic_make_request(struct bio *bio)
3120{ 3120{
3121 request_queue_t *q; 3121 request_queue_t *q;
3122 sector_t maxsector; 3122 sector_t maxsector;
@@ -3215,6 +3215,57 @@ end_io:
3215 } while (ret); 3215 } while (ret);
3216} 3216}
3217 3217
3218/*
3219 * We only want one ->make_request_fn to be active at a time,
3220 * else stack usage with stacked devices could be a problem.
3221 * So use current->bio_{list,tail} to keep a list of requests
3222 * submited by a make_request_fn function.
3223 * current->bio_tail is also used as a flag to say if
3224 * generic_make_request is currently active in this task or not.
3225 * If it is NULL, then no make_request is active. If it is non-NULL,
3226 * then a make_request is active, and new requests should be added
3227 * at the tail
3228 */
3229void generic_make_request(struct bio *bio)
3230{
3231 if (current->bio_tail) {
3232 /* make_request is active */
3233 *(current->bio_tail) = bio;
3234 bio->bi_next = NULL;
3235 current->bio_tail = &bio->bi_next;
3236 return;
3237 }
3238 /* following loop may be a bit non-obvious, and so deserves some
3239 * explanation.
3240 * Before entering the loop, bio->bi_next is NULL (as all callers
3241 * ensure that) so we have a list with a single bio.
3242 * We pretend that we have just taken it off a longer list, so
3243 * we assign bio_list to the next (which is NULL) and bio_tail
3244 * to &bio_list, thus initialising the bio_list of new bios to be
3245 * added. __generic_make_request may indeed add some more bios
3246 * through a recursive call to generic_make_request. If it
3247 * did, we find a non-NULL value in bio_list and re-enter the loop
3248 * from the top. In this case we really did just take the bio
3249 * of the top of the list (no pretending) and so fixup bio_list and
3250 * bio_tail or bi_next, and call into __generic_make_request again.
3251 *
3252 * The loop was structured like this to make only one call to
3253 * __generic_make_request (which is important as it is large and
3254 * inlined) and to keep the structure simple.
3255 */
3256 BUG_ON(bio->bi_next);
3257 do {
3258 current->bio_list = bio->bi_next;
3259 if (bio->bi_next == NULL)
3260 current->bio_tail = &current->bio_list;
3261 else
3262 bio->bi_next = NULL;
3263 __generic_make_request(bio);
3264 bio = current->bio_list;
3265 } while (bio);
3266 current->bio_tail = NULL; /* deactivate */
3267}
3268
3218EXPORT_SYMBOL(generic_make_request); 3269EXPORT_SYMBOL(generic_make_request);
3219 3270
3220/** 3271/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17b72d88c4cb..e38c436ee12b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -88,6 +88,7 @@ struct sched_param {
88 88
89struct exec_domain; 89struct exec_domain;
90struct futex_pi_state; 90struct futex_pi_state;
91struct bio;
91 92
92/* 93/*
93 * List of flags we want to share for kernel threads, 94 * List of flags we want to share for kernel threads,
@@ -1014,6 +1015,9 @@ struct task_struct {
1014/* journalling filesystem info */ 1015/* journalling filesystem info */
1015 void *journal_info; 1016 void *journal_info;
1016 1017
1018/* stacked block device info */
1019 struct bio *bio_list, **bio_tail;
1020
1017/* VM state */ 1021/* VM state */
1018 struct reclaim_state *reclaim_state; 1022 struct reclaim_state *reclaim_state;
1019 1023