diff options
author | Coly Li <colyli@suse.de> | 2018-03-18 20:36:16 -0400 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2018-03-18 22:15:20 -0400 |
commit | 3fd47bfe55b00d5ac7b0a44c9301c07be39b1082 (patch) | |
tree | 32270c555e9f87fc2b4da0e7596ddbb0264d644e | |
parent | fadd94e05c02afec7b70b0b14915624f1782f578 (diff) |
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker which calls update_writeback_rate() periodically (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadata I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a delayed work different from others in bcache.
In its routine update_writeback_rate(), this delayed work re-arms
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release the memory of struct cache_set.
Then when the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer dereference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear: bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update to quit before canceling it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock caused by
dc->writeback_rate_update unexpectedly failing to quit, after time_out
seconds this function will give up and continue to call
cancel_delayed_work_sync().
And here I explain how this patch stops the self re-armed delayed work
properly with the above flags.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait until flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must already be re-
armed, or have quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate(), before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked first. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependencies inside update_writeback_rate() on struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arming
delayed work. After trying many different methods (e.g. holding dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | drivers/md/bcache/bcache.h | 9 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 38 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 3 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 29 |
4 files changed, 69 insertions, 10 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 12e5197f186c..b5ddb848cd31 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -258,10 +258,11 @@ struct bcache_device { | |||
258 | struct gendisk *disk; | 258 | struct gendisk *disk; |
259 | 259 | ||
260 | unsigned long flags; | 260 | unsigned long flags; |
261 | #define BCACHE_DEV_CLOSING 0 | 261 | #define BCACHE_DEV_CLOSING 0 |
262 | #define BCACHE_DEV_DETACHING 1 | 262 | #define BCACHE_DEV_DETACHING 1 |
263 | #define BCACHE_DEV_UNLINK_DONE 2 | 263 | #define BCACHE_DEV_UNLINK_DONE 2 |
264 | 264 | #define BCACHE_DEV_WB_RUNNING 3 | |
265 | #define BCACHE_DEV_RATE_DW_RUNNING 4 | ||
265 | unsigned nr_stripes; | 266 | unsigned nr_stripes; |
266 | unsigned stripe_size; | 267 | unsigned stripe_size; |
267 | atomic_t *stripe_sectors_dirty; | 268 | atomic_t *stripe_sectors_dirty; |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 020be4f1cd8b..e5be599338c5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -899,6 +899,31 @@ void bch_cached_dev_run(struct cached_dev *dc) | |||
899 | pr_debug("error creating sysfs link"); | 899 | pr_debug("error creating sysfs link"); |
900 | } | 900 | } |
901 | 901 | ||
902 | /* | ||
903 | * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed | ||
904 | * work dc->writeback_rate_update is running. Wait until the routine | ||
905 | * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to | ||
906 | * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out | ||
907 | * seconds, give up waiting here and continue to cancel it too. | ||
908 | */ | ||
909 | static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) | ||
910 | { | ||
911 | int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; | ||
912 | |||
913 | do { | ||
914 | if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, | ||
915 | &dc->disk.flags)) | ||
916 | break; | ||
917 | time_out--; | ||
918 | schedule_timeout_interruptible(1); | ||
919 | } while (time_out > 0); | ||
920 | |||
921 | if (time_out == 0) | ||
922 | pr_warn("give up waiting for dc->writeback_write_update to quit"); | ||
923 | |||
924 | cancel_delayed_work_sync(&dc->writeback_rate_update); | ||
925 | } | ||
926 | |||
902 | static void cached_dev_detach_finish(struct work_struct *w) | 927 | static void cached_dev_detach_finish(struct work_struct *w) |
903 | { | 928 | { |
904 | struct cached_dev *dc = container_of(w, struct cached_dev, detach); | 929 | struct cached_dev *dc = container_of(w, struct cached_dev, detach); |
@@ -911,7 +936,9 @@ static void cached_dev_detach_finish(struct work_struct *w) | |||
911 | 936 | ||
912 | mutex_lock(&bch_register_lock); | 937 | mutex_lock(&bch_register_lock); |
913 | 938 | ||
914 | cancel_delayed_work_sync(&dc->writeback_rate_update); | 939 | if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) |
940 | cancel_writeback_rate_update_dwork(dc); | ||
941 | |||
915 | if (!IS_ERR_OR_NULL(dc->writeback_thread)) { | 942 | if (!IS_ERR_OR_NULL(dc->writeback_thread)) { |
916 | kthread_stop(dc->writeback_thread); | 943 | kthread_stop(dc->writeback_thread); |
917 | dc->writeback_thread = NULL; | 944 | dc->writeback_thread = NULL; |
@@ -954,6 +981,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) | |||
954 | closure_get(&dc->disk.cl); | 981 | closure_get(&dc->disk.cl); |
955 | 982 | ||
956 | bch_writeback_queue(dc); | 983 | bch_writeback_queue(dc); |
984 | |||
957 | cached_dev_put(dc); | 985 | cached_dev_put(dc); |
958 | } | 986 | } |
959 | 987 | ||
@@ -1081,14 +1109,16 @@ static void cached_dev_free(struct closure *cl) | |||
1081 | { | 1109 | { |
1082 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | 1110 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); |
1083 | 1111 | ||
1084 | cancel_delayed_work_sync(&dc->writeback_rate_update); | 1112 | mutex_lock(&bch_register_lock); |
1113 | |||
1114 | if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) | ||
1115 | cancel_writeback_rate_update_dwork(dc); | ||
1116 | |||
1085 | if (!IS_ERR_OR_NULL(dc->writeback_thread)) | 1117 | if (!IS_ERR_OR_NULL(dc->writeback_thread)) |
1086 | kthread_stop(dc->writeback_thread); | 1118 | kthread_stop(dc->writeback_thread); |
1087 | if (dc->writeback_write_wq) | 1119 | if (dc->writeback_write_wq) |
1088 | destroy_workqueue(dc->writeback_write_wq); | 1120 | destroy_workqueue(dc->writeback_write_wq); |
1089 | 1121 | ||
1090 | mutex_lock(&bch_register_lock); | ||
1091 | |||
1092 | if (atomic_read(&dc->running)) | 1122 | if (atomic_read(&dc->running)) |
1093 | bd_unlink_disk_holder(dc->bdev, dc->disk.disk); | 1123 | bd_unlink_disk_holder(dc->bdev, dc->disk.disk); |
1094 | bcache_device_free(&dc->disk); | 1124 | bcache_device_free(&dc->disk); |
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 78cd7bd50fdd..55673508628f 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -309,7 +309,8 @@ STORE(bch_cached_dev) | |||
309 | bch_writeback_queue(dc); | 309 | bch_writeback_queue(dc); |
310 | 310 | ||
311 | if (attr == &sysfs_writeback_percent) | 311 | if (attr == &sysfs_writeback_percent) |
312 | schedule_delayed_work(&dc->writeback_rate_update, | 312 | if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) |
313 | schedule_delayed_work(&dc->writeback_rate_update, | ||
313 | dc->writeback_rate_update_seconds * HZ); | 314 | dc->writeback_rate_update_seconds * HZ); |
314 | 315 | ||
315 | mutex_unlock(&bch_register_lock); | 316 | mutex_unlock(&bch_register_lock); |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4dbeaaa575bf..8f98ef1038d3 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) | |||
115 | struct cached_dev, | 115 | struct cached_dev, |
116 | writeback_rate_update); | 116 | writeback_rate_update); |
117 | 117 | ||
118 | /* | ||
119 | * should check BCACHE_DEV_RATE_DW_RUNNING before calling | ||
120 | * cancel_delayed_work_sync(). | ||
121 | */ | ||
122 | set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); | ||
123 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | ||
124 | smp_mb(); | ||
125 | |||
126 | if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { | ||
127 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); | ||
128 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | ||
129 | smp_mb(); | ||
130 | return; | ||
131 | } | ||
132 | |||
118 | down_read(&dc->writeback_lock); | 133 | down_read(&dc->writeback_lock); |
119 | 134 | ||
120 | if (atomic_read(&dc->has_dirty) && | 135 | if (atomic_read(&dc->has_dirty) && |
@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) | |||
123 | 138 | ||
124 | up_read(&dc->writeback_lock); | 139 | up_read(&dc->writeback_lock); |
125 | 140 | ||
126 | schedule_delayed_work(&dc->writeback_rate_update, | 141 | if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { |
142 | schedule_delayed_work(&dc->writeback_rate_update, | ||
127 | dc->writeback_rate_update_seconds * HZ); | 143 | dc->writeback_rate_update_seconds * HZ); |
144 | } | ||
145 | |||
146 | /* | ||
147 | * should check BCACHE_DEV_RATE_DW_RUNNING before calling | ||
148 | * cancel_delayed_work_sync(). | ||
149 | */ | ||
150 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); | ||
151 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | ||
152 | smp_mb(); | ||
128 | } | 153 | } |
129 | 154 | ||
130 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | 155 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) |
@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) | |||
675 | dc->writeback_rate_p_term_inverse = 40; | 700 | dc->writeback_rate_p_term_inverse = 40; |
676 | dc->writeback_rate_i_term_inverse = 10000; | 701 | dc->writeback_rate_i_term_inverse = 10000; |
677 | 702 | ||
703 | WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); | ||
678 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 704 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
679 | } | 705 | } |
680 | 706 | ||
@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) | |||
693 | return PTR_ERR(dc->writeback_thread); | 719 | return PTR_ERR(dc->writeback_thread); |
694 | } | 720 | } |
695 | 721 | ||
722 | WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); | ||
696 | schedule_delayed_work(&dc->writeback_rate_update, | 723 | schedule_delayed_work(&dc->writeback_rate_update, |
697 | dc->writeback_rate_update_seconds * HZ); | 724 | dc->writeback_rate_update_seconds * HZ); |
698 | 725 | ||