aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorColy Li <colyli@suse.de>2018-03-18 20:36:16 -0400
committerJens Axboe <axboe@kernel.dk>2018-03-18 22:15:20 -0400
commit3fd47bfe55b00d5ac7b0a44c9301c07be39b1082 (patch)
tree32270c555e9f87fc2b4da0e7596ddbb0264d644e
parentfadd94e05c02afec7b70b0b14915624f1782f578 (diff)
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed worker to call function update_writeback_rate() in period (the interval is defined by dc->writeback_rate_update_seconds). When a metadata I/O error happens on cache device, bcache error handling routine bch_cache_set_error() will call bch_cache_set_unregister() to retire whole cache set. On the unregister code path, this delayed work is stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). dc->writeback_rate_update is a special delayed work, different from others in bcache. In its routine update_writeback_rate(), this delayed work re-arms itself. That means when cancel_delayed_work_sync() returns, this delayed work can still be executed after several seconds defined by dc->writeback_rate_update_seconds. The problem is, after cancel_delayed_work_sync() returns, the cache set unregister code path will continue and release memory of struct cache_set. Then the delayed work is scheduled to run, __update_writeback_rate() will reference the already released cache_set memory, and trigger a NULL pointer dereference fault. This patch introduces two more bcache device flags, - BCACHE_DEV_WB_RUNNING bit set: bcache device is in writeback mode and running, it is OK for dc->writeback_rate_update to re-arm itself. bit clear: bcache device is trying to stop dc->writeback_rate_update, this delayed work should not re-arm itself and should quit. - BCACHE_DEV_RATE_DW_RUNNING bit set: routine update_writeback_rate() is executing. bit clear: routine update_writeback_rate() quits. This patch also adds a function cancel_writeback_rate_update_dwork() to wait for dc->writeback_rate_update to quit before canceling it by calling cancel_delayed_work_sync(). In order to avoid a deadlock caused by an unexpected quit of dc->writeback_rate_update, after time_out seconds this function will give up and continue to call cancel_delayed_work_sync(). 
And here I explain how this patch stops the self re-armed delayed work properly with the above flags. update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling cancel_writeback_rate_update_dwork(), clear flag BCACHE_DEV_WB_RUNNING. Before calling cancel_delayed_work_sync(), wait until flag BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling cancel_delayed_work_sync(), dc->writeback_rate_update must already be re-armed, or have quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases the delayed work routine update_writeback_rate() won't be executed after cancel_delayed_work_sync() returns. Inside update_writeback_rate(), flag BCACHE_DEV_WB_RUNNING is checked before calling schedule_delayed_work(). If this flag is cleared, it means someone is about to stop the delayed work. Because flag BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() has to wait for this flag to be cleared, we don't need to worry about a race condition here. If update_writeback_rate() is scheduled to run after checking BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() in cancel_writeback_rate_update_dwork(), it is also safe. Because at this moment BCACHE_DEV_WB_RUNNING is cleared with a memory barrier. As I mentioned previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear and quit immediately. Because there are more dependencies inside update_writeback_rate() on struct cache_set memory, dc->writeback_rate_update is not a simple self re-arming delayed work. After trying many different methods (e.g. holding dc->count, or using locks), this is the only way I can find which works to properly stop the dc->writeback_rate_update delayed work. Changelog: v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING to bit indexes, for test_bit(). v2: Try to fix the race issue which is pointed out by Junhui. 
v1: The initial version for review Signed-off-by: Coly Li <colyli@suse.de> Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: Michael Lyle <mlyle@lyle.org> Cc: Hannes Reinecke <hare@suse.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--drivers/md/bcache/bcache.h9
-rw-r--r--drivers/md/bcache/super.c38
-rw-r--r--drivers/md/bcache/sysfs.c3
-rw-r--r--drivers/md/bcache/writeback.c29
4 files changed, 69 insertions, 10 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 12e5197f186c..b5ddb848cd31 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -258,10 +258,11 @@ struct bcache_device {
258 struct gendisk *disk; 258 struct gendisk *disk;
259 259
260 unsigned long flags; 260 unsigned long flags;
261#define BCACHE_DEV_CLOSING 0 261#define BCACHE_DEV_CLOSING 0
262#define BCACHE_DEV_DETACHING 1 262#define BCACHE_DEV_DETACHING 1
263#define BCACHE_DEV_UNLINK_DONE 2 263#define BCACHE_DEV_UNLINK_DONE 2
264 264#define BCACHE_DEV_WB_RUNNING 3
265#define BCACHE_DEV_RATE_DW_RUNNING 4
265 unsigned nr_stripes; 266 unsigned nr_stripes;
266 unsigned stripe_size; 267 unsigned stripe_size;
267 atomic_t *stripe_sectors_dirty; 268 atomic_t *stripe_sectors_dirty;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 020be4f1cd8b..e5be599338c5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -899,6 +899,31 @@ void bch_cached_dev_run(struct cached_dev *dc)
899 pr_debug("error creating sysfs link"); 899 pr_debug("error creating sysfs link");
900} 900}
901 901
902/*
903 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
904 * work dc->writeback_rate_update is running. Wait until the routine
905 * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
906 * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
907 * seconds, give up waiting here and continue to cancel it too.
908 */
909static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
910{
911 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
912
913 do {
914 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
915 &dc->disk.flags))
916 break;
917 time_out--;
918 schedule_timeout_interruptible(1);
919 } while (time_out > 0);
920
921 if (time_out == 0)
922 pr_warn("give up waiting for dc->writeback_write_update to quit");
923
924 cancel_delayed_work_sync(&dc->writeback_rate_update);
925}
926
902static void cached_dev_detach_finish(struct work_struct *w) 927static void cached_dev_detach_finish(struct work_struct *w)
903{ 928{
904 struct cached_dev *dc = container_of(w, struct cached_dev, detach); 929 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
@@ -911,7 +936,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
911 936
912 mutex_lock(&bch_register_lock); 937 mutex_lock(&bch_register_lock);
913 938
914 cancel_delayed_work_sync(&dc->writeback_rate_update); 939 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
940 cancel_writeback_rate_update_dwork(dc);
941
915 if (!IS_ERR_OR_NULL(dc->writeback_thread)) { 942 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
916 kthread_stop(dc->writeback_thread); 943 kthread_stop(dc->writeback_thread);
917 dc->writeback_thread = NULL; 944 dc->writeback_thread = NULL;
@@ -954,6 +981,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
954 closure_get(&dc->disk.cl); 981 closure_get(&dc->disk.cl);
955 982
956 bch_writeback_queue(dc); 983 bch_writeback_queue(dc);
984
957 cached_dev_put(dc); 985 cached_dev_put(dc);
958} 986}
959 987
@@ -1081,14 +1109,16 @@ static void cached_dev_free(struct closure *cl)
1081{ 1109{
1082 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1110 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1083 1111
1084 cancel_delayed_work_sync(&dc->writeback_rate_update); 1112 mutex_lock(&bch_register_lock);
1113
1114 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1115 cancel_writeback_rate_update_dwork(dc);
1116
1085 if (!IS_ERR_OR_NULL(dc->writeback_thread)) 1117 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1086 kthread_stop(dc->writeback_thread); 1118 kthread_stop(dc->writeback_thread);
1087 if (dc->writeback_write_wq) 1119 if (dc->writeback_write_wq)
1088 destroy_workqueue(dc->writeback_write_wq); 1120 destroy_workqueue(dc->writeback_write_wq);
1089 1121
1090 mutex_lock(&bch_register_lock);
1091
1092 if (atomic_read(&dc->running)) 1122 if (atomic_read(&dc->running))
1093 bd_unlink_disk_holder(dc->bdev, dc->disk.disk); 1123 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1094 bcache_device_free(&dc->disk); 1124 bcache_device_free(&dc->disk);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 78cd7bd50fdd..55673508628f 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -309,7 +309,8 @@ STORE(bch_cached_dev)
309 bch_writeback_queue(dc); 309 bch_writeback_queue(dc);
310 310
311 if (attr == &sysfs_writeback_percent) 311 if (attr == &sysfs_writeback_percent)
312 schedule_delayed_work(&dc->writeback_rate_update, 312 if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
313 schedule_delayed_work(&dc->writeback_rate_update,
313 dc->writeback_rate_update_seconds * HZ); 314 dc->writeback_rate_update_seconds * HZ);
314 315
315 mutex_unlock(&bch_register_lock); 316 mutex_unlock(&bch_register_lock);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4dbeaaa575bf..8f98ef1038d3 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work)
115 struct cached_dev, 115 struct cached_dev,
116 writeback_rate_update); 116 writeback_rate_update);
117 117
118 /*
119 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
120 * cancel_delayed_work_sync().
121 */
122 set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
123 /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
124 smp_mb();
125
126 if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
127 clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
128 /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
129 smp_mb();
130 return;
131 }
132
118 down_read(&dc->writeback_lock); 133 down_read(&dc->writeback_lock);
119 134
120 if (atomic_read(&dc->has_dirty) && 135 if (atomic_read(&dc->has_dirty) &&
@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work)
123 138
124 up_read(&dc->writeback_lock); 139 up_read(&dc->writeback_lock);
125 140
126 schedule_delayed_work(&dc->writeback_rate_update, 141 if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
142 schedule_delayed_work(&dc->writeback_rate_update,
127 dc->writeback_rate_update_seconds * HZ); 143 dc->writeback_rate_update_seconds * HZ);
144 }
145
146 /*
147 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
148 * cancel_delayed_work_sync().
149 */
150 clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
151 /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
152 smp_mb();
128} 153}
129 154
130static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 155static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
675 dc->writeback_rate_p_term_inverse = 40; 700 dc->writeback_rate_p_term_inverse = 40;
676 dc->writeback_rate_i_term_inverse = 10000; 701 dc->writeback_rate_i_term_inverse = 10000;
677 702
703 WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
678 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 704 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
679} 705}
680 706
@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
693 return PTR_ERR(dc->writeback_thread); 719 return PTR_ERR(dc->writeback_thread);
694 } 720 }
695 721
722 WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
696 schedule_delayed_work(&dc->writeback_rate_update, 723 schedule_delayed_work(&dc->writeback_rate_update,
697 dc->writeback_rate_update_seconds * HZ); 724 dc->writeback_rate_update_seconds * HZ);
698 725