diff options
author | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
---|---|---|
committer | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
commit | cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch) | |
tree | c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers/md/bcache/writeback.c | |
parent | ea6749c705d9e629ed03c7336cc929fc6014b834 (diff) |
bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and
has a bunch of other nifty features motivated by real world usage.
See the wiki at http://bcache.evilpiepirate.org for more.
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Diffstat (limited to 'drivers/md/bcache/writeback.c')
-rw-r--r-- | drivers/md/bcache/writeback.c | 414 |
1 files changed, 414 insertions, 0 deletions
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 000000000000..a80ee5373fd8 --- /dev/null +++ b/drivers/md/bcache/writeback.c | |||
@@ -0,0 +1,414 @@ | |||
1 | /* | ||
2 | * background writeback - scan btree for dirty data and write it to the backing | ||
3 | * device | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | |||
/*
 * Workqueue the writeback closures in this file are continued on; created in
 * bch_writeback_init() and destroyed in bch_writeback_exit().
 */
static struct workqueue_struct *dirty_wq;

/* Forward declaration: refill_dirty() continues into read_dirty(). */
static void read_dirty(struct closure *);
16 | |||
/*
 * State for writing back a single dirty key. One is allocated per key in
 * read_dirty() and freed by dirty_io_destructor().
 */
struct dirty_io {
	/* Runs read_dirty_submit() -> write_dirty() -> write_dirty_finish() */
	struct closure cl;
	/* Cached device the key belongs to */
	struct cached_dev *dc;
	/*
	 * Used first for the read from the cache, then re-initialised for the
	 * write to the backing device. read_dirty() allocates the bio_vecs
	 * directly after this struct and dirty_init() points bi_io_vec at
	 * bi_inline_vecs, so this presumably has to stay the last member.
	 */
	struct bio bio;
};
22 | |||
23 | /* Rate limiting */ | ||
24 | |||
25 | static void __update_writeback_rate(struct cached_dev *dc) | ||
26 | { | ||
27 | struct cache_set *c = dc->disk.c; | ||
28 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | ||
29 | uint64_t cache_dirty_target = | ||
30 | div_u64(cache_sectors * dc->writeback_percent, 100); | ||
31 | |||
32 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | ||
33 | c->cached_dev_sectors); | ||
34 | |||
35 | /* PD controller */ | ||
36 | |||
37 | int change = 0; | ||
38 | int64_t error; | ||
39 | int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); | ||
40 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | ||
41 | |||
42 | dc->disk.sectors_dirty_last = dirty; | ||
43 | |||
44 | derivative *= dc->writeback_rate_d_term; | ||
45 | derivative = clamp(derivative, -dirty, dirty); | ||
46 | |||
47 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | ||
48 | dc->writeback_rate_d_smooth, 0); | ||
49 | |||
50 | /* Avoid divide by zero */ | ||
51 | if (!target) | ||
52 | goto out; | ||
53 | |||
54 | error = div64_s64((dirty + derivative - target) << 8, target); | ||
55 | |||
56 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | ||
57 | dc->writeback_rate_p_term_inverse); | ||
58 | |||
59 | /* Don't increase writeback rate if the device isn't keeping up */ | ||
60 | if (change > 0 && | ||
61 | time_after64(local_clock(), | ||
62 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | ||
63 | change = 0; | ||
64 | |||
65 | dc->writeback_rate.rate = | ||
66 | clamp_t(int64_t, dc->writeback_rate.rate + change, | ||
67 | 1, NSEC_PER_MSEC); | ||
68 | out: | ||
69 | dc->writeback_rate_derivative = derivative; | ||
70 | dc->writeback_rate_change = change; | ||
71 | dc->writeback_rate_target = target; | ||
72 | |||
73 | schedule_delayed_work(&dc->writeback_rate_update, | ||
74 | dc->writeback_rate_update_seconds * HZ); | ||
75 | } | ||
76 | |||
77 | static void update_writeback_rate(struct work_struct *work) | ||
78 | { | ||
79 | struct cached_dev *dc = container_of(to_delayed_work(work), | ||
80 | struct cached_dev, | ||
81 | writeback_rate_update); | ||
82 | |||
83 | down_read(&dc->writeback_lock); | ||
84 | |||
85 | if (atomic_read(&dc->has_dirty) && | ||
86 | dc->writeback_percent) | ||
87 | __update_writeback_rate(dc); | ||
88 | |||
89 | up_read(&dc->writeback_lock); | ||
90 | } | ||
91 | |||
92 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | ||
93 | { | ||
94 | if (atomic_read(&dc->disk.detaching) || | ||
95 | !dc->writeback_percent) | ||
96 | return 0; | ||
97 | |||
98 | return next_delay(&dc->writeback_rate, sectors * 10000000ULL); | ||
99 | } | ||
100 | |||
101 | /* Background writeback */ | ||
102 | |||
103 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | ||
104 | { | ||
105 | return KEY_DIRTY(k); | ||
106 | } | ||
107 | |||
108 | static void dirty_init(struct keybuf_key *w) | ||
109 | { | ||
110 | struct dirty_io *io = w->private; | ||
111 | struct bio *bio = &io->bio; | ||
112 | |||
113 | bio_init(bio); | ||
114 | if (!io->dc->writeback_percent) | ||
115 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
116 | |||
117 | bio->bi_size = KEY_SIZE(&w->key) << 9; | ||
118 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | ||
119 | bio->bi_private = w; | ||
120 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
121 | bio_map(bio, NULL); | ||
122 | } | ||
123 | |||
/*
 * Refill dc->writeback_keys with dirty keys for this device by scanning the
 * btree, then continue into read_dirty() on dirty_wq.
 *
 * Returns from the closure without scanning when writeback is not running
 * (unless the device is detaching), or when the device has no dirty data, in
 * which case the backing device superblock is marked clean first.
 *
 * Note that closure_return() and continue_at() are used as terminal
 * statements: nothing after them runs in this invocation.
 */
static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	/* Scan limit: MAX_KEY with the inode set to this device's id */
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	/* Writeback switched off: stop, unless we are detaching */
	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	/* Nothing dirty: record the clean state in the superblock and stop */
	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	/* The previous scan reached the end; wrap around to the first key */
	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	bch_refill_keybuf(dc->disk.c, buf, &end);

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		/*
		 * A full pass found no dirty keys: clear has_dirty and drop a
		 * reference on dc (presumably the one taken via
		 * atomic_inc(&dc->count) in bch_writeback_add() - confirm).
		 */
		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}
173 | |||
174 | void bch_writeback_queue(struct cached_dev *dc) | ||
175 | { | ||
176 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | ||
177 | if (!atomic_read(&dc->disk.detaching)) | ||
178 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
179 | |||
180 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | ||
181 | } | ||
182 | } | ||
183 | |||
184 | void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | ||
185 | { | ||
186 | atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||
187 | |||
188 | if (!atomic_read(&dc->has_dirty) && | ||
189 | !atomic_xchg(&dc->has_dirty, 1)) { | ||
190 | atomic_inc(&dc->count); | ||
191 | |||
192 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||
193 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||
194 | /* XXX: should do this synchronously */ | ||
195 | bch_write_bdev_super(dc, NULL); | ||
196 | } | ||
197 | |||
198 | bch_writeback_queue(dc); | ||
199 | |||
200 | if (dc->writeback_percent) | ||
201 | schedule_delayed_work(&dc->writeback_rate_update, | ||
202 | dc->writeback_rate_update_seconds * HZ); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | /* Background writeback - IO loop */ | ||
207 | |||
208 | static void dirty_io_destructor(struct closure *cl) | ||
209 | { | ||
210 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
211 | kfree(io); | ||
212 | } | ||
213 | |||
214 | static void write_dirty_finish(struct closure *cl) | ||
215 | { | ||
216 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
217 | struct keybuf_key *w = io->bio.bi_private; | ||
218 | struct cached_dev *dc = io->dc; | ||
219 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | ||
220 | |||
221 | while (bv-- != io->bio.bi_io_vec) | ||
222 | __free_page(bv->bv_page); | ||
223 | |||
224 | /* This is kind of a dumb way of signalling errors. */ | ||
225 | if (KEY_DIRTY(&w->key)) { | ||
226 | unsigned i; | ||
227 | struct btree_op op; | ||
228 | bch_btree_op_init_stack(&op); | ||
229 | |||
230 | op.type = BTREE_REPLACE; | ||
231 | bkey_copy(&op.replace, &w->key); | ||
232 | |||
233 | SET_KEY_DIRTY(&w->key, false); | ||
234 | bch_keylist_add(&op.keys, &w->key); | ||
235 | |||
236 | for (i = 0; i < KEY_PTRS(&w->key); i++) | ||
237 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | ||
238 | |||
239 | pr_debug("clearing %s", pkey(&w->key)); | ||
240 | bch_btree_insert(&op, dc->disk.c); | ||
241 | closure_sync(&op.cl); | ||
242 | |||
243 | atomic_long_inc(op.insert_collision | ||
244 | ? &dc->disk.c->writeback_keys_failed | ||
245 | : &dc->disk.c->writeback_keys_done); | ||
246 | } | ||
247 | |||
248 | bch_keybuf_del(&dc->writeback_keys, w); | ||
249 | atomic_dec_bug(&dc->in_flight); | ||
250 | |||
251 | closure_wake_up(&dc->writeback_wait); | ||
252 | |||
253 | closure_return_with_destructor(cl, dirty_io_destructor); | ||
254 | } | ||
255 | |||
256 | static void dirty_endio(struct bio *bio, int error) | ||
257 | { | ||
258 | struct keybuf_key *w = bio->bi_private; | ||
259 | struct dirty_io *io = w->private; | ||
260 | |||
261 | if (error) | ||
262 | SET_KEY_DIRTY(&w->key, false); | ||
263 | |||
264 | closure_put(&io->cl); | ||
265 | } | ||
266 | |||
267 | static void write_dirty(struct closure *cl) | ||
268 | { | ||
269 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
270 | struct keybuf_key *w = io->bio.bi_private; | ||
271 | |||
272 | dirty_init(w); | ||
273 | io->bio.bi_rw = WRITE; | ||
274 | io->bio.bi_sector = KEY_START(&w->key); | ||
275 | io->bio.bi_bdev = io->dc->bdev; | ||
276 | io->bio.bi_end_io = dirty_endio; | ||
277 | |||
278 | trace_bcache_write_dirty(&io->bio); | ||
279 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
280 | |||
281 | continue_at(cl, write_dirty_finish, dirty_wq); | ||
282 | } | ||
283 | |||
284 | static void read_dirty_endio(struct bio *bio, int error) | ||
285 | { | ||
286 | struct keybuf_key *w = bio->bi_private; | ||
287 | struct dirty_io *io = w->private; | ||
288 | |||
289 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | ||
290 | error, "reading dirty data from cache"); | ||
291 | |||
292 | dirty_endio(bio, error); | ||
293 | } | ||
294 | |||
295 | static void read_dirty_submit(struct closure *cl) | ||
296 | { | ||
297 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
298 | |||
299 | trace_bcache_read_dirty(&io->bio); | ||
300 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
301 | |||
302 | continue_at(cl, write_dirty, dirty_wq); | ||
303 | } | ||
304 | |||
/*
 * Main writeback loop: take dirty keys from dc->writeback_keys and start a
 * read-from-cache / write-to-backing-device sequence for each of them,
 * throttled by the writeback rate limiter and an in-flight limit of 64.
 *
 * When the keybuf runs dry (or an allocation fails) control passes back to
 * refill_dirty(). continue_at() is a terminal statement: nothing after it
 * runs in this invocation.
 */
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	/* Delay still owed to the rate limiter before issuing more IO */
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		/*
		 * We owe a delay. Skip it only for a key that starts exactly
		 * where the previous one ended, and only while the delay is
		 * at most 50ms. Otherwise hand the key back (private = NULL
		 * presumably makes the keybuf return it again - confirm),
		 * sleep, and re-enter from the workqueue.
		 */
		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}

		dc->last_read = KEY_OFFSET(&w->key);

		/* dirty_io plus one trailing bio_vec per page of the key */
		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		/* Set up the read from the cache device of the first pointer */
		dirty_init(w);
		io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
					    &w->key, 0)->bdev;
		io->bio.bi_rw = READ;
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		pr_debug("%s", pkey(&w->key));

		/* Start the per-key closure, with dc->disk.cl as its parent */
		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		/*
		 * Limit in-flight writeback to 64 keys. NOTE(review): a false
		 * return apparently means we had to wait on writeback_wait,
		 * in which case we resume from the workqueue - confirm
		 * against closure_wait_event().
		 */
		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	/* Error labels: only reachable through the gotos above */
	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
}
377 | |||
378 | void bch_writeback_init_cached_dev(struct cached_dev *dc) | ||
379 | { | ||
380 | closure_init_unlocked(&dc->writeback); | ||
381 | init_rwsem(&dc->writeback_lock); | ||
382 | |||
383 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | ||
384 | |||
385 | dc->writeback_metadata = true; | ||
386 | dc->writeback_running = true; | ||
387 | dc->writeback_percent = 10; | ||
388 | dc->writeback_delay = 30; | ||
389 | dc->writeback_rate.rate = 1024; | ||
390 | |||
391 | dc->writeback_rate_update_seconds = 30; | ||
392 | dc->writeback_rate_d_term = 16; | ||
393 | dc->writeback_rate_p_term_inverse = 64; | ||
394 | dc->writeback_rate_d_smooth = 8; | ||
395 | |||
396 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | ||
397 | schedule_delayed_work(&dc->writeback_rate_update, | ||
398 | dc->writeback_rate_update_seconds * HZ); | ||
399 | } | ||
400 | |||
401 | void bch_writeback_exit(void) | ||
402 | { | ||
403 | if (dirty_wq) | ||
404 | destroy_workqueue(dirty_wq); | ||
405 | } | ||
406 | |||
407 | int __init bch_writeback_init(void) | ||
408 | { | ||
409 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | ||
410 | if (!dirty_wq) | ||
411 | return -ENOMEM; | ||
412 | |||
413 | return 0; | ||
414 | } | ||