author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-15 19:33:41 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-15 19:33:41 -0500
commit     f412f2c60b480fa5140a4b4cb321cd48c64e1812
tree       aafd5a5922b43daca4abdfa9bb723fc1f334108d
parent     cd1177f25069cb494680eedd718e7c6d8fd85d10
parent     1cf7e9c68fe84248174e998922b39e508375e7c1
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull second round of block driver updates from Jens Axboe:
 "As mentioned in the original pull request, the bcache bits were pulled
  because of their dependency on the immutable bio vecs.  Kent re-did this
  part and resubmitted it, so here's the 2nd round of (mostly) driver
  updates for 3.13.  It contains:

   - The bcache work from Kent.

   - Conversion of virtio-blk to blk-mq.  This removes the bio and request
     path, and substitutes the blk-mq path instead.  The end result is
     almost 200 deleted lines.  Patch is acked by Asias and Christoph, who
     both did a bunch of testing.

   - A removal of the bootmem.h include from Grygorii Strashko, part of a
     larger series of his killing the dependency on that header file.

   - Removal of __cpuinit from blk-mq from Paul Gortmaker"

* 'for-linus' of git://git.kernel.dk/linux-block: (56 commits)
  virtio_blk: blk-mq support
  blk-mq: remove newly added instances of __cpuinit
  bcache: defensively handle format strings
  bcache: Bypass torture test
  bcache: Delete some slower inline asm
  bcache: Use ida for bcache block dev minor
  bcache: Fix sysfs splat on shutdown with flash only devs
  bcache: Better full stripe scanning
  bcache: Have btree_split() insert into parent directly
  bcache: Move spinlock into struct time_stats
  bcache: Kill sequential_merge option
  bcache: Kill bch_next_recurse_key()
  bcache: Avoid deadlocking in garbage collection
  bcache: Incremental gc
  bcache: Add make_btree_freeing_key()
  bcache: Add btree_node_write_sync()
  bcache: PRECEDING_KEY()
  bcache: bch_(btree|extent)_ptr_invalid()
  bcache: Don't bother with bucket refcount for btree node allocations
  bcache: Debug code improvements
  ...
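For orientation, here is a minimal, illustrative sketch (not part of the patch) of the blk-mq registration pattern that the virtio-blk conversion below follows, using only the 3.13-era hooks visible in the diff; the driver-side names (my_cmd, my_submit, my_queue_rq, my_mq_ops, my_mq_reg) are hypothetical.

/*
 * Illustrative sketch only -- not part of the patch.  The blk-mq hooks are
 * the 3.13-era ones exercised by the virtio_blk conversion below; the
 * driver-side names are hypothetical.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/numa.h>

struct my_cmd {				/* per-request state, see reg.cmd_size */
	struct request *req;
};

static bool my_submit(void *queuedata, struct my_cmd *cmd)
{
	return true;			/* hand the command to (hypothetical) hardware */
}

static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
{
	struct my_cmd *cmd = req->special;	/* preallocated by blk-mq */

	cmd->req = req;
	if (!my_submit(hctx->queue->queuedata, cmd)) {
		/* Out of resources: stop the hw queue and ask blk-mq to retry. */
		blk_mq_stop_hw_queue(hctx);
		return BLK_MQ_RQ_QUEUE_BUSY;
	}
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
	.free_hctx	= blk_mq_free_single_hw_queue,
};

static struct blk_mq_reg my_mq_reg = {
	.ops		= &my_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.cmd_size	= sizeof(struct my_cmd),	/* allocated per request */
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_SHOULD_MERGE,
};

/*
 * In probe: q = blk_mq_init_queue(&my_mq_reg, driver_data);
 * On completion: blk_mq_end_io(req, error);
 * To resume a stopped queue: blk_mq_start_stopped_hw_queues(q);
 */

Returning BLK_MQ_RQ_QUEUE_BUSY after stopping the hardware queue is what replaces the blk_stop_queue()/blk_start_queue() dance of the request-mode path that the patch deletes.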
-rw-r--r--  block/blk-ioc.c                        1
-rw-r--r--  block/blk-mq-cpu.c                     8
-rw-r--r--  block/blk-mq.c                         6
-rw-r--r--  drivers/block/virtio_blk.c           322
-rw-r--r--  drivers/md/bcache/Kconfig             11
-rw-r--r--  drivers/md/bcache/alloc.c            383
-rw-r--r--  drivers/md/bcache/bcache.h           327
-rw-r--r--  drivers/md/bcache/bset.c             289
-rw-r--r--  drivers/md/bcache/bset.h              93
-rw-r--r--  drivers/md/bcache/btree.c           1396
-rw-r--r--  drivers/md/bcache/btree.h            195
-rw-r--r--  drivers/md/bcache/closure.c          103
-rw-r--r--  drivers/md/bcache/closure.h          183
-rw-r--r--  drivers/md/bcache/debug.c            185
-rw-r--r--  drivers/md/bcache/debug.h             50
-rw-r--r--  drivers/md/bcache/journal.c          293
-rw-r--r--  drivers/md/bcache/journal.h           52
-rw-r--r--  drivers/md/bcache/movinggc.c          87
-rw-r--r--  drivers/md/bcache/request.c         1102
-rw-r--r--  drivers/md/bcache/request.h           43
-rw-r--r--  drivers/md/bcache/stats.c             26
-rw-r--r--  drivers/md/bcache/stats.h             13
-rw-r--r--  drivers/md/bcache/super.c            190
-rw-r--r--  drivers/md/bcache/sysfs.c             42
-rw-r--r--  drivers/md/bcache/trace.c              1
-rw-r--r--  drivers/md/bcache/util.c              12
-rw-r--r--  drivers/md/bcache/util.h              15
-rw-r--r--  drivers/md/bcache/writeback.c        455
-rw-r--r--  drivers/md/bcache/writeback.h         46
-rw-r--r--  include/trace/events/bcache.h         47
-rw-r--r--  include/uapi/linux/bcache.h          373
31 files changed, 3069 insertions, 3280 deletions
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 46cd7bd18b34..242df01413f6 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -6,7 +6,6 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/slab.h> 9#include <linux/slab.h>
11 10
12#include "blk.h" 11#include "blk.h"
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index f8ea39d7ae54..0045ace9bdf0 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -13,8 +13,8 @@
13static LIST_HEAD(blk_mq_cpu_notify_list); 13static LIST_HEAD(blk_mq_cpu_notify_list);
14static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); 14static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
15 15
16static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, 16static int blk_mq_main_cpu_notify(struct notifier_block *self,
17 unsigned long action, void *hcpu) 17 unsigned long action, void *hcpu)
18{ 18{
19 unsigned int cpu = (unsigned long) hcpu; 19 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify; 20 struct blk_mq_cpu_notifier *notify;
@@ -28,8 +28,8 @@ static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
28 return NOTIFY_OK; 28 return NOTIFY_OK;
29} 29}
30 30
31static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, 31static void blk_mq_cpu_notify(void *data, unsigned long action,
32 unsigned int cpu) 32 unsigned int cpu)
33{ 33{
34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
35 /* 35 /*
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c661896e2465..862f458d4760 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1444,7 +1444,7 @@ void blk_mq_free_queue(struct request_queue *q)
1444EXPORT_SYMBOL(blk_mq_free_queue); 1444EXPORT_SYMBOL(blk_mq_free_queue);
1445 1445
1446/* Basically redo blk_mq_init_queue with queue frozen */ 1446/* Basically redo blk_mq_init_queue with queue frozen */
1447static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) 1447static void blk_mq_queue_reinit(struct request_queue *q)
1448{ 1448{
1449 blk_mq_freeze_queue(q); 1449 blk_mq_freeze_queue(q);
1450 1450
@@ -1461,8 +1461,8 @@ static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
1461 blk_mq_unfreeze_queue(q); 1461 blk_mq_unfreeze_queue(q);
1462} 1462}
1463 1463
1464static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, 1464static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1465 unsigned long action, void *hcpu) 1465 unsigned long action, void *hcpu)
1466{ 1466{
1467 struct request_queue *q; 1467 struct request_queue *q;
1468 1468
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f3be496ac8fa..588479d58f52 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -11,12 +11,11 @@
11#include <linux/string_helpers.h> 11#include <linux/string_helpers.h>
12#include <scsi/scsi_cmnd.h> 12#include <scsi/scsi_cmnd.h>
13#include <linux/idr.h> 13#include <linux/idr.h>
14#include <linux/blk-mq.h>
15#include <linux/numa.h>
14 16
15#define PART_BITS 4 17#define PART_BITS 4
16 18
17static bool use_bio;
18module_param(use_bio, bool, S_IRUGO);
19
20static int major; 19static int major;
21static DEFINE_IDA(vd_index_ida); 20static DEFINE_IDA(vd_index_ida);
22 21
@@ -26,13 +25,11 @@ struct virtio_blk
26{ 25{
27 struct virtio_device *vdev; 26 struct virtio_device *vdev;
28 struct virtqueue *vq; 27 struct virtqueue *vq;
29 wait_queue_head_t queue_wait; 28 spinlock_t vq_lock;
30 29
31 /* The disk structure for the kernel. */ 30 /* The disk structure for the kernel. */
32 struct gendisk *disk; 31 struct gendisk *disk;
33 32
34 mempool_t *pool;
35
36 /* Process context for config space updates */ 33 /* Process context for config space updates */
37 struct work_struct config_work; 34 struct work_struct config_work;
38 35
@@ -47,31 +44,17 @@ struct virtio_blk
47 44
48 /* Ida index - used to track minor number allocations. */ 45 /* Ida index - used to track minor number allocations. */
49 int index; 46 int index;
50
51 /* Scatterlist: can be too big for stack. */
52 struct scatterlist sg[/*sg_elems*/];
53}; 47};
54 48
55struct virtblk_req 49struct virtblk_req
56{ 50{
57 struct request *req; 51 struct request *req;
58 struct bio *bio;
59 struct virtio_blk_outhdr out_hdr; 52 struct virtio_blk_outhdr out_hdr;
60 struct virtio_scsi_inhdr in_hdr; 53 struct virtio_scsi_inhdr in_hdr;
61 struct work_struct work;
62 struct virtio_blk *vblk;
63 int flags;
64 u8 status; 54 u8 status;
65 struct scatterlist sg[]; 55 struct scatterlist sg[];
66}; 56};
67 57
68enum {
69 VBLK_IS_FLUSH = 1,
70 VBLK_REQ_FLUSH = 2,
71 VBLK_REQ_DATA = 4,
72 VBLK_REQ_FUA = 8,
73};
74
75static inline int virtblk_result(struct virtblk_req *vbr) 58static inline int virtblk_result(struct virtblk_req *vbr)
76{ 59{
77 switch (vbr->status) { 60 switch (vbr->status) {
@@ -84,22 +67,6 @@ static inline int virtblk_result(struct virtblk_req *vbr)
84 } 67 }
85} 68}
86 69
87static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
88 gfp_t gfp_mask)
89{
90 struct virtblk_req *vbr;
91
92 vbr = mempool_alloc(vblk->pool, gfp_mask);
93 if (!vbr)
94 return NULL;
95
96 vbr->vblk = vblk;
97 if (use_bio)
98 sg_init_table(vbr->sg, vblk->sg_elems);
99
100 return vbr;
101}
102
103static int __virtblk_add_req(struct virtqueue *vq, 70static int __virtblk_add_req(struct virtqueue *vq,
104 struct virtblk_req *vbr, 71 struct virtblk_req *vbr,
105 struct scatterlist *data_sg, 72 struct scatterlist *data_sg,
@@ -143,83 +110,8 @@ static int __virtblk_add_req(struct virtqueue *vq,
143 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); 110 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
144} 111}
145 112
146static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
147{
148 struct virtio_blk *vblk = vbr->vblk;
149 DEFINE_WAIT(wait);
150 int ret;
151
152 spin_lock_irq(vblk->disk->queue->queue_lock);
153 while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
154 have_data)) < 0)) {
155 prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
156 TASK_UNINTERRUPTIBLE);
157
158 spin_unlock_irq(vblk->disk->queue->queue_lock);
159 io_schedule();
160 spin_lock_irq(vblk->disk->queue->queue_lock);
161
162 finish_wait(&vblk->queue_wait, &wait);
163 }
164
165 virtqueue_kick(vblk->vq);
166 spin_unlock_irq(vblk->disk->queue->queue_lock);
167}
168
169static void virtblk_bio_send_flush(struct virtblk_req *vbr)
170{
171 vbr->flags |= VBLK_IS_FLUSH;
172 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
173 vbr->out_hdr.sector = 0;
174 vbr->out_hdr.ioprio = 0;
175
176 virtblk_add_req(vbr, false);
177}
178
179static void virtblk_bio_send_data(struct virtblk_req *vbr)
180{
181 struct virtio_blk *vblk = vbr->vblk;
182 struct bio *bio = vbr->bio;
183 bool have_data;
184
185 vbr->flags &= ~VBLK_IS_FLUSH;
186 vbr->out_hdr.type = 0;
187 vbr->out_hdr.sector = bio->bi_sector;
188 vbr->out_hdr.ioprio = bio_prio(bio);
189
190 if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
191 have_data = true;
192 if (bio->bi_rw & REQ_WRITE)
193 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
194 else
195 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
196 } else
197 have_data = false;
198
199 virtblk_add_req(vbr, have_data);
200}
201
202static void virtblk_bio_send_data_work(struct work_struct *work)
203{
204 struct virtblk_req *vbr;
205
206 vbr = container_of(work, struct virtblk_req, work);
207
208 virtblk_bio_send_data(vbr);
209}
210
211static void virtblk_bio_send_flush_work(struct work_struct *work)
212{
213 struct virtblk_req *vbr;
214
215 vbr = container_of(work, struct virtblk_req, work);
216
217 virtblk_bio_send_flush(vbr);
218}
219
220static inline void virtblk_request_done(struct virtblk_req *vbr) 113static inline void virtblk_request_done(struct virtblk_req *vbr)
221{ 114{
222 struct virtio_blk *vblk = vbr->vblk;
223 struct request *req = vbr->req; 115 struct request *req = vbr->req;
224 int error = virtblk_result(vbr); 116 int error = virtblk_result(vbr);
225 117
@@ -231,92 +123,45 @@ static inline void virtblk_request_done(struct virtblk_req *vbr)
231 req->errors = (error != 0); 123 req->errors = (error != 0);
232 } 124 }
233 125
234 __blk_end_request_all(req, error); 126 blk_mq_end_io(req, error);
235 mempool_free(vbr, vblk->pool);
236}
237
238static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
239{
240 struct virtio_blk *vblk = vbr->vblk;
241
242 if (vbr->flags & VBLK_REQ_DATA) {
243 /* Send out the actual write data */
244 INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
245 queue_work(virtblk_wq, &vbr->work);
246 } else {
247 bio_endio(vbr->bio, virtblk_result(vbr));
248 mempool_free(vbr, vblk->pool);
249 }
250}
251
252static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
253{
254 struct virtio_blk *vblk = vbr->vblk;
255
256 if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
257 /* Send out a flush before end the bio */
258 vbr->flags &= ~VBLK_REQ_DATA;
259 INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
260 queue_work(virtblk_wq, &vbr->work);
261 } else {
262 bio_endio(vbr->bio, virtblk_result(vbr));
263 mempool_free(vbr, vblk->pool);
264 }
265}
266
267static inline void virtblk_bio_done(struct virtblk_req *vbr)
268{
269 if (unlikely(vbr->flags & VBLK_IS_FLUSH))
270 virtblk_bio_flush_done(vbr);
271 else
272 virtblk_bio_data_done(vbr);
273} 127}
274 128
275static void virtblk_done(struct virtqueue *vq) 129static void virtblk_done(struct virtqueue *vq)
276{ 130{
277 struct virtio_blk *vblk = vq->vdev->priv; 131 struct virtio_blk *vblk = vq->vdev->priv;
278 bool bio_done = false, req_done = false; 132 bool req_done = false;
279 struct virtblk_req *vbr; 133 struct virtblk_req *vbr;
280 unsigned long flags; 134 unsigned long flags;
281 unsigned int len; 135 unsigned int len;
282 136
283 spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); 137 spin_lock_irqsave(&vblk->vq_lock, flags);
284 do { 138 do {
285 virtqueue_disable_cb(vq); 139 virtqueue_disable_cb(vq);
286 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 140 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
287 if (vbr->bio) { 141 virtblk_request_done(vbr);
288 virtblk_bio_done(vbr); 142 req_done = true;
289 bio_done = true;
290 } else {
291 virtblk_request_done(vbr);
292 req_done = true;
293 }
294 } 143 }
295 if (unlikely(virtqueue_is_broken(vq))) 144 if (unlikely(virtqueue_is_broken(vq)))
296 break; 145 break;
297 } while (!virtqueue_enable_cb(vq)); 146 } while (!virtqueue_enable_cb(vq));
147 spin_unlock_irqrestore(&vblk->vq_lock, flags);
148
298 /* In case queue is stopped waiting for more buffers. */ 149 /* In case queue is stopped waiting for more buffers. */
299 if (req_done) 150 if (req_done)
300 blk_start_queue(vblk->disk->queue); 151 blk_mq_start_stopped_hw_queues(vblk->disk->queue);
301 spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
302
303 if (bio_done)
304 wake_up(&vblk->queue_wait);
305} 152}
306 153
307static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 154static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
308 struct request *req)
309{ 155{
156 struct virtio_blk *vblk = hctx->queue->queuedata;
157 struct virtblk_req *vbr = req->special;
158 unsigned long flags;
310 unsigned int num; 159 unsigned int num;
311 struct virtblk_req *vbr; 160 const bool last = (req->cmd_flags & REQ_END) != 0;
312 161
313 vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); 162 BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
314 if (!vbr)
315 /* When another request finishes we'll try again. */
316 return false;
317 163
318 vbr->req = req; 164 vbr->req = req;
319 vbr->bio = NULL;
320 if (req->cmd_flags & REQ_FLUSH) { 165 if (req->cmd_flags & REQ_FLUSH) {
321 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 166 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
322 vbr->out_hdr.sector = 0; 167 vbr->out_hdr.sector = 0;
@@ -344,7 +189,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
344 } 189 }
345 } 190 }
346 191
347 num = blk_rq_map_sg(q, vbr->req, vblk->sg); 192 num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
348 if (num) { 193 if (num) {
349 if (rq_data_dir(vbr->req) == WRITE) 194 if (rq_data_dir(vbr->req) == WRITE)
350 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 195 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
@@ -352,63 +197,18 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
352 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 197 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
353 } 198 }
354 199
355 if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { 200 spin_lock_irqsave(&vblk->vq_lock, flags);
356 mempool_free(vbr, vblk->pool); 201 if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
357 return false; 202 spin_unlock_irqrestore(&vblk->vq_lock, flags);
358 } 203 blk_mq_stop_hw_queue(hctx);
359
360 return true;
361}
362
363static void virtblk_request(struct request_queue *q)
364{
365 struct virtio_blk *vblk = q->queuedata;
366 struct request *req;
367 unsigned int issued = 0;
368
369 while ((req = blk_peek_request(q)) != NULL) {
370 BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
371
372 /* If this request fails, stop queue and wait for something to
373 finish to restart it. */
374 if (!do_req(q, vblk, req)) {
375 blk_stop_queue(q);
376 break;
377 }
378 blk_start_request(req);
379 issued++;
380 }
381
382 if (issued)
383 virtqueue_kick(vblk->vq); 204 virtqueue_kick(vblk->vq);
384} 205 return BLK_MQ_RQ_QUEUE_BUSY;
385
386static void virtblk_make_request(struct request_queue *q, struct bio *bio)
387{
388 struct virtio_blk *vblk = q->queuedata;
389 struct virtblk_req *vbr;
390
391 BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
392
393 vbr = virtblk_alloc_req(vblk, GFP_NOIO);
394 if (!vbr) {
395 bio_endio(bio, -ENOMEM);
396 return;
397 } 206 }
207 spin_unlock_irqrestore(&vblk->vq_lock, flags);
398 208
399 vbr->bio = bio; 209 if (last)
400 vbr->flags = 0; 210 virtqueue_kick(vblk->vq);
401 if (bio->bi_rw & REQ_FLUSH) 211 return BLK_MQ_RQ_QUEUE_OK;
402 vbr->flags |= VBLK_REQ_FLUSH;
403 if (bio->bi_rw & REQ_FUA)
404 vbr->flags |= VBLK_REQ_FUA;
405 if (bio->bi_size)
406 vbr->flags |= VBLK_REQ_DATA;
407
408 if (unlikely(vbr->flags & VBLK_REQ_FLUSH))
409 virtblk_bio_send_flush(vbr);
410 else
411 virtblk_bio_send_data(vbr);
412} 212}
413 213
414/* return id (s/n) string for *disk to *id_str 214/* return id (s/n) string for *disk to *id_str
@@ -673,12 +473,35 @@ static const struct device_attribute dev_attr_cache_type_rw =
673 __ATTR(cache_type, S_IRUGO|S_IWUSR, 473 __ATTR(cache_type, S_IRUGO|S_IWUSR,
674 virtblk_cache_type_show, virtblk_cache_type_store); 474 virtblk_cache_type_show, virtblk_cache_type_store);
675 475
476static struct blk_mq_ops virtio_mq_ops = {
477 .queue_rq = virtio_queue_rq,
478 .map_queue = blk_mq_map_queue,
479 .alloc_hctx = blk_mq_alloc_single_hw_queue,
480 .free_hctx = blk_mq_free_single_hw_queue,
481};
482
483static struct blk_mq_reg virtio_mq_reg = {
484 .ops = &virtio_mq_ops,
485 .nr_hw_queues = 1,
486 .queue_depth = 64,
487 .numa_node = NUMA_NO_NODE,
488 .flags = BLK_MQ_F_SHOULD_MERGE,
489};
490
491static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
492 struct request *rq, unsigned int nr)
493{
494 struct virtio_blk *vblk = data;
495 struct virtblk_req *vbr = rq->special;
496
497 sg_init_table(vbr->sg, vblk->sg_elems);
498}
499
676static int virtblk_probe(struct virtio_device *vdev) 500static int virtblk_probe(struct virtio_device *vdev)
677{ 501{
678 struct virtio_blk *vblk; 502 struct virtio_blk *vblk;
679 struct request_queue *q; 503 struct request_queue *q;
680 int err, index; 504 int err, index;
681 int pool_size;
682 505
683 u64 cap; 506 u64 cap;
684 u32 v, blk_size, sg_elems, opt_io_size; 507 u32 v, blk_size, sg_elems, opt_io_size;
@@ -702,17 +525,14 @@ static int virtblk_probe(struct virtio_device *vdev)
702 525
703 /* We need an extra sg elements at head and tail. */ 526 /* We need an extra sg elements at head and tail. */
704 sg_elems += 2; 527 sg_elems += 2;
705 vdev->priv = vblk = kmalloc(sizeof(*vblk) + 528 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
706 sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
707 if (!vblk) { 529 if (!vblk) {
708 err = -ENOMEM; 530 err = -ENOMEM;
709 goto out_free_index; 531 goto out_free_index;
710 } 532 }
711 533
712 init_waitqueue_head(&vblk->queue_wait);
713 vblk->vdev = vdev; 534 vblk->vdev = vdev;
714 vblk->sg_elems = sg_elems; 535 vblk->sg_elems = sg_elems;
715 sg_init_table(vblk->sg, vblk->sg_elems);
716 mutex_init(&vblk->config_lock); 536 mutex_init(&vblk->config_lock);
717 537
718 INIT_WORK(&vblk->config_work, virtblk_config_changed_work); 538 INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
@@ -721,31 +541,27 @@ static int virtblk_probe(struct virtio_device *vdev)
721 err = init_vq(vblk); 541 err = init_vq(vblk);
722 if (err) 542 if (err)
723 goto out_free_vblk; 543 goto out_free_vblk;
724 544 spin_lock_init(&vblk->vq_lock);
725 pool_size = sizeof(struct virtblk_req);
726 if (use_bio)
727 pool_size += sizeof(struct scatterlist) * sg_elems;
728 vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
729 if (!vblk->pool) {
730 err = -ENOMEM;
731 goto out_free_vq;
732 }
733 545
734 /* FIXME: How many partitions? How long is a piece of string? */ 546 /* FIXME: How many partitions? How long is a piece of string? */
735 vblk->disk = alloc_disk(1 << PART_BITS); 547 vblk->disk = alloc_disk(1 << PART_BITS);
736 if (!vblk->disk) { 548 if (!vblk->disk) {
737 err = -ENOMEM; 549 err = -ENOMEM;
738 goto out_mempool; 550 goto out_free_vq;
739 } 551 }
740 552
741 q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL); 553 virtio_mq_reg.cmd_size =
554 sizeof(struct virtblk_req) +
555 sizeof(struct scatterlist) * sg_elems;
556
557 q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
742 if (!q) { 558 if (!q) {
743 err = -ENOMEM; 559 err = -ENOMEM;
744 goto out_put_disk; 560 goto out_put_disk;
745 } 561 }
746 562
747 if (use_bio) 563 blk_mq_init_commands(q, virtblk_init_vbr, vblk);
748 blk_queue_make_request(q, virtblk_make_request); 564
749 q->queuedata = vblk; 565 q->queuedata = vblk;
750 566
751 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 567 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -848,8 +664,6 @@ out_del_disk:
848 blk_cleanup_queue(vblk->disk->queue); 664 blk_cleanup_queue(vblk->disk->queue);
849out_put_disk: 665out_put_disk:
850 put_disk(vblk->disk); 666 put_disk(vblk->disk);
851out_mempool:
852 mempool_destroy(vblk->pool);
853out_free_vq: 667out_free_vq:
854 vdev->config->del_vqs(vdev); 668 vdev->config->del_vqs(vdev);
855out_free_vblk: 669out_free_vblk:
@@ -881,7 +695,6 @@ static void virtblk_remove(struct virtio_device *vdev)
881 695
882 refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); 696 refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
883 put_disk(vblk->disk); 697 put_disk(vblk->disk);
884 mempool_destroy(vblk->pool);
885 vdev->config->del_vqs(vdev); 698 vdev->config->del_vqs(vdev);
886 kfree(vblk); 699 kfree(vblk);
887 700
@@ -905,10 +718,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
905 718
906 flush_work(&vblk->config_work); 719 flush_work(&vblk->config_work);
907 720
908 spin_lock_irq(vblk->disk->queue->queue_lock); 721 blk_mq_stop_hw_queues(vblk->disk->queue);
909 blk_stop_queue(vblk->disk->queue);
910 spin_unlock_irq(vblk->disk->queue->queue_lock);
911 blk_sync_queue(vblk->disk->queue);
912 722
913 vdev->config->del_vqs(vdev); 723 vdev->config->del_vqs(vdev);
914 return 0; 724 return 0;
@@ -921,11 +731,9 @@ static int virtblk_restore(struct virtio_device *vdev)
921 731
922 vblk->config_enable = true; 732 vblk->config_enable = true;
923 ret = init_vq(vdev->priv); 733 ret = init_vq(vdev->priv);
924 if (!ret) { 734 if (!ret)
925 spin_lock_irq(vblk->disk->queue->queue_lock); 735 blk_mq_start_stopped_hw_queues(vblk->disk->queue);
926 blk_start_queue(vblk->disk->queue); 736
927 spin_unlock_irq(vblk->disk->queue->queue_lock);
928 }
929 return ret; 737 return ret;
930} 738}
931#endif 739#endif
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f950c9d29f3e..2638417b19aa 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -13,15 +13,8 @@ config BCACHE_DEBUG
13 ---help--- 13 ---help---
14 Don't select this option unless you're a developer 14 Don't select this option unless you're a developer
15 15
16 Enables extra debugging tools (primarily a fuzz tester) 16 Enables extra debugging tools, allows expensive runtime checks to be
17 17 turned on.
18config BCACHE_EDEBUG
19 bool "Extended runtime checks"
20 depends on BCACHE
21 ---help---
22 Don't select this option unless you're a developer
23
24 Enables extra runtime checks which significantly affect performance
25 18
26config BCACHE_CLOSURES_DEBUG 19config BCACHE_CLOSURES_DEBUG
27 bool "Debug closures" 20 bool "Debug closures"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index e45f5575fd4d..2b46bf1d7e40 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,13 +63,12 @@
63#include "bcache.h" 63#include "bcache.h"
64#include "btree.h" 64#include "btree.h"
65 65
66#include <linux/blkdev.h>
66#include <linux/freezer.h> 67#include <linux/freezer.h>
67#include <linux/kthread.h> 68#include <linux/kthread.h>
68#include <linux/random.h> 69#include <linux/random.h>
69#include <trace/events/bcache.h> 70#include <trace/events/bcache.h>
70 71
71#define MAX_IN_FLIGHT_DISCARDS 8U
72
73/* Bucket heap / gen */ 72/* Bucket heap / gen */
74 73
75uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) 74uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
121 mutex_unlock(&c->bucket_lock); 120 mutex_unlock(&c->bucket_lock);
122} 121}
123 122
124/* Discard/TRIM */
125
126struct discard {
127 struct list_head list;
128 struct work_struct work;
129 struct cache *ca;
130 long bucket;
131
132 struct bio bio;
133 struct bio_vec bv;
134};
135
136static void discard_finish(struct work_struct *w)
137{
138 struct discard *d = container_of(w, struct discard, work);
139 struct cache *ca = d->ca;
140 char buf[BDEVNAME_SIZE];
141
142 if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
143 pr_notice("discard error on %s, disabling",
144 bdevname(ca->bdev, buf));
145 d->ca->discard = 0;
146 }
147
148 mutex_lock(&ca->set->bucket_lock);
149
150 fifo_push(&ca->free, d->bucket);
151 list_add(&d->list, &ca->discards);
152 atomic_dec(&ca->discards_in_flight);
153
154 mutex_unlock(&ca->set->bucket_lock);
155
156 closure_wake_up(&ca->set->bucket_wait);
157 wake_up_process(ca->alloc_thread);
158
159 closure_put(&ca->set->cl);
160}
161
162static void discard_endio(struct bio *bio, int error)
163{
164 struct discard *d = container_of(bio, struct discard, bio);
165 schedule_work(&d->work);
166}
167
168static void do_discard(struct cache *ca, long bucket)
169{
170 struct discard *d = list_first_entry(&ca->discards,
171 struct discard, list);
172
173 list_del(&d->list);
174 d->bucket = bucket;
175
176 atomic_inc(&ca->discards_in_flight);
177 closure_get(&ca->set->cl);
178
179 bio_init(&d->bio);
180
181 d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
182 d->bio.bi_bdev = ca->bdev;
183 d->bio.bi_rw = REQ_WRITE|REQ_DISCARD;
184 d->bio.bi_max_vecs = 1;
185 d->bio.bi_io_vec = d->bio.bi_inline_vecs;
186 d->bio.bi_size = bucket_bytes(ca);
187 d->bio.bi_end_io = discard_endio;
188 bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
189
190 submit_bio(0, &d->bio);
191}
192
193/* Allocation */ 123/* Allocation */
194 124
195static inline bool can_inc_bucket_gen(struct bucket *b) 125static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -280,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca)
280 * multiple times when it can't do anything 210 * multiple times when it can't do anything
281 */ 211 */
282 ca->invalidate_needs_gc = 1; 212 ca->invalidate_needs_gc = 1;
283 bch_queue_gc(ca->set); 213 wake_up_gc(ca->set);
284 return; 214 return;
285 } 215 }
286 216
@@ -305,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca)
305 235
306 if (++checked >= ca->sb.nbuckets) { 236 if (++checked >= ca->sb.nbuckets) {
307 ca->invalidate_needs_gc = 1; 237 ca->invalidate_needs_gc = 1;
308 bch_queue_gc(ca->set); 238 wake_up_gc(ca->set);
309 return; 239 return;
310 } 240 }
311 } 241 }
@@ -330,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca)
330 260
331 if (++checked >= ca->sb.nbuckets / 2) { 261 if (++checked >= ca->sb.nbuckets / 2) {
332 ca->invalidate_needs_gc = 1; 262 ca->invalidate_needs_gc = 1;
333 bch_queue_gc(ca->set); 263 wake_up_gc(ca->set);
334 return; 264 return;
335 } 265 }
336 } 266 }
@@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg)
398 else 328 else
399 break; 329 break;
400 330
401 allocator_wait(ca, (int) fifo_free(&ca->free) >
402 atomic_read(&ca->discards_in_flight));
403
404 if (ca->discard) { 331 if (ca->discard) {
405 allocator_wait(ca, !list_empty(&ca->discards)); 332 mutex_unlock(&ca->set->bucket_lock);
406 do_discard(ca, bucket); 333 blkdev_issue_discard(ca->bdev,
407 } else { 334 bucket_to_sector(ca->set, bucket),
408 fifo_push(&ca->free, bucket); 335 ca->sb.block_size, GFP_KERNEL, 0);
409 closure_wake_up(&ca->set->bucket_wait); 336 mutex_lock(&ca->set->bucket_lock);
410 } 337 }
338
339 allocator_wait(ca, !fifo_full(&ca->free));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait);
411 } 343 }
412 344
413 /* 345 /*
@@ -433,16 +365,40 @@ static int bch_allocator_thread(void *arg)
433 } 365 }
434} 366}
435 367
436long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) 368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
437{ 369{
438 long r = -1; 370 DEFINE_WAIT(w);
439again: 371 struct bucket *b;
372 long r;
373
374 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
376 fifo_pop(&ca->free, r);
377 goto out;
378 }
379
380 if (!wait)
381 return -1;
382
383 while (1) {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE);
391
392 mutex_unlock(&ca->set->bucket_lock);
393 schedule();
394 mutex_lock(&ca->set->bucket_lock);
395 }
396
397 finish_wait(&ca->set->bucket_wait, &w);
398out:
440 wake_up_process(ca->alloc_thread); 399 wake_up_process(ca->alloc_thread);
441 400
442 if (fifo_used(&ca->free) > ca->watermark[watermark] && 401 if (expensive_debug_checks(ca->set)) {
443 fifo_pop(&ca->free, r)) {
444 struct bucket *b = ca->buckets + r;
445#ifdef CONFIG_BCACHE_EDEBUG
446 size_t iter; 402 size_t iter;
447 long i; 403 long i;
448 404
@@ -455,36 +411,23 @@ again:
455 BUG_ON(i == r); 411 BUG_ON(i == r);
456 fifo_for_each(i, &ca->unused, iter) 412 fifo_for_each(i, &ca->unused, iter)
457 BUG_ON(i == r); 413 BUG_ON(i == r);
458#endif
459 BUG_ON(atomic_read(&b->pin) != 1);
460
461 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
462
463 if (watermark <= WATERMARK_METADATA) {
464 SET_GC_MARK(b, GC_MARK_METADATA);
465 b->prio = BTREE_PRIO;
466 } else {
467 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
468 b->prio = INITIAL_PRIO;
469 }
470
471 return r;
472 } 414 }
473 415
474 trace_bcache_alloc_fail(ca); 416 b = ca->buckets + r;
475 417
476 if (cl) { 418 BUG_ON(atomic_read(&b->pin) != 1);
477 closure_wait(&ca->set->bucket_wait, cl);
478 419
479 if (closure_blocking(cl)) { 420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
480 mutex_unlock(&ca->set->bucket_lock); 421
481 closure_sync(cl); 422 if (watermark <= WATERMARK_METADATA) {
482 mutex_lock(&ca->set->bucket_lock); 423 SET_GC_MARK(b, GC_MARK_METADATA);
483 goto again; 424 b->prio = BTREE_PRIO;
484 } 425 } else {
426 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
427 b->prio = INITIAL_PRIO;
485 } 428 }
486 429
487 return -1; 430 return r;
488} 431}
489 432
490void bch_bucket_free(struct cache_set *c, struct bkey *k) 433void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -501,7 +444,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
501} 444}
502 445
503int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 446int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
504 struct bkey *k, int n, struct closure *cl) 447 struct bkey *k, int n, bool wait)
505{ 448{
506 int i; 449 int i;
507 450
@@ -514,7 +457,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
514 457
515 for (i = 0; i < n; i++) { 458 for (i = 0; i < n; i++) {
516 struct cache *ca = c->cache_by_alloc[i]; 459 struct cache *ca = c->cache_by_alloc[i];
517 long b = bch_bucket_alloc(ca, watermark, cl); 460 long b = bch_bucket_alloc(ca, watermark, wait);
518 461
519 if (b == -1) 462 if (b == -1)
520 goto err; 463 goto err;
@@ -529,22 +472,202 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
529 return 0; 472 return 0;
530err: 473err:
531 bch_bucket_free(c, k); 474 bch_bucket_free(c, k);
532 __bkey_put(c, k); 475 bkey_put(c, k);
533 return -1; 476 return -1;
534} 477}
535 478
536int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 479int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
537 struct bkey *k, int n, struct closure *cl) 480 struct bkey *k, int n, bool wait)
538{ 481{
539 int ret; 482 int ret;
540 mutex_lock(&c->bucket_lock); 483 mutex_lock(&c->bucket_lock);
541 ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); 484 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
542 mutex_unlock(&c->bucket_lock); 485 mutex_unlock(&c->bucket_lock);
543 return ret; 486 return ret;
544} 487}
545 488
489/* Sector allocator */
490
491struct open_bucket {
492 struct list_head list;
493 unsigned last_write_point;
494 unsigned sectors_free;
495 BKEY_PADDED(key);
496};
497
498/*
499 * We keep multiple buckets open for writes, and try to segregate different
500 * write streams for better cache utilization: first we look for a bucket where
501 * the last write to it was sequential with the current write, and failing that
502 * we look for a bucket that was last used by the same task.
503 *
504 * The ideas is if you've got multiple tasks pulling data into the cache at the
505 * same time, you'll get better cache utilization if you try to segregate their
506 * data and preserve locality.
507 *
508 * For example, say you've starting Firefox at the same time you're copying a
509 * bunch of files. Firefox will likely end up being fairly hot and stay in the
510 * cache awhile, but the data you copied might not be; if you wrote all that
511 * data to the same buckets it'd get invalidated at the same time.
512 *
513 * Both of those tasks will be doing fairly random IO so we can't rely on
514 * detecting sequential IO to segregate their data, but going off of the task
515 * should be a sane heuristic.
516 */
517static struct open_bucket *pick_data_bucket(struct cache_set *c,
518 const struct bkey *search,
519 unsigned write_point,
520 struct bkey *alloc)
521{
522 struct open_bucket *ret, *ret_task = NULL;
523
524 list_for_each_entry_reverse(ret, &c->data_buckets, list)
525 if (!bkey_cmp(&ret->key, search))
526 goto found;
527 else if (ret->last_write_point == write_point)
528 ret_task = ret;
529
530 ret = ret_task ?: list_first_entry(&c->data_buckets,
531 struct open_bucket, list);
532found:
533 if (!ret->sectors_free && KEY_PTRS(alloc)) {
534 ret->sectors_free = c->sb.bucket_size;
535 bkey_copy(&ret->key, alloc);
536 bkey_init(alloc);
537 }
538
539 if (!ret->sectors_free)
540 ret = NULL;
541
542 return ret;
543}
544
545/*
546 * Allocates some space in the cache to write to, and k to point to the newly
547 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
548 * end of the newly allocated space).
549 *
550 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
551 * sectors were actually allocated.
552 *
553 * If s->writeback is true, will not fail.
554 */
555bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
556 unsigned write_point, unsigned write_prio, bool wait)
557{
558 struct open_bucket *b;
559 BKEY_PADDED(key) alloc;
560 unsigned i;
561
562 /*
563 * We might have to allocate a new bucket, which we can't do with a
564 * spinlock held. So if we have to allocate, we drop the lock, allocate
565 * and then retry. KEY_PTRS() indicates whether alloc points to
566 * allocated bucket(s).
567 */
568
569 bkey_init(&alloc.key);
570 spin_lock(&c->data_bucket_lock);
571
572 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
573 unsigned watermark = write_prio
574 ? WATERMARK_MOVINGGC
575 : WATERMARK_NONE;
576
577 spin_unlock(&c->data_bucket_lock);
578
579 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
580 return false;
581
582 spin_lock(&c->data_bucket_lock);
583 }
584
585 /*
586 * If we had to allocate, we might race and not need to allocate the
587 * second time we call find_data_bucket(). If we allocated a bucket but
588 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
589 */
590 if (KEY_PTRS(&alloc.key))
591 bkey_put(c, &alloc.key);
592
593 for (i = 0; i < KEY_PTRS(&b->key); i++)
594 EBUG_ON(ptr_stale(c, &b->key, i));
595
596 /* Set up the pointer to the space we're allocating: */
597
598 for (i = 0; i < KEY_PTRS(&b->key); i++)
599 k->ptr[i] = b->key.ptr[i];
600
601 sectors = min(sectors, b->sectors_free);
602
603 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
604 SET_KEY_SIZE(k, sectors);
605 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
606
607 /*
608 * Move b to the end of the lru, and keep track of what this bucket was
609 * last used for:
610 */
611 list_move_tail(&b->list, &c->data_buckets);
612 bkey_copy_key(&b->key, k);
613 b->last_write_point = write_point;
614
615 b->sectors_free -= sectors;
616
617 for (i = 0; i < KEY_PTRS(&b->key); i++) {
618 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
619
620 atomic_long_add(sectors,
621 &PTR_CACHE(c, &b->key, i)->sectors_written);
622 }
623
624 if (b->sectors_free < c->sb.block_size)
625 b->sectors_free = 0;
626
627 /*
628 * k takes refcounts on the buckets it points to until it's inserted
629 * into the btree, but if we're done with this bucket we just transfer
630 * get_data_bucket()'s refcount.
631 */
632 if (b->sectors_free)
633 for (i = 0; i < KEY_PTRS(&b->key); i++)
634 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
635
636 spin_unlock(&c->data_bucket_lock);
637 return true;
638}
639
546/* Init */ 640/* Init */
547 641
642void bch_open_buckets_free(struct cache_set *c)
643{
644 struct open_bucket *b;
645
646 while (!list_empty(&c->data_buckets)) {
647 b = list_first_entry(&c->data_buckets,
648 struct open_bucket, list);
649 list_del(&b->list);
650 kfree(b);
651 }
652}
653
654int bch_open_buckets_alloc(struct cache_set *c)
655{
656 int i;
657
658 spin_lock_init(&c->data_bucket_lock);
659
660 for (i = 0; i < 6; i++) {
661 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
662 if (!b)
663 return -ENOMEM;
664
665 list_add(&b->list, &c->data_buckets);
666 }
667
668 return 0;
669}
670
548int bch_cache_allocator_start(struct cache *ca) 671int bch_cache_allocator_start(struct cache *ca)
549{ 672{
550 struct task_struct *k = kthread_run(bch_allocator_thread, 673 struct task_struct *k = kthread_run(bch_allocator_thread,
@@ -556,22 +679,8 @@ int bch_cache_allocator_start(struct cache *ca)
556 return 0; 679 return 0;
557} 680}
558 681
559void bch_cache_allocator_exit(struct cache *ca)
560{
561 struct discard *d;
562
563 while (!list_empty(&ca->discards)) {
564 d = list_first_entry(&ca->discards, struct discard, list);
565 cancel_work_sync(&d->work);
566 list_del(&d->list);
567 kfree(d);
568 }
569}
570
571int bch_cache_allocator_init(struct cache *ca) 682int bch_cache_allocator_init(struct cache *ca)
572{ 683{
573 unsigned i;
574
575 /* 684 /*
576 * Reserve: 685 * Reserve:
577 * Prio/gen writes first 686 * Prio/gen writes first
@@ -589,15 +698,5 @@ int bch_cache_allocator_init(struct cache *ca)
589 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 698 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
590 ca->watermark[WATERMARK_MOVINGGC]; 699 ca->watermark[WATERMARK_MOVINGGC];
591 700
592 for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
593 struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
594 if (!d)
595 return -ENOMEM;
596
597 d->ca = ca;
598 INIT_WORK(&d->work, discard_finish);
599 list_add(&d->list, &ca->discards);
600 }
601
602 return 0; 701 return 0;
603} 702}
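For reference, a minimal illustrative sketch (not part of the patch) of the mutex-plus-waitqueue pattern that the rewritten bch_bucket_alloc() above uses in place of closure waitlists; my_pool, pool_take_locked and free_count are hypothetical names, and the producer is assumed to update free_count under the same mutex before calling wake_up().

/* Illustrative sketch only -- not part of the patch. */
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct my_pool {
	struct mutex		lock;
	wait_queue_head_t	wait;
	long			free_count;	/* protected by lock */
};

/* Called with pool->lock held, like bch_bucket_alloc() with bucket_lock. */
static long pool_take_locked(struct my_pool *pool, bool wait)
{
	DEFINE_WAIT(w);

	/* fastpath */
	if (pool->free_count > 0)
		goto out;

	if (!wait)
		return -1;

	while (1) {
		if (pool->free_count > 0)
			break;

		prepare_to_wait(&pool->wait, &w, TASK_UNINTERRUPTIBLE);

		/* Sleep with the lock dropped so the producer can refill. */
		mutex_unlock(&pool->lock);
		schedule();
		mutex_lock(&pool->lock);
	}

	finish_wait(&pool->wait, &w);
out:
	return --pool->free_count;	/* index of the slot we just took */
}

Because the waiter re-checks the condition and calls prepare_to_wait() before dropping the mutex, a producer that refills under the same mutex cannot slip a wake-up in between; that is what lets the patch replace the closure_waitlist bucket_wait in struct cache_set with a plain wait_queue_head_t.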
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0f12382aa35d..4beb55a0ff30 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,6 +177,7 @@
177 177
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bcache.h>
180#include <linux/bio.h> 181#include <linux/bio.h>
181#include <linux/kobject.h> 182#include <linux/kobject.h>
182#include <linux/list.h> 183#include <linux/list.h>
@@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
210#define GC_MARK_METADATA 2 211#define GC_MARK_METADATA 2
211BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); 212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
212 213
213struct bkey {
214 uint64_t high;
215 uint64_t low;
216 uint64_t ptr[];
217};
218
219/* Enough for a key with 6 pointers */
220#define BKEY_PAD 8
221
222#define BKEY_PADDED(key) \
223 union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
224
225/* Version 0: Cache device
226 * Version 1: Backing device
227 * Version 2: Seed pointer into btree node checksum
228 * Version 3: Cache device with new UUID format
229 * Version 4: Backing device with data offset
230 */
231#define BCACHE_SB_VERSION_CDEV 0
232#define BCACHE_SB_VERSION_BDEV 1
233#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
234#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
235#define BCACHE_SB_MAX_VERSION 4
236
237#define SB_SECTOR 8
238#define SB_SIZE 4096
239#define SB_LABEL_SIZE 32
240#define SB_JOURNAL_BUCKETS 256U
241/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
242#define MAX_CACHES_PER_SET 8
243
244#define BDEV_DATA_START_DEFAULT 16 /* sectors */
245
246struct cache_sb {
247 uint64_t csum;
248 uint64_t offset; /* sector where this sb was written */
249 uint64_t version;
250
251 uint8_t magic[16];
252
253 uint8_t uuid[16];
254 union {
255 uint8_t set_uuid[16];
256 uint64_t set_magic;
257 };
258 uint8_t label[SB_LABEL_SIZE];
259
260 uint64_t flags;
261 uint64_t seq;
262 uint64_t pad[8];
263
264 union {
265 struct {
266 /* Cache devices */
267 uint64_t nbuckets; /* device size */
268
269 uint16_t block_size; /* sectors */
270 uint16_t bucket_size; /* sectors */
271
272 uint16_t nr_in_set;
273 uint16_t nr_this_dev;
274 };
275 struct {
276 /* Backing devices */
277 uint64_t data_offset;
278
279 /*
280 * block_size from the cache device section is still used by
281 * backing devices, so don't add anything here until we fix
282 * things to not need it for backing devices anymore
283 */
284 };
285 };
286
287 uint32_t last_mount; /* time_t */
288
289 uint16_t first_bucket;
290 union {
291 uint16_t njournal_buckets;
292 uint16_t keys;
293 };
294 uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
295};
296
297BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
298BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
299BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
300#define CACHE_REPLACEMENT_LRU 0U
301#define CACHE_REPLACEMENT_FIFO 1U
302#define CACHE_REPLACEMENT_RANDOM 2U
303
304BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
305#define CACHE_MODE_WRITETHROUGH 0U
306#define CACHE_MODE_WRITEBACK 1U
307#define CACHE_MODE_WRITEAROUND 2U
308#define CACHE_MODE_NONE 3U
309BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
310#define BDEV_STATE_NONE 0U
311#define BDEV_STATE_CLEAN 1U
312#define BDEV_STATE_DIRTY 2U
313#define BDEV_STATE_STALE 3U
314
315/* Version 1: Seed pointer into btree node checksum
316 */
317#define BCACHE_BSET_VERSION 1
318
319/*
320 * This is the on disk format for btree nodes - a btree node on disk is a list
321 * of these; within each set the keys are sorted
322 */
323struct bset {
324 uint64_t csum;
325 uint64_t magic;
326 uint64_t seq;
327 uint32_t version;
328 uint32_t keys;
329
330 union {
331 struct bkey start[0];
332 uint64_t d[0];
333 };
334};
335
336/*
337 * On disk format for priorities and gens - see super.c near prio_write() for
338 * more.
339 */
340struct prio_set {
341 uint64_t csum;
342 uint64_t magic;
343 uint64_t seq;
344 uint32_t version;
345 uint32_t pad;
346
347 uint64_t next_bucket;
348
349 struct bucket_disk {
350 uint16_t prio;
351 uint8_t gen;
352 } __attribute((packed)) data[];
353};
354
355struct uuid_entry {
356 union {
357 struct {
358 uint8_t uuid[16];
359 uint8_t label[32];
360 uint32_t first_reg;
361 uint32_t last_reg;
362 uint32_t invalidated;
363
364 uint32_t flags;
365 /* Size of flash only volumes */
366 uint64_t sectors;
367 };
368
369 uint8_t pad[128];
370 };
371};
372
373BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
374
375#include "journal.h" 214#include "journal.h"
376#include "stats.h" 215#include "stats.h"
377struct search; 216struct search;
@@ -384,8 +223,6 @@ struct keybuf_key {
384 void *private; 223 void *private;
385}; 224};
386 225
387typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
388
389struct keybuf { 226struct keybuf {
390 struct bkey last_scanned; 227 struct bkey last_scanned;
391 spinlock_t lock; 228 spinlock_t lock;
@@ -400,7 +237,7 @@ struct keybuf {
400 237
401 struct rb_root keys; 238 struct rb_root keys;
402 239
403#define KEYBUF_NR 100 240#define KEYBUF_NR 500
404 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); 241 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
405}; 242};
406 243
@@ -429,16 +266,15 @@ struct bcache_device {
429 266
430 struct gendisk *disk; 267 struct gendisk *disk;
431 268
432 /* If nonzero, we're closing */ 269 unsigned long flags;
433 atomic_t closing; 270#define BCACHE_DEV_CLOSING 0
434 271#define BCACHE_DEV_DETACHING 1
435 /* If nonzero, we're detaching/unregistering from cache set */ 272#define BCACHE_DEV_UNLINK_DONE 2
436 atomic_t detaching;
437 int flush_done;
438 273
439 uint64_t nr_stripes; 274 unsigned nr_stripes;
440 unsigned stripe_size_bits; 275 unsigned stripe_size;
441 atomic_t *stripe_sectors_dirty; 276 atomic_t *stripe_sectors_dirty;
277 unsigned long *full_dirty_stripes;
442 278
443 unsigned long sectors_dirty_last; 279 unsigned long sectors_dirty_last;
444 long sectors_dirty_derivative; 280 long sectors_dirty_derivative;
@@ -509,7 +345,7 @@ struct cached_dev {
509 345
510 /* Limit number of writeback bios in flight */ 346 /* Limit number of writeback bios in flight */
511 struct semaphore in_flight; 347 struct semaphore in_flight;
512 struct closure_with_timer writeback; 348 struct task_struct *writeback_thread;
513 349
514 struct keybuf writeback_keys; 350 struct keybuf writeback_keys;
515 351
@@ -527,8 +363,8 @@ struct cached_dev {
527 unsigned sequential_cutoff; 363 unsigned sequential_cutoff;
528 unsigned readahead; 364 unsigned readahead;
529 365
530 unsigned sequential_merge:1;
531 unsigned verify:1; 366 unsigned verify:1;
367 unsigned bypass_torture_test:1;
532 368
533 unsigned partial_stripes_expensive:1; 369 unsigned partial_stripes_expensive:1;
534 unsigned writeback_metadata:1; 370 unsigned writeback_metadata:1;
@@ -620,15 +456,6 @@ struct cache {
620 456
621 bool discard; /* Get rid of? */ 457 bool discard; /* Get rid of? */
622 458
623 /*
624 * We preallocate structs for issuing discards to buckets, and keep them
625 * on this list when they're not in use; do_discard() issues discards
626 * whenever there's work to do and is called by free_some_buckets() and
627 * when a discard finishes.
628 */
629 atomic_t discards_in_flight;
630 struct list_head discards;
631
632 struct journal_device journal; 459 struct journal_device journal;
633 460
634 /* The rest of this all shows up in sysfs */ 461 /* The rest of this all shows up in sysfs */
@@ -649,7 +476,6 @@ struct gc_stat {
649 476
650 size_t nkeys; 477 size_t nkeys;
651 uint64_t data; /* sectors */ 478 uint64_t data; /* sectors */
652 uint64_t dirty; /* sectors */
653 unsigned in_use; /* percent */ 479 unsigned in_use; /* percent */
654}; 480};
655 481
@@ -744,8 +570,8 @@ struct cache_set {
744 * basically a lock for this that we can wait on asynchronously. The 570 * basically a lock for this that we can wait on asynchronously. The
745 * btree_root() macro releases the lock when it returns. 571 * btree_root() macro releases the lock when it returns.
746 */ 572 */
747 struct closure *try_harder; 573 struct task_struct *try_harder;
748 struct closure_waitlist try_wait; 574 wait_queue_head_t try_wait;
749 uint64_t try_harder_start; 575 uint64_t try_harder_start;
750 576
751 /* 577 /*
@@ -759,7 +585,7 @@ struct cache_set {
759 * written. 585 * written.
760 */ 586 */
761 atomic_t prio_blocked; 587 atomic_t prio_blocked;
762 struct closure_waitlist bucket_wait; 588 wait_queue_head_t bucket_wait;
763 589
764 /* 590 /*
765 * For any bio we don't skip we subtract the number of sectors from 591 * For any bio we don't skip we subtract the number of sectors from
@@ -782,7 +608,7 @@ struct cache_set {
782 struct gc_stat gc_stats; 608 struct gc_stat gc_stats;
783 size_t nbuckets; 609 size_t nbuckets;
784 610
785 struct closure_with_waitlist gc; 611 struct task_struct *gc_thread;
786 /* Where in the btree gc currently is */ 612 /* Where in the btree gc currently is */
787 struct bkey gc_done; 613 struct bkey gc_done;
788 614
@@ -795,11 +621,10 @@ struct cache_set {
795 /* Counts how many sectors bio_insert has added to the cache */ 621 /* Counts how many sectors bio_insert has added to the cache */
796 atomic_t sectors_to_gc; 622 atomic_t sectors_to_gc;
797 623
798 struct closure moving_gc; 624 wait_queue_head_t moving_gc_wait;
799 struct closure_waitlist moving_gc_wait;
800 struct keybuf moving_gc_keys; 625 struct keybuf moving_gc_keys;
801 /* Number of moving GC bios in flight */ 626 /* Number of moving GC bios in flight */
802 atomic_t in_flight; 627 struct semaphore moving_in_flight;
803 628
804 struct btree *root; 629 struct btree *root;
805 630
@@ -841,22 +666,27 @@ struct cache_set {
841 unsigned congested_read_threshold_us; 666 unsigned congested_read_threshold_us;
842 unsigned congested_write_threshold_us; 667 unsigned congested_write_threshold_us;
843 668
844 spinlock_t sort_time_lock;
845 struct time_stats sort_time; 669 struct time_stats sort_time;
846 struct time_stats btree_gc_time; 670 struct time_stats btree_gc_time;
847 struct time_stats btree_split_time; 671 struct time_stats btree_split_time;
848 spinlock_t btree_read_time_lock;
849 struct time_stats btree_read_time; 672 struct time_stats btree_read_time;
850 struct time_stats try_harder_time; 673 struct time_stats try_harder_time;
851 674
852 atomic_long_t cache_read_races; 675 atomic_long_t cache_read_races;
853 atomic_long_t writeback_keys_done; 676 atomic_long_t writeback_keys_done;
854 atomic_long_t writeback_keys_failed; 677 atomic_long_t writeback_keys_failed;
678
679 enum {
680 ON_ERROR_UNREGISTER,
681 ON_ERROR_PANIC,
682 } on_error;
855 unsigned error_limit; 683 unsigned error_limit;
856 unsigned error_decay; 684 unsigned error_decay;
685
857 unsigned short journal_delay_ms; 686 unsigned short journal_delay_ms;
858 unsigned verify:1; 687 unsigned verify:1;
859 unsigned key_merging_disabled:1; 688 unsigned key_merging_disabled:1;
689 unsigned expensive_debug_checks:1;
860 unsigned gc_always_rewrite:1; 690 unsigned gc_always_rewrite:1;
861 unsigned shrinker_disabled:1; 691 unsigned shrinker_disabled:1;
862 unsigned copy_gc_enabled:1; 692 unsigned copy_gc_enabled:1;
@@ -865,21 +695,6 @@ struct cache_set {
865 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; 695 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
866}; 696};
867 697
868static inline bool key_merging_disabled(struct cache_set *c)
869{
870#ifdef CONFIG_BCACHE_DEBUG
871 return c->key_merging_disabled;
872#else
873 return 0;
874#endif
875}
876
877static inline bool SB_IS_BDEV(const struct cache_sb *sb)
878{
879 return sb->version == BCACHE_SB_VERSION_BDEV
880 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
881}
882
883struct bbio { 698struct bbio {
884 unsigned submit_time_us; 699 unsigned submit_time_us;
885 union { 700 union {
@@ -933,59 +748,6 @@ static inline unsigned local_clock_us(void)
933#define prio_buckets(c) \ 748#define prio_buckets(c) \
934 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) 749 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
935 750
936#define JSET_MAGIC 0x245235c1a3625032ULL
937#define PSET_MAGIC 0x6750e15f87337f91ULL
938#define BSET_MAGIC 0x90135c78b99e07f5ULL
939
940#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
941#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
942#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
943
944/* Bkey fields: all units are in sectors */
945
946#define KEY_FIELD(name, field, offset, size) \
947 BITMASK(name, struct bkey, field, offset, size)
948
949#define PTR_FIELD(name, offset, size) \
950 static inline uint64_t name(const struct bkey *k, unsigned i) \
951 { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
952 \
953 static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
954 { \
955 k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
956 k->ptr[i] |= v << offset; \
957 }
958
959KEY_FIELD(KEY_PTRS, high, 60, 3)
960KEY_FIELD(HEADER_SIZE, high, 58, 2)
961KEY_FIELD(KEY_CSUM, high, 56, 2)
962KEY_FIELD(KEY_PINNED, high, 55, 1)
963KEY_FIELD(KEY_DIRTY, high, 36, 1)
964
965KEY_FIELD(KEY_SIZE, high, 20, 16)
966KEY_FIELD(KEY_INODE, high, 0, 20)
967
968/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
969
970static inline uint64_t KEY_OFFSET(const struct bkey *k)
971{
972 return k->low;
973}
974
975static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
976{
977 k->low = v;
978}
979
980PTR_FIELD(PTR_DEV, 51, 12)
981PTR_FIELD(PTR_OFFSET, 8, 43)
982PTR_FIELD(PTR_GEN, 0, 8)
983
984#define PTR_CHECK_DEV ((1 << 12) - 1)
985
986#define PTR(gen, offset, dev) \
987 ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
988
989static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) 751static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
990{ 752{
991 return s >> c->bucket_bits; 753 return s >> c->bucket_bits;
@@ -1024,27 +786,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
1024 786
1025/* Btree key macros */ 787/* Btree key macros */
1026 788
1027/*
1028 * The high bit being set is a relic from when we used it to do binary
1029 * searches - it told you where a key started. It's not used anymore,
1030 * and can probably be safely dropped.
1031 */
1032#define KEY(dev, sector, len) \
1033((struct bkey) { \
1034 .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
1035 .low = (sector) \
1036})
1037
1038static inline void bkey_init(struct bkey *k) 789static inline void bkey_init(struct bkey *k)
1039{ 790{
1040 *k = KEY(0, 0, 0); 791 *k = ZERO_KEY;
1041} 792}
1042 793
1043#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
1044#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
1045#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
1046#define ZERO_KEY KEY(0, 0, 0)
1047
1048/* 794/*
1049 * This is used for various on disk data structures - cache_sb, prio_set, bset, 795 * This is used for various on disk data structures - cache_sb, prio_set, bset,
1050 * jset: The checksum is _always_ the first 8 bytes of these structs 796 * jset: The checksum is _always_ the first 8 bytes of these structs
@@ -1094,14 +840,6 @@ do { \
1094 for (b = (ca)->buckets + (ca)->sb.first_bucket; \ 840 for (b = (ca)->buckets + (ca)->sb.first_bucket; \
1095 b < (ca)->buckets + (ca)->sb.nbuckets; b++) 841 b < (ca)->buckets + (ca)->sb.nbuckets; b++)
1096 842
1097static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1098{
1099 unsigned i;
1100
1101 for (i = 0; i < KEY_PTRS(k); i++)
1102 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1103}
1104
1105static inline void cached_dev_put(struct cached_dev *dc) 843static inline void cached_dev_put(struct cached_dev *dc)
1106{ 844{
1107 if (atomic_dec_and_test(&dc->count)) 845 if (atomic_dec_and_test(&dc->count))
@@ -1173,13 +911,15 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *);
1173void bch_rescale_priorities(struct cache_set *, int); 911void bch_rescale_priorities(struct cache_set *, int);
1174bool bch_bucket_add_unused(struct cache *, struct bucket *); 912bool bch_bucket_add_unused(struct cache *, struct bucket *);
1175 913
1176long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 914long bch_bucket_alloc(struct cache *, unsigned, bool);
1177void bch_bucket_free(struct cache_set *, struct bkey *); 915void bch_bucket_free(struct cache_set *, struct bkey *);
1178 916
1179int __bch_bucket_alloc_set(struct cache_set *, unsigned, 917int __bch_bucket_alloc_set(struct cache_set *, unsigned,
1180 struct bkey *, int, struct closure *); 918 struct bkey *, int, bool);
1181int bch_bucket_alloc_set(struct cache_set *, unsigned, 919int bch_bucket_alloc_set(struct cache_set *, unsigned,
1182 struct bkey *, int, struct closure *); 920 struct bkey *, int, bool);
921bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
922 unsigned, unsigned, bool);
1183 923
1184__printf(2, 3) 924__printf(2, 3)
1185bool bch_cache_set_error(struct cache_set *, const char *, ...); 925bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -1187,7 +927,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...);
1187void bch_prio_write(struct cache *); 927void bch_prio_write(struct cache *);
1188void bch_write_bdev_super(struct cached_dev *, struct closure *); 928void bch_write_bdev_super(struct cached_dev *, struct closure *);
1189 929
1190extern struct workqueue_struct *bcache_wq, *bch_gc_wq; 930extern struct workqueue_struct *bcache_wq;
1191extern const char * const bch_cache_modes[]; 931extern const char * const bch_cache_modes[];
1192extern struct mutex bch_register_lock; 932extern struct mutex bch_register_lock;
1193extern struct list_head bch_cache_sets; 933extern struct list_head bch_cache_sets;
@@ -1220,15 +960,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1220void bch_btree_cache_free(struct cache_set *); 960void bch_btree_cache_free(struct cache_set *);
1221int bch_btree_cache_alloc(struct cache_set *); 961int bch_btree_cache_alloc(struct cache_set *);
1222void bch_moving_init_cache_set(struct cache_set *); 962void bch_moving_init_cache_set(struct cache_set *);
963int bch_open_buckets_alloc(struct cache_set *);
964void bch_open_buckets_free(struct cache_set *);
1223 965
1224int bch_cache_allocator_start(struct cache *ca); 966int bch_cache_allocator_start(struct cache *ca);
1225void bch_cache_allocator_exit(struct cache *ca);
1226int bch_cache_allocator_init(struct cache *ca); 967int bch_cache_allocator_init(struct cache *ca);
1227 968
1228void bch_debug_exit(void); 969void bch_debug_exit(void);
1229int bch_debug_init(struct kobject *); 970int bch_debug_init(struct kobject *);
1230void bch_writeback_exit(void);
1231int bch_writeback_init(void);
1232void bch_request_exit(void); 971void bch_request_exit(void);
1233int bch_request_init(void); 972int bch_request_init(void);
1234void bch_btree_exit(void); 973void bch_btree_exit(void);
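The PTR_FIELD() lines shown above in bcache.h pack each 64-bit pointer word as a 12-bit device index at bit 51, a 43-bit sector offset at bit 8 and an 8-bit generation at bit 0, and the PTR(gen, offset, dev) macro composes one by shifting and OR-ing. A minimal userspace sketch of that packing, using plain shift/mask arithmetic instead of bcache's generated accessors (the field positions come from the PTR_FIELD lines; everything else here is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Field layout per PTR_FIELD(PTR_DEV, 51, 12), (PTR_OFFSET, 8, 43),
 * (PTR_GEN, 0, 8): value = (word >> offset) & ~(~0ULL << size). */
static uint64_t ptr_field(uint64_t p, unsigned offset, unsigned size)
{
        return (p >> offset) & ~(~0ULL << size);
}

/* Compose a pointer word the way the PTR(gen, offset, dev) macro does. */
static uint64_t make_ptr(uint64_t gen, uint64_t offset, uint64_t dev)
{
        return (dev << 51) | (offset << 8) | gen;
}

int main(void)
{
        uint64_t p = make_ptr(3, 123456, 2);

        assert(ptr_field(p, 51, 12) == 2);      /* PTR_DEV    */
        assert(ptr_field(p, 8, 43) == 123456);  /* PTR_OFFSET */
        assert(ptr_field(p, 0, 8) == 3);        /* PTR_GEN    */

        printf("dev %llu offset %llu gen %llu\n",
               (unsigned long long)ptr_field(p, 51, 12),
               (unsigned long long)ptr_field(p, 8, 43),
               (unsigned long long)ptr_field(p, 0, 8));
        return 0;
}

This prints dev 2 offset 123456 gen 3, the same decomposition the generated PTR_DEV()/PTR_OFFSET()/PTR_GEN() accessors perform.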
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 22d1ae72c282..7d388b8bb50e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -14,22 +14,12 @@
14 14
15/* Keylists */ 15/* Keylists */
16 16
17void bch_keylist_copy(struct keylist *dest, struct keylist *src)
18{
19 *dest = *src;
20
21 if (src->list == src->d) {
22 size_t n = (uint64_t *) src->top - src->d;
23 dest->top = (struct bkey *) &dest->d[n];
24 dest->list = dest->d;
25 }
26}
27
28int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
29{ 18{
30 unsigned oldsize = (uint64_t *) l->top - l->list; 19 size_t oldsize = bch_keylist_nkeys(l);
31 unsigned newsize = oldsize + 2 + nptrs; 20 size_t newsize = oldsize + 2 + nptrs;
32 uint64_t *new; 21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys;
33 23
34 /* The journalling code doesn't handle the case where the keys to insert 24 /* The journalling code doesn't handle the case where the keys to insert
35 * is bigger than an empty write: If we just return -ENOMEM here, 25 * is bigger than an empty write: If we just return -ENOMEM here,
@@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
45 roundup_pow_of_two(oldsize) == newsize) 35 roundup_pow_of_two(oldsize) == newsize)
46 return 0; 36 return 0;
47 37
48 new = krealloc(l->list == l->d ? NULL : l->list, 38 new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
49 sizeof(uint64_t) * newsize, GFP_NOIO);
50 39
51 if (!new) 40 if (!new_keys)
52 return -ENOMEM; 41 return -ENOMEM;
53 42
54 if (l->list == l->d) 43 if (!old_keys)
55 memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); 44 memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
56 45
57 l->list = new; 46 l->keys_p = new_keys;
58 l->top = (struct bkey *) (&l->list[oldsize]); 47 l->top_p = new_keys + oldsize;
59 48
60 return 0; 49 return 0;
61} 50}
62 51
63struct bkey *bch_keylist_pop(struct keylist *l) 52struct bkey *bch_keylist_pop(struct keylist *l)
64{ 53{
65 struct bkey *k = l->bottom; 54 struct bkey *k = l->keys;
66 55
67 if (k == l->top) 56 if (k == l->top)
68 return NULL; 57 return NULL;
@@ -73,21 +62,20 @@ struct bkey *bch_keylist_pop(struct keylist *l)
73 return l->top = k; 62 return l->top = k;
74} 63}
75 64
76/* Pointer validation */ 65void bch_keylist_pop_front(struct keylist *l)
77
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{ 66{
80 unsigned i; 67 l->top_p -= bkey_u64s(l->keys);
81 char buf[80];
82 68
83 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 69 memmove(l->keys,
84 goto bad; 70 bkey_next(l->keys),
71 bch_keylist_bytes(l));
72}
85 73
86 if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) 74/* Pointer validation */
87 goto bad;
88 75
89 if (!KEY_SIZE(k)) 76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
90 return true; 77{
78 unsigned i;
91 79
92 for (i = 0; i < KEY_PTRS(k); i++) 80 for (i = 0; i < KEY_PTRS(k); i++)
93 if (ptr_available(c, k, i)) { 81 if (ptr_available(c, k, i)) {
@@ -98,13 +86,83 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
98 if (KEY_SIZE(k) + r > c->sb.bucket_size || 86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
99 bucket < ca->sb.first_bucket || 87 bucket < ca->sb.first_bucket ||
100 bucket >= ca->sb.nbuckets) 88 bucket >= ca->sb.nbuckets)
101 goto bad; 89 return true;
102 } 90 }
103 91
104 return false; 92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
105bad: 126bad:
106 bch_bkey_to_text(buf, sizeof(buf), k); 127 bch_bkey_to_text(buf, sizeof(buf), k);
107 cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); 128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
108 return true; 166 return true;
109} 167}
110 168
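bch_btree_ptr_invalid() and bch_extent_ptr_invalid() above split the old level-dependent __bch_ptr_invalid() into the two checks it was really multiplexing: a btree-node pointer must carry pointers and a size and must never be dirty, while an extent only needs a non-zero size that does not exceed its offset (a bcache key names the end of the range it covers). A compressed restatement of those two invariant sets over a simplified key, leaving out the per-device bucket-range checks that __ptr_invalid() performs; the struct and field names here are illustrative, not bcache's:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for struct bkey: just the fields these checks read. */
struct skey {
        uint64_t offset;        /* end of the range the key covers      */
        unsigned size;          /* sectors covered                      */
        unsigned nr_ptrs;       /* number of backing pointers           */
        bool dirty;             /* data newer than the backing device   */
};

/* Btree-node pointers: must point somewhere, must cover something,
 * and metadata is never dirty. */
static bool btree_ptr_invalid(const struct skey *k)
{
        return !k->nr_ptrs || !k->size || k->dirty;
}

/* Extents: zero-size keys are invalid, and since the key names the end
 * of its range, the size can never exceed the offset. */
static bool extent_ptr_invalid(const struct skey *k)
{
        return !k->size || k->size > k->offset;
}

int main(void)
{
        struct skey meta = { .offset = 4096, .size = 8, .nr_ptrs = 1 };
        struct skey hole = { .offset = 100, .size = 0, .nr_ptrs = 1 };

        assert(!btree_ptr_invalid(&meta));
        assert(extent_ptr_invalid(&hole)); /* zero-size extents are invalid */
        return 0;
}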
@@ -118,64 +176,29 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
118 bch_ptr_invalid(b, k)) 176 bch_ptr_invalid(b, k))
119 return true; 177 return true;
120 178
121 if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) 179 for (i = 0; i < KEY_PTRS(k); i++) {
122 return true; 180 if (!ptr_available(b->c, k, i))
181 return true;
123 182
124 for (i = 0; i < KEY_PTRS(k); i++) 183 g = PTR_BUCKET(b->c, k, i);
125 if (ptr_available(b->c, k, i)) { 184 stale = ptr_stale(b->c, k, i);
126 g = PTR_BUCKET(b->c, k, i);
127 stale = ptr_stale(b->c, k, i);
128 185
129 btree_bug_on(stale > 96, b, 186 btree_bug_on(stale > 96, b,
130 "key too stale: %i, need_gc %u", 187 "key too stale: %i, need_gc %u",
131 stale, b->c->need_gc); 188 stale, b->c->need_gc);
132 189
133 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
134 b, "stale dirty pointer"); 191 b, "stale dirty pointer");
135 192
136 if (stale) 193 if (stale)
137 return true; 194 return true;
138 195
139#ifdef CONFIG_BCACHE_EDEBUG 196 if (expensive_debug_checks(b->c) &&
140 if (!mutex_trylock(&b->c->bucket_lock)) 197 ptr_bad_expensive_checks(b, k, i))
141 continue; 198 return true;
142 199 }
143 if (b->level) {
144 if (KEY_DIRTY(k) ||
145 g->prio != BTREE_PRIO ||
146 (b->c->gc_mark_valid &&
147 GC_MARK(g) != GC_MARK_METADATA))
148 goto bug;
149
150 } else {
151 if (g->prio == BTREE_PRIO)
152 goto bug;
153
154 if (KEY_DIRTY(k) &&
155 b->c->gc_mark_valid &&
156 GC_MARK(g) != GC_MARK_DIRTY)
157 goto bug;
158 }
159 mutex_unlock(&b->c->bucket_lock);
160#endif
161 }
162 200
163 return false; 201 return false;
164#ifdef CONFIG_BCACHE_EDEBUG
165bug:
166 mutex_unlock(&b->c->bucket_lock);
167
168 {
169 char buf[80];
170
171 bch_bkey_to_text(buf, sizeof(buf), k);
172 btree_bug(b,
173"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
174 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
175 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
176 }
177 return true;
178#endif
179} 202}
180 203
181/* Key/pointer manipulation */ 204/* Key/pointer manipulation */
@@ -458,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
458 481
459static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) 482static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
460{ 483{
461#ifdef CONFIG_X86_64
462 asm("shrd %[shift],%[high],%[low]"
463 : [low] "+Rm" (low)
464 : [high] "R" (high),
465 [shift] "ci" (shift)
466 : "cc");
467#else
468 low >>= shift; 484 low >>= shift;
469 low |= (high << 1) << (63U - shift); 485 low |= (high << 1) << (63U - shift);
470#endif
471 return low; 486 return low;
472} 487}
473 488
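The x86 shrd asm deleted above was only a hand-tuned version of the C fallback that remains: shrd128() treats (high, low) as one 128-bit value and returns the low 64 bits of that value shifted right by shift. Writing the high half as (high << 1) << (63 - shift) sidesteps the undefined 64-bit shift that a plain high << (64 - shift) would hit at shift == 0. A standalone check of that identity against the compiler's 128-bit integer extension, assuming (as the lookup-table callers do) that shift stays below 64:

#include <assert.h>
#include <stdint.h>

/* Portable 128-bit funnel shift: the low 64 bits of ((high:low) >> shift)
 * for 0 <= shift <= 63, written as two shifts so shift == 0 never produces
 * an undefined 64-bit shift. */
static uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
        low >>= shift;
        low |= (high << 1) << (63U - shift);
        return low;
}

int main(void)
{
        uint64_t high = 0x0123456789abcdefULL, low = 0xfedcba9876543210ULL;

        for (unsigned shift = 0; shift < 64; shift++) {
                unsigned __int128 v = ((unsigned __int128)high << 64) | low;

                assert(shrd128(high, low, shift) == (uint64_t)(v >> shift));
        }
        return 0;
}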
@@ -686,7 +701,7 @@ void bch_bset_init_next(struct btree *b)
686 } else 701 } else
687 get_random_bytes(&i->seq, sizeof(uint64_t)); 702 get_random_bytes(&i->seq, sizeof(uint64_t));
688 703
689 i->magic = bset_magic(b->c); 704 i->magic = bset_magic(&b->c->sb);
690 i->version = 0; 705 i->version = 0;
691 i->keys = 0; 706 i->keys = 0;
692 707
@@ -824,16 +839,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
824 } else 839 } else
825 i = bset_search_write_set(b, t, search); 840 i = bset_search_write_set(b, t, search);
826 841
827#ifdef CONFIG_BCACHE_EDEBUG 842 if (expensive_debug_checks(b->c)) {
828 BUG_ON(bset_written(b, t) && 843 BUG_ON(bset_written(b, t) &&
829 i.l != t->data->start && 844 i.l != t->data->start &&
830 bkey_cmp(tree_to_prev_bkey(t, 845 bkey_cmp(tree_to_prev_bkey(t,
831 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
832 search) > 0); 847 search) > 0);
833 848
834 BUG_ON(i.r != end(t->data) && 849 BUG_ON(i.r != end(t->data) &&
835 bkey_cmp(i.r, search) <= 0); 850 bkey_cmp(i.r, search) <= 0);
836#endif 851 }
837 852
838 while (likely(i.l != i.r) && 853 while (likely(i.l != i.r) &&
839 bkey_cmp(i.l, search) <= 0) 854 bkey_cmp(i.l, search) <= 0)
@@ -844,6 +859,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
844 859
845/* Btree iterator */ 860/* Btree iterator */
846 861
862/*
863 * Returns true if l > r - unless l == r, in which case returns true if l is
864 * older than r.
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
847static inline bool btree_iter_cmp(struct btree_iter_set l, 869static inline bool btree_iter_cmp(struct btree_iter_set l,
848 struct btree_iter_set r) 870 struct btree_iter_set r)
849{ 871{
@@ -867,12 +889,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
867} 889}
868 890
869struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
870 struct bkey *search, struct bset_tree *start) 892 struct bkey *search, struct bset_tree *start)
871{ 893{
872 struct bkey *ret = NULL; 894 struct bkey *ret = NULL;
873 iter->size = ARRAY_SIZE(iter->data); 895 iter->size = ARRAY_SIZE(iter->data);
874 iter->used = 0; 896 iter->used = 0;
875 897
898#ifdef CONFIG_BCACHE_DEBUG
899 iter->b = b;
900#endif
901
876 for (; start <= &b->sets[b->nsets]; start++) { 902 for (; start <= &b->sets[b->nsets]; start++) {
877 ret = bch_bset_search(b, start, search); 903 ret = bch_bset_search(b, start, search);
878 bch_btree_iter_push(iter, ret, end(start->data)); 904 bch_btree_iter_push(iter, ret, end(start->data));
@@ -887,6 +913,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
887 struct bkey *ret = NULL; 913 struct bkey *ret = NULL;
888 914
889 if (!btree_iter_end(iter)) { 915 if (!btree_iter_end(iter)) {
916 bch_btree_iter_next_check(iter);
917
890 ret = iter->data->k; 918 ret = iter->data->k;
891 iter->data->k = bkey_next(iter->data->k); 919 iter->data->k = bkey_next(iter->data->k);
892 920
@@ -916,14 +944,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
916 return ret; 944 return ret;
917} 945}
918 946
919struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
920{
921 struct btree_iter iter;
922
923 bch_btree_iter_init(b, &iter, search);
924 return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
925}
926
927/* Mergesort */ 947/* Mergesort */
928 948
929static void sort_key_next(struct btree_iter *iter, 949static void sort_key_next(struct btree_iter *iter,
@@ -998,7 +1018,6 @@ static void btree_mergesort(struct btree *b, struct bset *out,
998 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; 1018 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
999 1019
1000 pr_debug("sorted %i keys", out->keys); 1020 pr_debug("sorted %i keys", out->keys);
1001 bch_check_key_order(b, out);
1002} 1021}
1003 1022
1004static void __btree_sort(struct btree *b, struct btree_iter *iter, 1023static void __btree_sort(struct btree *b, struct btree_iter *iter,
@@ -1029,7 +1048,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1029 * memcpy() 1048 * memcpy()
1030 */ 1049 */
1031 1050
1032 out->magic = bset_magic(b->c); 1051 out->magic = bset_magic(&b->c->sb);
1033 out->seq = b->sets[0].data->seq; 1052 out->seq = b->sets[0].data->seq;
1034 out->version = b->sets[0].data->version; 1053 out->version = b->sets[0].data->version;
1035 swap(out, b->sets[0].data); 1054 swap(out, b->sets[0].data);
@@ -1050,24 +1069,21 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1050 if (b->written) 1069 if (b->written)
1051 bset_build_written_tree(b); 1070 bset_build_written_tree(b);
1052 1071
1053 if (!start) { 1072 if (!start)
1054 spin_lock(&b->c->sort_time_lock);
1055 bch_time_stats_update(&b->c->sort_time, start_time); 1073 bch_time_stats_update(&b->c->sort_time, start_time);
1056 spin_unlock(&b->c->sort_time_lock);
1057 }
1058} 1074}
1059 1075
1060void bch_btree_sort_partial(struct btree *b, unsigned start) 1076void bch_btree_sort_partial(struct btree *b, unsigned start)
1061{ 1077{
1062 size_t oldsize = 0, order = b->page_order, keys = 0; 1078 size_t order = b->page_order, keys = 0;
1063 struct btree_iter iter; 1079 struct btree_iter iter;
1080 int oldsize = bch_count_data(b);
1081
1064 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
1065 1083
1066 BUG_ON(b->sets[b->nsets].data == write_block(b) && 1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1067 (b->sets[b->nsets].size || b->nsets)); 1085 (b->sets[b->nsets].size || b->nsets));
1068 1086
1069 if (b->written)
1070 oldsize = bch_count_data(b);
1071 1087
1072 if (start) { 1088 if (start) {
1073 unsigned i; 1089 unsigned i;
@@ -1083,7 +1099,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start)
1083 1099
1084 __btree_sort(b, &iter, start, order, false); 1100 __btree_sort(b, &iter, start, order, false);
1085 1101
1086 EBUG_ON(b->written && bch_count_data(b) != oldsize); 1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
1087} 1103}
1088 1104
1089void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
@@ -1101,9 +1117,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
1101 1117
1102 btree_mergesort(b, new->sets->data, &iter, false, true); 1118 btree_mergesort(b, new->sets->data, &iter, false, true);
1103 1119
1104 spin_lock(&b->c->sort_time_lock);
1105 bch_time_stats_update(&b->c->sort_time, start_time); 1120 bch_time_stats_update(&b->c->sort_time, start_time);
1106 spin_unlock(&b->c->sort_time_lock);
1107 1121
1108 bkey_copy_key(&new->key, &b->key); 1122 bkey_copy_key(&new->key, &b->key);
1109 new->sets->size = 0; 1123 new->sets->size = 0;
@@ -1148,16 +1162,16 @@ out:
1148/* Sysfs stuff */ 1162/* Sysfs stuff */
1149 1163
1150struct bset_stats { 1164struct bset_stats {
1165 struct btree_op op;
1151 size_t nodes; 1166 size_t nodes;
1152 size_t sets_written, sets_unwritten; 1167 size_t sets_written, sets_unwritten;
1153 size_t bytes_written, bytes_unwritten; 1168 size_t bytes_written, bytes_unwritten;
1154 size_t floats, failed; 1169 size_t floats, failed;
1155}; 1170};
1156 1171
1157static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, 1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1158 struct bset_stats *stats)
1159{ 1173{
1160 struct bkey *k; 1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1161 unsigned i; 1175 unsigned i;
1162 1176
1163 stats->nodes++; 1177 stats->nodes++;
@@ -1182,30 +1196,19 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
1182 } 1196 }
1183 } 1197 }
1184 1198
1185 if (b->level) { 1199 return MAP_CONTINUE;
1186 struct btree_iter iter;
1187
1188 for_each_key_filter(b, k, &iter, bch_ptr_bad) {
1189 int ret = btree(bset_stats, k, b, op, stats);
1190 if (ret)
1191 return ret;
1192 }
1193 }
1194
1195 return 0;
1196} 1200}
1197 1201
1198int bch_bset_print_stats(struct cache_set *c, char *buf) 1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1199{ 1203{
1200 struct btree_op op;
1201 struct bset_stats t; 1204 struct bset_stats t;
1202 int ret; 1205 int ret;
1203 1206
1204 bch_btree_op_init_stack(&op);
1205 memset(&t, 0, sizeof(struct bset_stats)); 1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1206 1209
1207 ret = btree_root(bset_stats, c, &op, &t); 1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1208 if (ret) 1211 if (ret < 0)
1209 return ret; 1212 return ret;
1210 1213
1211 return snprintf(buf, PAGE_SIZE, 1214 return snprintf(buf, PAGE_SIZE,
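The rewritten bch_bset_print_stats() above illustrates the convention the new btree map functions rely on: the caller embeds a struct btree_op as the first member of its own state (struct bset_stats here), hands &t.op to bch_btree_map_nodes(), and the per-node callback climbs back to the full state with container_of(). A minimal userspace sketch of that idiom, with a stub walker standing in for the btree traversal (every name except container_of is illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct walk_op {                        /* stand-in for struct btree_op   */
        int lock;
};

struct node_stats {                     /* stand-in for struct bset_stats */
        struct walk_op op;              /* must be embedded, not a pointer */
        size_t nodes;
};

typedef int (*node_fn)(struct walk_op *op, int node);

/* Stub for bch_btree_map_nodes(): visit a few fake "nodes". */
static int map_nodes(struct walk_op *op, node_fn fn)
{
        for (int n = 0; n < 4; n++) {
                int ret = fn(op, n);
                if (ret < 0)
                        return ret;
        }
        return 0;
}

/* The callback only receives the embedded op, so it climbs back out. */
static int count_node(struct walk_op *op, int node)
{
        struct node_stats *stats = container_of(op, struct node_stats, op);

        (void)node;
        stats->nodes++;
        return 0;       /* keep going; the real callback returns MAP_CONTINUE */
}

int main(void)
{
        struct node_stats t = { .op = { .lock = -1 } };

        map_nodes(&t.op, count_node);
        printf("visited %zu nodes\n", t.nodes);
        return 0;
}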
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index ae115a253d73..1d3c24f9fa0e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -148,6 +148,9 @@
148 148
149struct btree_iter { 149struct btree_iter {
150 size_t size, used; 150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
151 struct btree_iter_set { 154 struct btree_iter_set {
152 struct bkey *k, *end; 155 struct bkey *k, *end;
153 } data[MAX_BSETS]; 156 } data[MAX_BSETS];
@@ -193,54 +196,26 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
193 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
194} 197}
195 198
196static inline size_t bkey_u64s(const struct bkey *k)
197{
198 BUG_ON(KEY_CSUM(k) > 1);
199 return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
200}
201
202static inline size_t bkey_bytes(const struct bkey *k)
203{
204 return bkey_u64s(k) * sizeof(uint64_t);
205}
206
207static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
208{
209 memcpy(dest, src, bkey_bytes(src));
210}
211
212static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
213{
214 if (!src)
215 src = &KEY(0, 0, 0);
216
217 SET_KEY_INODE(dest, KEY_INODE(src));
218 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
219}
220
221static inline struct bkey *bkey_next(const struct bkey *k)
222{
223 uint64_t *d = (void *) k;
224 return (struct bkey *) (d + bkey_u64s(k));
225}
226
227/* Keylists */ 199/* Keylists */
228 200
229struct keylist { 201struct keylist {
230 struct bkey *top;
231 union { 202 union {
232 uint64_t *list; 203 struct bkey *keys;
233 struct bkey *bottom; 204 uint64_t *keys_p;
205 };
206 union {
207 struct bkey *top;
208 uint64_t *top_p;
234 }; 209 };
235 210
236 /* Enough room for btree_split's keys without realloc */ 211 /* Enough room for btree_split's keys without realloc */
237#define KEYLIST_INLINE 16 212#define KEYLIST_INLINE 16
238 uint64_t d[KEYLIST_INLINE]; 213 uint64_t inline_keys[KEYLIST_INLINE];
239}; 214};
240 215
241static inline void bch_keylist_init(struct keylist *l) 216static inline void bch_keylist_init(struct keylist *l)
242{ 217{
243 l->top = (void *) (l->list = l->d); 218 l->top_p = l->keys_p = l->inline_keys;
244} 219}
245 220
246static inline void bch_keylist_push(struct keylist *l) 221static inline void bch_keylist_push(struct keylist *l)
@@ -256,17 +231,32 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
256 231
257static inline bool bch_keylist_empty(struct keylist *l) 232static inline bool bch_keylist_empty(struct keylist *l)
258{ 233{
259 return l->top == (void *) l->list; 234 return l->top == l->keys;
235}
236
237static inline void bch_keylist_reset(struct keylist *l)
238{
239 l->top = l->keys;
260} 240}
261 241
262static inline void bch_keylist_free(struct keylist *l) 242static inline void bch_keylist_free(struct keylist *l)
263{ 243{
264 if (l->list != l->d) 244 if (l->keys_p != l->inline_keys)
265 kfree(l->list); 245 kfree(l->keys_p);
246}
247
248static inline size_t bch_keylist_nkeys(struct keylist *l)
249{
250 return l->top_p - l->keys_p;
251}
252
253static inline size_t bch_keylist_bytes(struct keylist *l)
254{
255 return bch_keylist_nkeys(l) * sizeof(uint64_t);
266} 256}
267 257
268void bch_keylist_copy(struct keylist *, struct keylist *);
269struct bkey *bch_keylist_pop(struct keylist *); 258struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *);
270int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 260int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
271 261
272void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, 262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
@@ -287,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
287} 277}
288 278
289const char *bch_ptr_status(struct cache_set *, const struct bkey *); 279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
290bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); 280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
291bool bch_ptr_bad(struct btree *, const struct bkey *); 283bool bch_ptr_bad(struct btree *, const struct bkey *);
292 284
293static inline uint8_t gen_after(uint8_t a, uint8_t b) 285static inline uint8_t gen_after(uint8_t a, uint8_t b)
@@ -311,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
311 303
312typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); 304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
313 305
314struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
315struct bkey *bch_btree_iter_next(struct btree_iter *); 306struct bkey *bch_btree_iter_next(struct btree_iter *);
316struct bkey *bch_btree_iter_next_filter(struct btree_iter *, 307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
317 struct btree *, ptr_filter_fn); 308 struct btree *, ptr_filter_fn);
@@ -361,12 +352,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
361struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
362 const struct bkey *); 353 const struct bkey *);
363 354
355/*
356 * Returns the first key that is strictly greater than search
357 */
364static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, 358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
365 const struct bkey *search) 359 const struct bkey *search)
366{ 360{
367 return search ? __bch_bset_search(b, t, search) : t->data->start; 361 return search ? __bch_bset_search(b, t, search) : t->data->start;
368} 362}
369 363
364#define PRECEDING_KEY(_k) \
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
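PRECEDING_KEY(_k) builds the key that sorts immediately before _k. Keys order by (inode, offset), with the inode in the high word and the offset in the low word, so "one before" is a 128-bit decrement with borrow, and the macro returns NULL for the zero key, which has no predecessor. Because the real high word also carries size, dirty and pointer-count bits, the macro first rebuilds a clean key with KEY(inode, offset, 0) and only then decrements. A worked sketch of the borrow step alone:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* A key reduced to its ordering fields: inode in the high word, offset
 * (a sector count) in the low word, compared lexicographically. */
struct pkey {
        uint64_t inode, offset;
};

/* Compute the key immediately preceding k, as PRECEDING_KEY() does:
 * decrement the 128-bit (inode, offset) value with borrow.  Returns
 * false for the zero key (the macro returns NULL there). */
static bool preceding_key(struct pkey k, struct pkey *ret)
{
        if (!k.inode && !k.offset)
                return false;

        if (!k.offset)
                k.inode--;      /* borrow: (i, 0) - 1 = (i - 1, ~0) */
        k.offset--;

        *ret = k;
        return true;
}

int main(void)
{
        struct pkey p;

        assert(preceding_key((struct pkey){ 1, 0 }, &p) &&
               p.inode == 0 && p.offset == UINT64_MAX);
        assert(preceding_key((struct pkey){ 5, 100 }, &p) &&
               p.inode == 5 && p.offset == 99);
        assert(!preceding_key((struct pkey){ 0, 0 }, &p));
        return 0;
}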
370bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); 379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
371void bch_btree_sort_lazy(struct btree *); 380void bch_btree_sort_lazy(struct btree *);
372void bch_btree_sort_into(struct btree *, struct btree *); 381void bch_btree_sort_into(struct btree *, struct btree *);
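The reworked struct keylist drops the separate list/bottom pointers: keys_p/keys alias the start of one flat uint64_t buffer (inline_keys until bch_keylist_realloc() spills it to the heap), top_p/top alias one past the last key, and the helpers reduce to pointer arithmetic: bch_keylist_nkeys() is top_p - keys_p, bch_keylist_bytes() multiplies that by eight, and bch_keylist_pop_front() memmoves everything after the first key down. A small userspace model of the same layout, with a toy key whose first u64 stores its own length in u64s (loosely mirroring bkey_u64s()/bkey_next(); all names here are illustrative):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define INLINE_U64S     16

struct klist {
        uint64_t *keys_p;       /* start of the flat buffer */
        uint64_t *top_p;        /* one past the last key    */
        uint64_t inline_keys[INLINE_U64S];
};

static void klist_init(struct klist *l)
{
        l->keys_p = l->top_p = l->inline_keys;
}

static size_t klist_nkeys_u64s(struct klist *l)
{
        return l->top_p - l->keys_p;            /* bch_keylist_nkeys() */
}

static size_t klist_bytes(struct klist *l)
{
        return klist_nkeys_u64s(l) * sizeof(uint64_t);
}

/* Append a key with one payload word: [2, payload]. */
static void klist_add(struct klist *l, uint64_t payload)
{
        assert(klist_nkeys_u64s(l) + 2 <= INLINE_U64S);
        *l->top_p++ = 2;
        *l->top_p++ = payload;
}

/* Drop the first key by sliding the rest down, as bch_keylist_pop_front()
 * does with memmove(). */
static void klist_pop_front(struct klist *l)
{
        size_t first = l->keys_p[0];            /* length of first key */

        l->top_p -= first;
        memmove(l->keys_p, l->keys_p + first, klist_bytes(l));
}

int main(void)
{
        struct klist l;

        klist_init(&l);
        klist_add(&l, 100);
        klist_add(&l, 200);
        assert(klist_nkeys_u64s(&l) == 4);

        klist_pop_front(&l);
        assert(klist_nkeys_u64s(&l) == 2 && l.keys_p[1] == 200);
        return 0;
}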
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f42fc7ed9cd6..5e2765aadce1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,12 +23,13 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "request.h"
27#include "writeback.h" 26#include "writeback.h"
28 27
29#include <linux/slab.h> 28#include <linux/slab.h>
30#include <linux/bitops.h> 29#include <linux/bitops.h>
30#include <linux/freezer.h>
31#include <linux/hash.h> 31#include <linux/hash.h>
32#include <linux/kthread.h>
32#include <linux/prefetch.h> 33#include <linux/prefetch.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
@@ -88,15 +89,13 @@
88 * Test module load/unload 89 * Test module load/unload
89 */ 90 */
90 91
91static const char * const op_types[] = { 92enum {
92 "insert", "replace" 93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
93}; 97};
94 98
95static const char *op_type(struct btree_op *op)
96{
97 return op_types[op->type];
98}
99
100#define MAX_NEED_GC 64 99#define MAX_NEED_GC 64
101#define MAX_SAVE_PRIO 72 100#define MAX_SAVE_PRIO 72
102 101
@@ -105,23 +104,89 @@ static const char *op_type(struct btree_op *op)
105#define PTR_HASH(c, k) \ 104#define PTR_HASH(c, k) \
106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 105 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
107 106
108struct workqueue_struct *bch_gc_wq;
109static struct workqueue_struct *btree_io_wq; 107static struct workqueue_struct *btree_io_wq;
110 108
111void bch_btree_op_init_stack(struct btree_op *op) 109static inline bool should_split(struct btree *b)
112{ 110{
113 memset(op, 0, sizeof(struct btree_op)); 111 struct bset *i = write_block(b);
114 closure_init_stack(&op->cl); 112 return b->written >= btree_blocks(b) ||
115 op->lock = -1; 113 (b->written + __set_blocks(i, i->keys + 15, b->c)
116 bch_keylist_init(&op->keys); 114 > btree_blocks(b));
117} 115}
118 116
117#define insert_lock(s, b) ((b)->level <= (s)->lock)
118
119/*
120 * These macros are for recursing down the btree - they handle the details of
121 * locking and looking up nodes in the cache for you. They're best treated as
122 * mere syntax when reading code that uses them.
123 *
124 * op->lock determines whether we take a read or a write lock at a given depth.
125 * If you've got a read lock and find that you need a write lock (i.e. you're
126 * going to have to split), set op->lock and return -EINTR; btree_root() will
127 * call you again and you'll have the correct lock.
128 */
129
130/**
131 * btree - recurse down the btree on a specified key
132 * @fn: function to call, which will be passed the child node
133 * @key: key to recurse on
134 * @b: parent btree node
135 * @op: pointer to struct btree_op
136 */
137#define btree(fn, key, b, op, ...) \
138({ \
139 int _r, l = (b)->level - 1; \
140 bool _w = l <= (op)->lock; \
141 struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \
142 if (!IS_ERR(_child)) { \
143 _child->parent = (b); \
144 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
145 rw_unlock(_w, _child); \
146 } else \
147 _r = PTR_ERR(_child); \
148 _r; \
149})
150
151/**
152 * btree_root - call a function on the root of the btree
153 * @fn: function to call, which will be passed the child node
154 * @c: cache set
155 * @op: pointer to struct btree_op
156 */
157#define btree_root(fn, c, op, ...) \
158({ \
159 int _r = -EINTR; \
160 do { \
161 struct btree *_b = (c)->root; \
162 bool _w = insert_lock(op, _b); \
163 rw_lock(_w, _b, _b->level); \
164 if (_b == (c)->root && \
165 _w == insert_lock(op, _b)) { \
166 _b->parent = NULL; \
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \
169 rw_unlock(_w, _b); \
170 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \
173 !(c)->try_harder); \
174 _r = -EINTR; \
175 } \
176 } while (_r == -EINTR); \
177 \
178 _r; \
179})
180
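The btree() and btree_root() macros above encode the restart protocol the comment describes: a callback that finds it needs a write lock raises op->lock and returns -EINTR, and btree_root() simply walks down from the root again with the new lock level; -ENOSPC from node allocation turns into "wait until try_harder clears, then retry". A stripped-down sketch of that control flow with the locking, cache lookup and wait replaced by stubs (the return-code rules follow the macro; everything else is illustrative):

#include <errno.h>
#include <stdio.h>

struct op {
        int lock;       /* deepest level we need a write lock at */
};

/* Callback contract, as in btree_root(): 0 on success, -EINTR to be called
 * again from the root (e.g. after raising op->lock), -ENOSPC if allocation
 * needs the caller to wait before retrying. */
static int insert_fn(struct op *op, int level)
{
        if (level > op->lock) {
                /* Only held a read lock but need to split: upgrade, restart. */
                op->lock = level;
                return -EINTR;
        }
        return 0;
}

static void wait_for_try_harder(void)
{
        /* stand-in for wait_event(c->try_wait, !c->try_harder) */
}

static int walk_root(struct op *op, int root_level)
{
        int r = -EINTR;

        do {
                r = insert_fn(op, root_level);

                if (r == -ENOSPC) {
                        wait_for_try_harder();
                        r = -EINTR;
                }
        } while (r == -EINTR);

        return r;
}

int main(void)
{
        struct op op = { .lock = 0 };

        printf("walk returned %d, op.lock raised to %d\n",
               walk_root(&op, 2), op.lock);
        return 0;
}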
119/* Btree key manipulation */ 181/* Btree key manipulation */
120 182
121static void bkey_put(struct cache_set *c, struct bkey *k, int level) 183void bkey_put(struct cache_set *c, struct bkey *k)
122{ 184{
123 if ((level && KEY_OFFSET(k)) || !level) 185 unsigned i;
124 __bkey_put(c, k); 186
187 for (i = 0; i < KEY_PTRS(k); i++)
188 if (ptr_available(c, k, i))
189 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
125} 190}
126 191
127/* Btree IO */ 192/* Btree IO */
@@ -145,6 +210,10 @@ static void bch_btree_node_read_done(struct btree *b)
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 210 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
146 iter->used = 0; 211 iter->used = 0;
147 212
213#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b;
215#endif
216
148 if (!i->seq) 217 if (!i->seq)
149 goto err; 218 goto err;
150 219
@@ -160,7 +229,7 @@ static void bch_btree_node_read_done(struct btree *b)
160 goto err; 229 goto err;
161 230
162 err = "bad magic"; 231 err = "bad magic";
163 if (i->magic != bset_magic(b->c)) 232 if (i->magic != bset_magic(&b->c->sb))
164 goto err; 233 goto err;
165 234
166 err = "bad checksum"; 235 err = "bad checksum";
@@ -248,10 +317,7 @@ void bch_btree_node_read(struct btree *b)
248 goto err; 317 goto err;
249 318
250 bch_btree_node_read_done(b); 319 bch_btree_node_read_done(b);
251
252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time); 320 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock);
255 321
256 return; 322 return;
257err: 323err:
@@ -327,7 +393,7 @@ static void do_btree_node_write(struct btree *b)
327 b->bio = bch_bbio_alloc(b->c); 393 b->bio = bch_bbio_alloc(b->c);
328 394
329 b->bio->bi_end_io = btree_node_write_endio; 395 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl; 396 b->bio->bi_private = cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 398 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
333 bch_bio_map(b->bio, i); 399 bch_bio_map(b->bio, i);
@@ -383,7 +449,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
383 BUG_ON(b->written >= btree_blocks(b)); 449 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys); 450 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq); 451 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i); 452 bch_check_keys(b, "writing");
387 453
388 cancel_delayed_work(&b->work); 454 cancel_delayed_work(&b->work);
389 455
@@ -405,6 +471,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
405 bch_bset_init_next(b); 471 bch_bset_init_next(b);
406} 472}
407 473
474static void bch_btree_node_write_sync(struct btree *b)
475{
476 struct closure cl;
477
478 closure_init_stack(&cl);
479 bch_btree_node_write(b, &cl);
480 closure_sync(&cl);
481}
482
408static void btree_node_write_work(struct work_struct *w) 483static void btree_node_write_work(struct work_struct *w)
409{ 484{
410 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 485 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
@@ -416,7 +491,7 @@ static void btree_node_write_work(struct work_struct *w)
416 rw_unlock(true, b); 491 rw_unlock(true, b);
417} 492}
418 493
419static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
420{ 495{
421 struct bset *i = b->sets[b->nsets].data; 496 struct bset *i = b->sets[b->nsets].data;
422 struct btree_write *w = btree_current_write(b); 497 struct btree_write *w = btree_current_write(b);
@@ -429,15 +504,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
429 504
430 set_btree_node_dirty(b); 505 set_btree_node_dirty(b);
431 506
432 if (op && op->journal) { 507 if (journal_ref) {
433 if (w->journal && 508 if (w->journal &&
434 journal_pin_cmp(b->c, w, op)) { 509 journal_pin_cmp(b->c, w->journal, journal_ref)) {
435 atomic_dec_bug(w->journal); 510 atomic_dec_bug(w->journal);
436 w->journal = NULL; 511 w->journal = NULL;
437 } 512 }
438 513
439 if (!w->journal) { 514 if (!w->journal) {
440 w->journal = op->journal; 515 w->journal = journal_ref;
441 atomic_inc(w->journal); 516 atomic_inc(w->journal);
442 } 517 }
443 } 518 }
@@ -566,33 +641,32 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
566 return b; 641 return b;
567} 642}
568 643
569static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 644static int mca_reap(struct btree *b, unsigned min_order, bool flush)
570{ 645{
646 struct closure cl;
647
648 closure_init_stack(&cl);
571 lockdep_assert_held(&b->c->bucket_lock); 649 lockdep_assert_held(&b->c->bucket_lock);
572 650
573 if (!down_write_trylock(&b->lock)) 651 if (!down_write_trylock(&b->lock))
574 return -ENOMEM; 652 return -ENOMEM;
575 653
576 if (b->page_order < min_order) { 654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
655
656 if (b->page_order < min_order ||
657 (!flush &&
658 (btree_node_dirty(b) ||
659 atomic_read(&b->io.cl.remaining) != -1))) {
577 rw_unlock(true, b); 660 rw_unlock(true, b);
578 return -ENOMEM; 661 return -ENOMEM;
579 } 662 }
580 663
581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 664 if (btree_node_dirty(b))
582 665 bch_btree_node_write_sync(b);
583 if (cl && btree_node_dirty(b))
584 bch_btree_node_write(b, NULL);
585
586 if (cl)
587 closure_wait_event_async(&b->io.wait, cl,
588 atomic_read(&b->io.cl.remaining) == -1);
589 666
590 if (btree_node_dirty(b) || 667 /* wait for any in flight btree write */
591 !closure_is_unlocked(&b->io.cl) || 668 closure_wait_event(&b->io.wait, &cl,
592 work_pending(&b->work.work)) { 669 atomic_read(&b->io.cl.remaining) == -1);
593 rw_unlock(true, b);
594 return -EAGAIN;
595 }
596 670
597 return 0; 671 return 0;
598} 672}
@@ -633,7 +707,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
633 break; 707 break;
634 708
635 if (++i > 3 && 709 if (++i > 3 &&
636 !mca_reap(b, NULL, 0)) { 710 !mca_reap(b, 0, false)) {
637 mca_data_free(b); 711 mca_data_free(b);
638 rw_unlock(true, b); 712 rw_unlock(true, b);
639 freed++; 713 freed++;
@@ -652,7 +726,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
652 list_rotate_left(&c->btree_cache); 726 list_rotate_left(&c->btree_cache);
653 727
654 if (!b->accessed && 728 if (!b->accessed &&
655 !mca_reap(b, NULL, 0)) { 729 !mca_reap(b, 0, false)) {
656 mca_bucket_free(b); 730 mca_bucket_free(b);
657 mca_data_free(b); 731 mca_data_free(b);
658 rw_unlock(true, b); 732 rw_unlock(true, b);
@@ -723,12 +797,9 @@ int bch_btree_cache_alloc(struct cache_set *c)
723{ 797{
724 unsigned i; 798 unsigned i;
725 799
726 /* XXX: doesn't check for errors */
727
728 closure_init_unlocked(&c->gc);
729
730 for (i = 0; i < mca_reserve(c); i++) 800 for (i = 0; i < mca_reserve(c); i++)
731 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 801 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
802 return -ENOMEM;
732 803
733 list_splice_init(&c->btree_cache, 804 list_splice_init(&c->btree_cache,
734 &c->btree_cache_freeable); 805 &c->btree_cache_freeable);
@@ -775,52 +846,27 @@ out:
775 return b; 846 return b;
776} 847}
777 848
778static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 849static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
779 int level, struct closure *cl)
780{ 850{
781 int ret = -ENOMEM; 851 struct btree *b;
782 struct btree *i;
783 852
784 trace_bcache_btree_cache_cannibalize(c); 853 trace_bcache_btree_cache_cannibalize(c);
785 854
786 if (!cl) 855 if (!c->try_harder) {
787 return ERR_PTR(-ENOMEM); 856 c->try_harder = current;
788 857 c->try_harder_start = local_clock();
789 /* 858 } else if (c->try_harder != current)
790 * Trying to free up some memory - i.e. reuse some btree nodes - may 859 return ERR_PTR(-ENOSPC);
791 * require initiating IO to flush the dirty part of the node. If we're
792 * running under generic_make_request(), that IO will never finish and
793 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
794 * punt to workqueue and retry.
795 */
796 if (current->bio_list)
797 return ERR_PTR(-EAGAIN);
798
799 if (c->try_harder && c->try_harder != cl) {
800 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
801 return ERR_PTR(-EAGAIN);
802 }
803 860
804 c->try_harder = cl; 861 list_for_each_entry_reverse(b, &c->btree_cache, list)
805 c->try_harder_start = local_clock(); 862 if (!mca_reap(b, btree_order(k), false))
806retry: 863 return b;
807 list_for_each_entry_reverse(i, &c->btree_cache, list) {
808 int r = mca_reap(i, cl, btree_order(k));
809 if (!r)
810 return i;
811 if (r != -ENOMEM)
812 ret = r;
813 }
814 864
815 if (ret == -EAGAIN && 865 list_for_each_entry_reverse(b, &c->btree_cache, list)
816 closure_blocking(cl)) { 866 if (!mca_reap(b, btree_order(k), true))
817 mutex_unlock(&c->bucket_lock); 867 return b;
818 closure_sync(cl);
819 mutex_lock(&c->bucket_lock);
820 goto retry;
821 }
822 868
823 return ERR_PTR(ret); 869 return ERR_PTR(-ENOMEM);
824} 870}
825 871
826/* 872/*
@@ -829,20 +875,21 @@ retry:
829 * cannibalize_bucket() will take. This means every time we unlock the root of 875 * cannibalize_bucket() will take. This means every time we unlock the root of
830 * the btree, we need to release this lock if we have it held. 876 * the btree, we need to release this lock if we have it held.
831 */ 877 */
832void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 878static void bch_cannibalize_unlock(struct cache_set *c)
833{ 879{
834 if (c->try_harder == cl) { 880 if (c->try_harder == current) {
835 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 881 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
836 c->try_harder = NULL; 882 c->try_harder = NULL;
837 __closure_wake_up(&c->try_wait); 883 wake_up(&c->try_wait);
838 } 884 }
839} 885}
840 886
841static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 887static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
842 int level, struct closure *cl)
843{ 888{
844 struct btree *b; 889 struct btree *b;
845 890
891 BUG_ON(current->bio_list);
892
846 lockdep_assert_held(&c->bucket_lock); 893 lockdep_assert_held(&c->bucket_lock);
847 894
848 if (mca_find(c, k)) 895 if (mca_find(c, k))
@@ -852,14 +899,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
852 * the list. Check if there's any freed nodes there: 899 * the list. Check if there's any freed nodes there:
853 */ 900 */
854 list_for_each_entry(b, &c->btree_cache_freeable, list) 901 list_for_each_entry(b, &c->btree_cache_freeable, list)
855 if (!mca_reap(b, NULL, btree_order(k))) 902 if (!mca_reap(b, btree_order(k), false))
856 goto out; 903 goto out;
857 904
858 /* We never free struct btree itself, just the memory that holds the on 905 /* We never free struct btree itself, just the memory that holds the on
859 * disk node. Check the freed list before allocating a new one: 906 * disk node. Check the freed list before allocating a new one:
860 */ 907 */
861 list_for_each_entry(b, &c->btree_cache_freed, list) 908 list_for_each_entry(b, &c->btree_cache_freed, list)
862 if (!mca_reap(b, NULL, 0)) { 909 if (!mca_reap(b, 0, false)) {
863 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
864 if (!b->sets[0].data) 911 if (!b->sets[0].data)
865 goto err; 912 goto err;
@@ -884,6 +931,7 @@ out:
884 931
885 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
886 b->level = level; 933 b->level = level;
934 b->parent = (void *) ~0UL;
887 935
888 mca_reinit(b); 936 mca_reinit(b);
889 937
@@ -892,7 +940,7 @@ err:
892 if (b) 940 if (b)
893 rw_unlock(true, b); 941 rw_unlock(true, b);
894 942
895 b = mca_cannibalize(c, k, level, cl); 943 b = mca_cannibalize(c, k);
896 if (!IS_ERR(b)) 944 if (!IS_ERR(b))
897 goto out; 945 goto out;
898 946
@@ -903,17 +951,15 @@ err:
903 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 951 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
904 * in from disk if necessary. 952 * in from disk if necessary.
905 * 953 *
906 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 954 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
907 * if that closure is in non blocking mode, will return -EAGAIN.
908 * 955 *
909 * The btree node will have either a read or a write lock held, depending on 956 * The btree node will have either a read or a write lock held, depending on
910 * level and op->lock. 957 * level and op->lock.
911 */ 958 */
912struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 959struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
913 int level, struct btree_op *op) 960 int level, bool write)
914{ 961{
915 int i = 0; 962 int i = 0;
916 bool write = level <= op->lock;
917 struct btree *b; 963 struct btree *b;
918 964
919 BUG_ON(level < 0); 965 BUG_ON(level < 0);
@@ -925,7 +971,7 @@ retry:
925 return ERR_PTR(-EAGAIN); 971 return ERR_PTR(-EAGAIN);
926 972
927 mutex_lock(&c->bucket_lock); 973 mutex_lock(&c->bucket_lock);
928 b = mca_alloc(c, k, level, &op->cl); 974 b = mca_alloc(c, k, level);
929 mutex_unlock(&c->bucket_lock); 975 mutex_unlock(&c->bucket_lock);
930 976
931 if (!b) 977 if (!b)
@@ -971,7 +1017,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
971 struct btree *b; 1017 struct btree *b;
972 1018
973 mutex_lock(&c->bucket_lock); 1019 mutex_lock(&c->bucket_lock);
974 b = mca_alloc(c, k, level, NULL); 1020 b = mca_alloc(c, k, level);
975 mutex_unlock(&c->bucket_lock); 1021 mutex_unlock(&c->bucket_lock);
976 1022
977 if (!IS_ERR_OR_NULL(b)) { 1023 if (!IS_ERR_OR_NULL(b)) {
@@ -982,17 +1028,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
982 1028
983/* Btree alloc */ 1029/* Btree alloc */
984 1030
985static void btree_node_free(struct btree *b, struct btree_op *op) 1031static void btree_node_free(struct btree *b)
986{ 1032{
987 unsigned i; 1033 unsigned i;
988 1034
989 trace_bcache_btree_node_free(b); 1035 trace_bcache_btree_node_free(b);
990 1036
991 /*
992 * The BUG_ON() in btree_node_get() implies that we must have a write
993 * lock on parent to free or even invalidate a node
994 */
995 BUG_ON(op->lock <= b->level);
996 BUG_ON(b == b->c->root); 1037 BUG_ON(b == b->c->root);
997 1038
998 if (btree_node_dirty(b)) 1039 if (btree_node_dirty(b))
@@ -1015,27 +1056,26 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
1015 mutex_unlock(&b->c->bucket_lock); 1056 mutex_unlock(&b->c->bucket_lock);
1016} 1057}
1017 1058
1018struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1059struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1019 struct closure *cl)
1020{ 1060{
1021 BKEY_PADDED(key) k; 1061 BKEY_PADDED(key) k;
1022 struct btree *b = ERR_PTR(-EAGAIN); 1062 struct btree *b = ERR_PTR(-EAGAIN);
1023 1063
1024 mutex_lock(&c->bucket_lock); 1064 mutex_lock(&c->bucket_lock);
1025retry: 1065retry:
1026 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
1027 goto err; 1067 goto err;
1028 1068
1069 bkey_put(c, &k.key);
1029 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1070 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1030 1071
1031 b = mca_alloc(c, &k.key, level, cl); 1072 b = mca_alloc(c, &k.key, level);
1032 if (IS_ERR(b)) 1073 if (IS_ERR(b))
1033 goto err_free; 1074 goto err_free;
1034 1075
1035 if (!b) { 1076 if (!b) {
1036 cache_bug(c, 1077 cache_bug(c,
1037 "Tried to allocate bucket that was in btree cache"); 1078 "Tried to allocate bucket that was in btree cache");
1038 __bkey_put(c, &k.key);
1039 goto retry; 1079 goto retry;
1040 } 1080 }
1041 1081
@@ -1048,7 +1088,6 @@ retry:
1048 return b; 1088 return b;
1049err_free: 1089err_free:
1050 bch_bucket_free(c, &k.key); 1090 bch_bucket_free(c, &k.key);
1051 __bkey_put(c, &k.key);
1052err: 1091err:
1053 mutex_unlock(&c->bucket_lock); 1092 mutex_unlock(&c->bucket_lock);
1054 1093
@@ -1056,16 +1095,31 @@ err:
1056 return b; 1095 return b;
1057} 1096}
1058 1097
1059static struct btree *btree_node_alloc_replacement(struct btree *b, 1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1060 struct closure *cl)
1061{ 1099{
1062 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1063 if (!IS_ERR_OR_NULL(n)) 1101 if (!IS_ERR_OR_NULL(n))
1064 bch_btree_sort_into(b, n); 1102 bch_btree_sort_into(b, n);
1065 1103
1066 return n; 1104 return n;
1067} 1105}
1068 1106
1107static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1108{
1109 unsigned i;
1110
1111 bkey_copy(k, &b->key);
1112 bkey_copy_key(k, &ZERO_KEY);
1113
1114 for (i = 0; i < KEY_PTRS(k); i++) {
1115 uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1116
1117 SET_PTR_GEN(k, i, g);
1118 }
1119
1120 atomic_inc(&b->c->prio_blocked);
1121}
1122
1069/* Garbage collection */ 1123/* Garbage collection */
1070 1124
1071uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1119,12 +1173,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1119 1173
1120#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1174#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1121 1175
1122static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1176static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1123 struct gc_stat *gc)
1124{ 1177{
1125 uint8_t stale = 0; 1178 uint8_t stale = 0;
1126 unsigned last_dev = -1; 1179 unsigned keys = 0, good_keys = 0;
1127 struct bcache_device *d = NULL;
1128 struct bkey *k; 1180 struct bkey *k;
1129 struct btree_iter iter; 1181 struct btree_iter iter;
1130 struct bset_tree *t; 1182 struct bset_tree *t;
@@ -1132,27 +1184,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1132 gc->nodes++; 1184 gc->nodes++;
1133 1185
1134 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1135 if (last_dev != KEY_INODE(k)) {
1136 last_dev = KEY_INODE(k);
1137
1138 d = KEY_INODE(k) < b->c->nr_uuids
1139 ? b->c->devices[last_dev]
1140 : NULL;
1141 }
1142
1143 stale = max(stale, btree_mark_key(b, k)); 1187 stale = max(stale, btree_mark_key(b, k));
1188 keys++;
1144 1189
1145 if (bch_ptr_bad(b, k)) 1190 if (bch_ptr_bad(b, k))
1146 continue; 1191 continue;
1147 1192
1148 *keys += bkey_u64s(k);
1149
1150 gc->key_bytes += bkey_u64s(k); 1193 gc->key_bytes += bkey_u64s(k);
1151 gc->nkeys++; 1194 gc->nkeys++;
1195 good_keys++;
1152 1196
1153 gc->data += KEY_SIZE(k); 1197 gc->data += KEY_SIZE(k);
1154 if (KEY_DIRTY(k))
1155 gc->dirty += KEY_SIZE(k);
1156 } 1198 }
1157 1199
1158 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1161,78 +1203,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1161 bkey_cmp(&b->key, &t->end) < 0, 1203 bkey_cmp(&b->key, &t->end) < 0,
1162 b, "found short btree key in gc"); 1204 b, "found short btree key in gc");
1163 1205
1164 return stale; 1206 if (b->c->gc_always_rewrite)
1165} 1207 return true;
1166
1167static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1168 struct btree_op *op)
1169{
1170 /*
1171 * We block priorities from being written for the duration of garbage
1172 * collection, so we can't sleep in btree_alloc() ->
1173 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1174 * our closure.
1175 */
1176 struct btree *n = btree_node_alloc_replacement(b, NULL);
1177
1178 if (!IS_ERR_OR_NULL(n)) {
1179 swap(b, n);
1180 __bkey_put(b->c, &b->key);
1181 1208
1182 memcpy(k->ptr, b->key.ptr, 1209 if (stale > 10)
1183 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1210 return true;
1184 1211
1185 btree_node_free(n, op); 1212 if ((keys - good_keys) * 2 > keys)
1186 up_write(&n->lock); 1213 return true;
1187 }
1188 1214
1189 return b; 1215 return false;
1190} 1216}
1191 1217
1192/* 1218#define GC_MERGE_NODES 4U
1193 * Leaving this at 2 until we've got incremental garbage collection done; it
1194 * could be higher (and has been tested with 4) except that garbage collection
1195 * could take much longer, adversely affecting latency.
1196 */
1197#define GC_MERGE_NODES 2U
1198 1219
1199struct gc_merge_info { 1220struct gc_merge_info {
1200 struct btree *b; 1221 struct btree *b;
1201 struct bkey *k;
1202 unsigned keys; 1222 unsigned keys;
1203}; 1223};
1204 1224
1205static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1225static int bch_btree_insert_node(struct btree *, struct btree_op *,
1206 struct gc_stat *gc, struct gc_merge_info *r) 1226 struct keylist *, atomic_t *, struct bkey *);
1227
1228static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1229 struct keylist *keylist, struct gc_stat *gc,
1230 struct gc_merge_info *r)
1207{ 1231{
1208 unsigned nodes = 0, keys = 0, blocks; 1232 unsigned i, nodes = 0, keys = 0, blocks;
1209 int i; 1233 struct btree *new_nodes[GC_MERGE_NODES];
1234 struct closure cl;
1235 struct bkey *k;
1236
1237 memset(new_nodes, 0, sizeof(new_nodes));
1238 closure_init_stack(&cl);
1210 1239
1211 while (nodes < GC_MERGE_NODES && r[nodes].b) 1240 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1212 keys += r[nodes++].keys; 1241 keys += r[nodes++].keys;
1213 1242
1214 blocks = btree_default_blocks(b->c) * 2 / 3; 1243 blocks = btree_default_blocks(b->c) * 2 / 3;
1215 1244
1216 if (nodes < 2 || 1245 if (nodes < 2 ||
1217 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1218 return; 1247 return 0;
1219
1220 for (i = nodes - 1; i >= 0; --i) {
1221 if (r[i].b->written)
1222 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1223 1248
1224 if (r[i].b->written) 1249 for (i = 0; i < nodes; i++) {
1225 return; 1250 new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
1251 if (IS_ERR_OR_NULL(new_nodes[i]))
1252 goto out_nocoalesce;
1226 } 1253 }
1227 1254
1228 for (i = nodes - 1; i > 0; --i) { 1255 for (i = nodes - 1; i > 0; --i) {
1229 struct bset *n1 = r[i].b->sets->data; 1256 struct bset *n1 = new_nodes[i]->sets->data;
1230 struct bset *n2 = r[i - 1].b->sets->data; 1257 struct bset *n2 = new_nodes[i - 1]->sets->data;
1231 struct bkey *k, *last = NULL; 1258 struct bkey *k, *last = NULL;
1232 1259
1233 keys = 0; 1260 keys = 0;
1234 1261
1235 if (i == 1) { 1262 if (i > 1) {
1263 for (k = n2->start;
1264 k < end(n2);
1265 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks)
1268 break;
1269
1270 last = k;
1271 keys += bkey_u64s(k);
1272 }
1273 } else {
1236 /* 1274 /*
1237 * Last node we're not getting rid of - we're getting 1275 * Last node we're not getting rid of - we're getting
1238 * rid of the node at r[0]. Have to try and fit all of 1276 * rid of the node at r[0]. Have to try and fit all of
@@ -1241,37 +1279,27 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1241 * length keys (shouldn't be possible in practice, 1279 * length keys (shouldn't be possible in practice,
1242 * though) 1280 * though)
1243 */ 1281 */
1244 if (__set_blocks(n1, n1->keys + r->keys, 1282 if (__set_blocks(n1, n1->keys + n2->keys,
1245 b->c) > btree_blocks(r[i].b)) 1283 b->c) > btree_blocks(new_nodes[i]))
1246 return; 1284 goto out_nocoalesce;
1247 1285
1248 keys = n2->keys; 1286 keys = n2->keys;
1287 /* Take the key of the node we're getting rid of */
1249 last = &r->b->key; 1288 last = &r->b->key;
1250 } else 1289 }
1251 for (k = n2->start;
1252 k < end(n2);
1253 k = bkey_next(k)) {
1254 if (__set_blocks(n1, n1->keys + keys +
1255 bkey_u64s(k), b->c) > blocks)
1256 break;
1257
1258 last = k;
1259 keys += bkey_u64s(k);
1260 }
1261 1290
1262 BUG_ON(__set_blocks(n1, n1->keys + keys, 1291 BUG_ON(__set_blocks(n1, n1->keys + keys,
1263 b->c) > btree_blocks(r[i].b)); 1292 b->c) > btree_blocks(new_nodes[i]));
1264 1293
1265 if (last) { 1294 if (last)
1266 bkey_copy_key(&r[i].b->key, last); 1295 bkey_copy_key(&new_nodes[i]->key, last);
1267 bkey_copy_key(r[i].k, last);
1268 }
1269 1296
1270 memcpy(end(n1), 1297 memcpy(end(n1),
1271 n2->start, 1298 n2->start,
1272 (void *) node(n2, keys) - (void *) n2->start); 1299 (void *) node(n2, keys) - (void *) n2->start);
1273 1300
1274 n1->keys += keys; 1301 n1->keys += keys;
1302 r[i].keys = n1->keys;
1275 1303
1276 memmove(n2->start, 1304 memmove(n2->start,
1277 node(n2, keys), 1305 node(n2, keys),
@@ -1279,95 +1307,176 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1279 1307
1280 n2->keys -= keys; 1308 n2->keys -= keys;
1281 1309
1282 r[i].keys = n1->keys; 1310 if (bch_keylist_realloc(keylist,
1283 r[i - 1].keys = n2->keys; 1311 KEY_PTRS(&new_nodes[i]->key), b->c))
1312 goto out_nocoalesce;
1313
1314 bch_btree_node_write(new_nodes[i], &cl);
1315 bch_keylist_add(keylist, &new_nodes[i]->key);
1284 } 1316 }
1285 1317
1286 btree_node_free(r->b, op); 1318 for (i = 0; i < nodes; i++) {
1287 up_write(&r->b->lock); 1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
1320 goto out_nocoalesce;
1288 1321
1289 trace_bcache_btree_gc_coalesce(nodes); 1322 make_btree_freeing_key(r[i].b, keylist->top);
1323 bch_keylist_push(keylist);
1324 }
1325
1326 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys);
1328 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]);
1330
1331 closure_sync(&cl);
1332
1333 for (i = 0; i < nodes; i++) {
1334 btree_node_free(r[i].b);
1335 rw_unlock(true, r[i].b);
1336
1337 r[i].b = new_nodes[i];
1338 }
1339
1340 bch_btree_insert_node(b, op, keylist, NULL, NULL);
1341 BUG_ON(!bch_keylist_empty(keylist));
1342
1343 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1344 r[nodes - 1].b = ERR_PTR(-EINTR);
1290 1345
1346 trace_bcache_btree_gc_coalesce(nodes);
1291 gc->nodes--; 1347 gc->nodes--;
1292 nodes--;
1293 1348
1294 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1349 /* Invalidated our iterator */
1295 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1350 return -EINTR;
1351
1352out_nocoalesce:
1353 closure_sync(&cl);
1354
1355 while ((k = bch_keylist_pop(keylist)))
1356 if (!bkey_cmp(k, &ZERO_KEY))
1357 atomic_dec(&b->c->prio_blocked);
1358
1359 for (i = 0; i < nodes; i++)
1360 if (!IS_ERR_OR_NULL(new_nodes[i])) {
1361 btree_node_free(new_nodes[i]);
1362 rw_unlock(true, new_nodes[i]);
1363 }
1364 return 0;
1296} 1365}
1297 1366
1298static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1367static unsigned btree_gc_count_keys(struct btree *b)
1299 struct closure *writes, struct gc_stat *gc)
1300{ 1368{
1301 void write(struct btree *r) 1369 struct bkey *k;
1302 { 1370 struct btree_iter iter;
1303 if (!r->written) 1371 unsigned ret = 0;
1304 bch_btree_node_write(r, &op->cl);
1305 else if (btree_node_dirty(r))
1306 bch_btree_node_write(r, writes);
1307 1372
1308 up_write(&r->lock); 1373 for_each_key_filter(b, k, &iter, bch_ptr_bad)
1309 } 1374 ret += bkey_u64s(k);
1375
1376 return ret;
1377}
1310 1378
1311 int ret = 0, stale; 1379static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1380 struct closure *writes, struct gc_stat *gc)
1381{
1312 unsigned i; 1382 unsigned i;
1383 int ret = 0;
1384 bool should_rewrite;
1385 struct btree *n;
1386 struct bkey *k;
1387 struct keylist keys;
1388 struct btree_iter iter;
1313 struct gc_merge_info r[GC_MERGE_NODES]; 1389 struct gc_merge_info r[GC_MERGE_NODES];
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1314 1391
1315 memset(r, 0, sizeof(r)); 1392 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done);
1316 1394
1317 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1395 for (i = 0; i < GC_MERGE_NODES; i++)
1318 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1396 r[i].b = ERR_PTR(-EINTR);
1319 1397
1320 if (IS_ERR(r->b)) { 1398 while (1) {
1321 ret = PTR_ERR(r->b); 1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1322 break; 1400 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) {
1403 ret = PTR_ERR(r->b);
1404 break;
1405 }
1406
1407 r->keys = btree_gc_count_keys(r->b);
1408
1409 ret = btree_gc_coalesce(b, op, &keys, gc, r);
1410 if (ret)
1411 break;
1323 } 1412 }
1324 1413
1325 r->keys = 0; 1414 if (!last->b)
1326 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1415 break;
1327 1416
1328 if (!b->written && 1417 if (!IS_ERR(last->b)) {
1329 (r->b->level || stale > 10 || 1418 should_rewrite = btree_gc_mark_node(last->b, gc);
1330 b->c->gc_always_rewrite)) 1419 if (should_rewrite) {
1331 r->b = btree_gc_alloc(r->b, r->k, op); 1420 n = btree_node_alloc_replacement(last->b,
1421 false);
1332 1422
1333 if (r->b->level) 1423 if (!IS_ERR_OR_NULL(n)) {
1334 ret = btree_gc_recurse(r->b, op, writes, gc); 1424 bch_btree_node_write_sync(n);
1425 bch_keylist_add(&keys, &n->key);
1335 1426
1336 if (ret) { 1427 make_btree_freeing_key(last->b,
1337 write(r->b); 1428 keys.top);
1338 break; 1429 bch_keylist_push(&keys);
1339 } 1430
1431 btree_node_free(last->b);
1432
1433 bch_btree_insert_node(b, op, &keys,
1434 NULL, NULL);
1435 BUG_ON(!bch_keylist_empty(&keys));
1340 1436
1341 bkey_copy_key(&b->c->gc_done, r->k); 1437 rw_unlock(true, last->b);
1438 last->b = n;
1342 1439
1343 if (!b->written) 1440 /* Invalidated our iterator */
1344 btree_gc_coalesce(b, op, gc, r); 1441 ret = -EINTR;
1442 break;
1443 }
1444 }
1345 1445
1346 if (r[GC_MERGE_NODES - 1].b) 1446 if (last->b->level) {
1347 write(r[GC_MERGE_NODES - 1].b); 1447 ret = btree_gc_recurse(last->b, op, writes, gc);
1448 if (ret)
1449 break;
1450 }
1348 1451
1349 memmove(&r[1], &r[0], 1452 bkey_copy_key(&b->c->gc_done, &last->b->key);
1350 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1453
1454 /*
1455 * Must flush leaf nodes before gc ends, since replace
1456 * operations aren't journalled
1457 */
1458 if (btree_node_dirty(last->b))
1459 bch_btree_node_write(last->b, writes);
1460 rw_unlock(true, last->b);
1461 }
1462
1463 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1464 r->b = NULL;
1351 1465
1352 /* When we've got incremental GC working, we'll want to do
1353 * if (should_resched())
1354 * return -EAGAIN;
1355 */
1356 cond_resched();
1357#if 0
1358 if (need_resched()) { 1466 if (need_resched()) {
1359 ret = -EAGAIN; 1467 ret = -EAGAIN;
1360 break; 1468 break;
1361 } 1469 }
1362#endif
1363 } 1470 }
1364 1471
1365 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1472 for (i = 0; i < GC_MERGE_NODES; i++)
1366 write(r[i].b); 1473 if (!IS_ERR_OR_NULL(r[i].b)) {
1474 if (btree_node_dirty(r[i].b))
1475 bch_btree_node_write(r[i].b, writes);
1476 rw_unlock(true, r[i].b);
1477 }
1367 1478
1368 /* Might have freed some children, must remove their keys */ 1479 bch_keylist_free(&keys);
1369 if (!b->written)
1370 bch_btree_sort(b);
1371 1480
1372 return ret; 1481 return ret;
1373} 1482}
@@ -1376,29 +1485,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1376 struct closure *writes, struct gc_stat *gc) 1485 struct closure *writes, struct gc_stat *gc)
1377{ 1486{
1378 struct btree *n = NULL; 1487 struct btree *n = NULL;
1379 unsigned keys = 0; 1488 int ret = 0;
1380 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1489 bool should_rewrite;
1381
1382 if (b->level || stale > 10)
1383 n = btree_node_alloc_replacement(b, NULL);
1384 1490
1385 if (!IS_ERR_OR_NULL(n)) 1491 should_rewrite = btree_gc_mark_node(b, gc);
1386 swap(b, n); 1492 if (should_rewrite) {
1493 n = btree_node_alloc_replacement(b, false);
1387 1494
1388 if (b->level) 1495 if (!IS_ERR_OR_NULL(n)) {
1389 ret = btree_gc_recurse(b, op, writes, gc); 1496 bch_btree_node_write_sync(n);
1497 bch_btree_set_root(n);
1498 btree_node_free(b);
1499 rw_unlock(true, n);
1390 1500
1391 if (!b->written || btree_node_dirty(b)) { 1501 return -EINTR;
1392 bch_btree_node_write(b, n ? &op->cl : NULL); 1502 }
1393 } 1503 }
1394 1504
1395 if (!IS_ERR_OR_NULL(n)) { 1505 if (b->level) {
1396 closure_sync(&op->cl); 1506 ret = btree_gc_recurse(b, op, writes, gc);
1397 bch_btree_set_root(b); 1507 if (ret)
1398 btree_node_free(n, op); 1508 return ret;
1399 rw_unlock(true, b);
1400 } 1509 }
1401 1510
1511 bkey_copy_key(&b->c->gc_done, &b->key);
1512
1402 return ret; 1513 return ret;
1403} 1514}
1404 1515
@@ -1479,9 +1590,8 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1479 return available; 1590 return available;
1480} 1591}
1481 1592
1482static void bch_btree_gc(struct closure *cl) 1593static void bch_btree_gc(struct cache_set *c)
1483{ 1594{
1484 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1485 int ret; 1595 int ret;
1486 unsigned long available; 1596 unsigned long available;
1487 struct gc_stat stats; 1597 struct gc_stat stats;
@@ -1493,47 +1603,73 @@ static void bch_btree_gc(struct closure *cl)
1493 1603
1494 memset(&stats, 0, sizeof(struct gc_stat)); 1604 memset(&stats, 0, sizeof(struct gc_stat));
1495 closure_init_stack(&writes); 1605 closure_init_stack(&writes);
1496 bch_btree_op_init_stack(&op); 1606 bch_btree_op_init(&op, SHRT_MAX);
1497 op.lock = SHRT_MAX;
1498 1607
1499 btree_gc_start(c); 1608 btree_gc_start(c);
1500 1609
1501 atomic_inc(&c->prio_blocked); 1610 do {
1502 1611 ret = btree_root(gc_root, c, &op, &writes, &stats);
1503 ret = btree_root(gc_root, c, &op, &writes, &stats); 1612 closure_sync(&writes);
1504 closure_sync(&op.cl);
1505 closure_sync(&writes);
1506
1507 if (ret) {
1508 pr_warn("gc failed!");
1509 continue_at(cl, bch_btree_gc, bch_gc_wq);
1510 }
1511 1613
1512 /* Possibly wait for new UUIDs or whatever to hit disk */ 1614 if (ret && ret != -EAGAIN)
1513 bch_journal_meta(c, &op.cl); 1615 pr_warn("gc failed!");
1514 closure_sync(&op.cl); 1616 } while (ret);
1515 1617
1516 available = bch_btree_gc_finish(c); 1618 available = bch_btree_gc_finish(c);
1517
1518 atomic_dec(&c->prio_blocked);
1519 wake_up_allocators(c); 1619 wake_up_allocators(c);
1520 1620
1521 bch_time_stats_update(&c->btree_gc_time, start_time); 1621 bch_time_stats_update(&c->btree_gc_time, start_time);
1522 1622
1523 stats.key_bytes *= sizeof(uint64_t); 1623 stats.key_bytes *= sizeof(uint64_t);
1524 stats.dirty <<= 9;
1525 stats.data <<= 9; 1624 stats.data <<= 9;
1526 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1625 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1527 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1626 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1528 1627
1529 trace_bcache_gc_end(c); 1628 trace_bcache_gc_end(c);
1530 1629
1531 continue_at(cl, bch_moving_gc, bch_gc_wq); 1630 bch_moving_gc(c);
1631}
1632
1633static int bch_gc_thread(void *arg)
1634{
1635 struct cache_set *c = arg;
1636 struct cache *ca;
1637 unsigned i;
1638
1639 while (1) {
1640again:
1641 bch_btree_gc(c);
1642
1643 set_current_state(TASK_INTERRUPTIBLE);
1644 if (kthread_should_stop())
1645 break;
1646
1647 mutex_lock(&c->bucket_lock);
1648
1649 for_each_cache(ca, c, i)
1650 if (ca->invalidate_needs_gc) {
1651 mutex_unlock(&c->bucket_lock);
1652 set_current_state(TASK_RUNNING);
1653 goto again;
1654 }
1655
1656 mutex_unlock(&c->bucket_lock);
1657
1658 try_to_freeze();
1659 schedule();
1660 }
1661
1662 return 0;
1532} 1663}
1533 1664
1534void bch_queue_gc(struct cache_set *c) 1665int bch_gc_thread_start(struct cache_set *c)
1535{ 1666{
1536 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1667 c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1668 if (IS_ERR(c->gc_thread))
1669 return PTR_ERR(c->gc_thread);
1670
1671 set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1672 return 0;
1537} 1673}
1538 1674
1539/* Initial partial gc */ 1675/* Initial partial gc */
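
To make the garbage-collection rework in the hunks above easier to follow: the closure-based bch_btree_gc()/bch_queue_gc() path is replaced by a dedicated kthread that loops over bch_btree_gc() and re-runs whenever a cache has invalidate_needs_gc set. A rough sketch of how a cache set would be expected to drive this, assuming only bch_gc_thread_start() from this hunk plus wake_up_gc() and set_gc_sectors() from the btree.h changes further down; the example_enable_gc() wrapper itself is invented for illustration and is not code from the patch:

static int example_enable_gc(struct cache_set *c)
{
	int ret = bch_gc_thread_start(c);	/* kthread_create("bcache_gc"), left sleeping */

	if (ret)
		return ret;

	set_gc_sectors(c);	/* re-arm the sectors_to_gc threshold */
	wake_up_gc(c);		/* wake_up_process() on c->gc_thread for a first pass */
	return 0;
}

Teardown would presumably pair this with kthread_stop(c->gc_thread), which is what the kthread_should_stop() check in the thread loop above is there for.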
@@ -1541,9 +1677,9 @@ void bch_queue_gc(struct cache_set *c)
1541static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1677static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1542 unsigned long **seen) 1678 unsigned long **seen)
1543{ 1679{
1544 int ret; 1680 int ret = 0;
1545 unsigned i; 1681 unsigned i;
1546 struct bkey *k; 1682 struct bkey *k, *p = NULL;
1547 struct bucket *g; 1683 struct bucket *g;
1548 struct btree_iter iter; 1684 struct btree_iter iter;
1549 1685
@@ -1570,31 +1706,32 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1570 } 1706 }
1571 1707
1572 if (b->level) { 1708 if (b->level) {
1573 k = bch_next_recurse_key(b, &ZERO_KEY); 1709 bch_btree_iter_init(b, &iter, NULL);
1574 1710
1575 while (k) { 1711 do {
1576 struct bkey *p = bch_next_recurse_key(b, k); 1712 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1577 if (p) 1713 if (k)
1578 btree_node_prefetch(b->c, p, b->level - 1); 1714 btree_node_prefetch(b->c, k, b->level - 1);
1579 1715
1580 ret = btree(check_recurse, k, b, op, seen); 1716 if (p)
1581 if (ret) 1717 ret = btree(check_recurse, p, b, op, seen);
1582 return ret;
1583 1718
1584 k = p; 1719 p = k;
1585 } 1720 } while (p && !ret);
1586 } 1721 }
1587 1722
1588 return 0; 1723 return 0;
1589} 1724}
1590 1725
1591int bch_btree_check(struct cache_set *c, struct btree_op *op) 1726int bch_btree_check(struct cache_set *c)
1592{ 1727{
1593 int ret = -ENOMEM; 1728 int ret = -ENOMEM;
1594 unsigned i; 1729 unsigned i;
1595 unsigned long *seen[MAX_CACHES_PER_SET]; 1730 unsigned long *seen[MAX_CACHES_PER_SET];
1731 struct btree_op op;
1596 1732
1597 memset(seen, 0, sizeof(seen)); 1733 memset(seen, 0, sizeof(seen));
1734 bch_btree_op_init(&op, SHRT_MAX);
1598 1735
1599 for (i = 0; c->cache[i]; i++) { 1736 for (i = 0; c->cache[i]; i++) {
1600 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1737 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
@@ -1606,7 +1743,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op)
1606 memset(seen[i], 0xFF, n); 1743 memset(seen[i], 0xFF, n);
1607 } 1744 }
1608 1745
1609 ret = btree_root(check_recurse, c, op, seen); 1746 ret = btree_root(check_recurse, c, &op, seen);
1610err: 1747err:
1611 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1748 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1612 kfree(seen[i]); 1749 kfree(seen[i]);
@@ -1628,10 +1765,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1628 bch_bset_fix_lookup_table(b, where); 1765 bch_bset_fix_lookup_table(b, where);
1629} 1766}
1630 1767
1631static bool fix_overlapping_extents(struct btree *b, 1768static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1632 struct bkey *insert,
1633 struct btree_iter *iter, 1769 struct btree_iter *iter,
1634 struct btree_op *op) 1770 struct bkey *replace_key)
1635{ 1771{
1636 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1772 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1637 { 1773 {
@@ -1659,39 +1795,38 @@ static bool fix_overlapping_extents(struct btree *b,
1659 * We might overlap with 0 size extents; we can't skip these 1795 * We might overlap with 0 size extents; we can't skip these
1660 * because if they're in the set we're inserting to we have to 1796 * because if they're in the set we're inserting to we have to
1661 * adjust them so they don't overlap with the key we're 1797 * adjust them so they don't overlap with the key we're
1662 * inserting. But we don't want to check them for BTREE_REPLACE 1798 * inserting. But we don't want to check them for replace
1663 * operations. 1799 * operations.
1664 */ 1800 */
1665 1801
1666 if (op->type == BTREE_REPLACE && 1802 if (replace_key && KEY_SIZE(k)) {
1667 KEY_SIZE(k)) {
1668 /* 1803 /*
1669 * k might have been split since we inserted/found the 1804 * k might have been split since we inserted/found the
1670 * key we're replacing 1805 * key we're replacing
1671 */ 1806 */
1672 unsigned i; 1807 unsigned i;
1673 uint64_t offset = KEY_START(k) - 1808 uint64_t offset = KEY_START(k) -
1674 KEY_START(&op->replace); 1809 KEY_START(replace_key);
1675 1810
1676 /* But it must be a subset of the replace key */ 1811 /* But it must be a subset of the replace key */
1677 if (KEY_START(k) < KEY_START(&op->replace) || 1812 if (KEY_START(k) < KEY_START(replace_key) ||
1678 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1813 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1679 goto check_failed; 1814 goto check_failed;
1680 1815
1681 /* We didn't find a key that we were supposed to */ 1816 /* We didn't find a key that we were supposed to */
1682 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1817 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1683 goto check_failed; 1818 goto check_failed;
1684 1819
1685 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1820 if (KEY_PTRS(replace_key) != KEY_PTRS(k))
1686 goto check_failed; 1821 goto check_failed;
1687 1822
1688 /* skip past gen */ 1823 /* skip past gen */
1689 offset <<= 8; 1824 offset <<= 8;
1690 1825
1691 BUG_ON(!KEY_PTRS(&op->replace)); 1826 BUG_ON(!KEY_PTRS(replace_key));
1692 1827
1693 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1828 for (i = 0; i < KEY_PTRS(replace_key); i++)
1694 if (k->ptr[i] != op->replace.ptr[i] + offset) 1829 if (k->ptr[i] != replace_key->ptr[i] + offset)
1695 goto check_failed; 1830 goto check_failed;
1696 1831
1697 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1832 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
@@ -1742,6 +1877,9 @@ static bool fix_overlapping_extents(struct btree *b,
1742 if (bkey_cmp(insert, k) < 0) { 1877 if (bkey_cmp(insert, k) < 0) {
1743 bch_cut_front(insert, k); 1878 bch_cut_front(insert, k);
1744 } else { 1879 } else {
1880 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1881 old_offset = KEY_START(insert);
1882
1745 if (bkey_written(b, k) && 1883 if (bkey_written(b, k) &&
1746 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1884 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1747 /* 1885 /*
@@ -1759,9 +1897,8 @@ static bool fix_overlapping_extents(struct btree *b,
1759 } 1897 }
1760 1898
1761check_failed: 1899check_failed:
1762 if (op->type == BTREE_REPLACE) { 1900 if (replace_key) {
1763 if (!sectors_found) { 1901 if (!sectors_found) {
1764 op->insert_collision = true;
1765 return true; 1902 return true;
1766 } else if (sectors_found < KEY_SIZE(insert)) { 1903 } else if (sectors_found < KEY_SIZE(insert)) {
1767 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1904 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
@@ -1774,7 +1911,7 @@ check_failed:
1774} 1911}
1775 1912
1776static bool btree_insert_key(struct btree *b, struct btree_op *op, 1913static bool btree_insert_key(struct btree *b, struct btree_op *op,
1777 struct bkey *k) 1914 struct bkey *k, struct bkey *replace_key)
1778{ 1915{
1779 struct bset *i = b->sets[b->nsets].data; 1916 struct bset *i = b->sets[b->nsets].data;
1780 struct bkey *m, *prev; 1917 struct bkey *m, *prev;
@@ -1786,22 +1923,23 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1786 1923
1787 if (!b->level) { 1924 if (!b->level) {
1788 struct btree_iter iter; 1925 struct btree_iter iter;
1789 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1790 1926
1791 /* 1927 /*
1792 * bset_search() returns the first key that is strictly greater 1928 * bset_search() returns the first key that is strictly greater
1793 * than the search key - but for back merging, we want to find 1929 * than the search key - but for back merging, we want to find
1794 * the first key that is greater than or equal to KEY_START(k) - 1930 * the previous key.
1795 * unless KEY_START(k) is 0.
1796 */ 1931 */
1797 if (KEY_OFFSET(&search))
1798 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1799
1800 prev = NULL; 1932 prev = NULL;
1801 m = bch_btree_iter_init(b, &iter, &search); 1933 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1802 1934
1803 if (fix_overlapping_extents(b, k, &iter, op)) 1935 if (fix_overlapping_extents(b, k, &iter, replace_key)) {
1936 op->insert_collision = true;
1804 return false; 1937 return false;
1938 }
1939
1940 if (KEY_DIRTY(k))
1941 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1942 KEY_START(k), KEY_SIZE(k));
1805 1943
1806 while (m != end(i) && 1944 while (m != end(i) &&
1807 bkey_cmp(k, &START_KEY(m)) > 0) 1945 bkey_cmp(k, &START_KEY(m)) > 0)
@@ -1825,84 +1963,80 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1825 if (m != end(i) && 1963 if (m != end(i) &&
1826 bch_bkey_try_merge(b, k, m)) 1964 bch_bkey_try_merge(b, k, m))
1827 goto copy; 1965 goto copy;
1828 } else 1966 } else {
1967 BUG_ON(replace_key);
1829 m = bch_bset_search(b, &b->sets[b->nsets], k); 1968 m = bch_bset_search(b, &b->sets[b->nsets], k);
1969 }
1830 1970
1831insert: shift_keys(b, m, k); 1971insert: shift_keys(b, m, k);
1832copy: bkey_copy(m, k); 1972copy: bkey_copy(m, k);
1833merged: 1973merged:
1834 if (KEY_DIRTY(k)) 1974 bch_check_keys(b, "%u for %s", status,
1835 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1975 replace_key ? "replace" : "insert");
1836 KEY_START(k), KEY_SIZE(k));
1837
1838 bch_check_keys(b, "%u for %s", status, op_type(op));
1839 1976
1840 if (b->level && !KEY_OFFSET(k)) 1977 if (b->level && !KEY_OFFSET(k))
1841 btree_current_write(b)->prio_blocked++; 1978 btree_current_write(b)->prio_blocked++;
1842 1979
1843 trace_bcache_btree_insert_key(b, k, op->type, status); 1980 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
1844 1981
1845 return true; 1982 return true;
1846} 1983}
1847 1984
1848static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1985static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1986 struct keylist *insert_keys,
1987 struct bkey *replace_key)
1849{ 1988{
1850 bool ret = false; 1989 bool ret = false;
1851 struct bkey *k; 1990 int oldsize = bch_count_data(b);
1852 unsigned oldsize = bch_count_data(b);
1853
1854 while ((k = bch_keylist_pop(&op->keys))) {
1855 bkey_put(b->c, k, b->level);
1856 ret |= btree_insert_key(b, op, k);
1857 }
1858
1859 BUG_ON(bch_count_data(b) < oldsize);
1860 return ret;
1861}
1862 1991
1863bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1992 while (!bch_keylist_empty(insert_keys)) {
1864 struct bio *bio) 1993 struct bset *i = write_block(b);
1865{ 1994 struct bkey *k = insert_keys->keys;
1866 bool ret = false;
1867 uint64_t btree_ptr = b->key.ptr[0];
1868 unsigned long seq = b->seq;
1869 BKEY_PADDED(k) tmp;
1870 1995
1871 rw_unlock(false, b); 1996 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
1872 rw_lock(true, b, b->level); 1997 > btree_blocks(b))
1998 break;
1873 1999
1874 if (b->key.ptr[0] != btree_ptr || 2000 if (bkey_cmp(k, &b->key) <= 0) {
1875 b->seq != seq + 1 || 2001 if (!b->level)
1876 should_split(b)) 2002 bkey_put(b->c, k);
1877 goto out;
1878 2003
1879 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); 2004 ret |= btree_insert_key(b, op, k, replace_key);
2005 bch_keylist_pop_front(insert_keys);
2006 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2007 BKEY_PADDED(key) temp;
2008 bkey_copy(&temp.key, insert_keys->keys);
1880 2009
1881 SET_KEY_PTRS(&op->replace, 1); 2010 bch_cut_back(&b->key, &temp.key);
1882 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 2011 bch_cut_front(&b->key, insert_keys->keys);
1883 2012
1884 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 2013 ret |= btree_insert_key(b, op, &temp.key, replace_key);
2014 break;
2015 } else {
2016 break;
2017 }
2018 }
1885 2019
1886 bkey_copy(&tmp.k, &op->replace); 2020 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
1887 2021
1888 BUG_ON(op->type != BTREE_INSERT); 2022 BUG_ON(bch_count_data(b) < oldsize);
1889 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1890 ret = true;
1891out:
1892 downgrade_write(&b->lock);
1893 return ret; 2023 return ret;
1894} 2024}
1895 2025
1896static int btree_split(struct btree *b, struct btree_op *op) 2026static int btree_split(struct btree *b, struct btree_op *op,
2027 struct keylist *insert_keys,
2028 struct bkey *replace_key)
1897{ 2029{
1898 bool split, root = b == b->c->root; 2030 bool split;
1899 struct btree *n1, *n2 = NULL, *n3 = NULL; 2031 struct btree *n1, *n2 = NULL, *n3 = NULL;
1900 uint64_t start_time = local_clock(); 2032 uint64_t start_time = local_clock();
2033 struct closure cl;
2034 struct keylist parent_keys;
1901 2035
1902 if (b->level) 2036 closure_init_stack(&cl);
1903 set_closure_blocking(&op->cl); 2037 bch_keylist_init(&parent_keys);
1904 2038
1905 n1 = btree_node_alloc_replacement(b, &op->cl); 2039 n1 = btree_node_alloc_replacement(b, true);
1906 if (IS_ERR(n1)) 2040 if (IS_ERR(n1))
1907 goto err; 2041 goto err;
1908 2042
@@ -1913,19 +2047,20 @@ static int btree_split(struct btree *b, struct btree_op *op)
1913 2047
1914 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 2048 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1915 2049
1916 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 2050 n2 = bch_btree_node_alloc(b->c, b->level, true);
1917 if (IS_ERR(n2)) 2051 if (IS_ERR(n2))
1918 goto err_free1; 2052 goto err_free1;
1919 2053
1920 if (root) { 2054 if (!b->parent) {
1921 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 2055 n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
1922 if (IS_ERR(n3)) 2056 if (IS_ERR(n3))
1923 goto err_free2; 2057 goto err_free2;
1924 } 2058 }
1925 2059
1926 bch_btree_insert_keys(n1, op); 2060 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1927 2061
1928 /* Has to be a linear search because we don't have an auxiliary 2062 /*
2063 * Has to be a linear search because we don't have an auxiliary
1929 * search tree yet 2064 * search tree yet
1930 */ 2065 */
1931 2066
@@ -1944,60 +2079,57 @@ static int btree_split(struct btree *b, struct btree_op *op)
1944 2079
1945 bkey_copy_key(&n2->key, &b->key); 2080 bkey_copy_key(&n2->key, &b->key);
1946 2081
1947 bch_keylist_add(&op->keys, &n2->key); 2082 bch_keylist_add(&parent_keys, &n2->key);
1948 bch_btree_node_write(n2, &op->cl); 2083 bch_btree_node_write(n2, &cl);
1949 rw_unlock(true, n2); 2084 rw_unlock(true, n2);
1950 } else { 2085 } else {
1951 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 2086 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1952 2087
1953 bch_btree_insert_keys(n1, op); 2088 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1954 } 2089 }
1955 2090
1956 bch_keylist_add(&op->keys, &n1->key); 2091 bch_keylist_add(&parent_keys, &n1->key);
1957 bch_btree_node_write(n1, &op->cl); 2092 bch_btree_node_write(n1, &cl);
1958 2093
1959 if (n3) { 2094 if (n3) {
2095 /* Depth increases, make a new root */
1960 bkey_copy_key(&n3->key, &MAX_KEY); 2096 bkey_copy_key(&n3->key, &MAX_KEY);
1961 bch_btree_insert_keys(n3, op); 2097 bch_btree_insert_keys(n3, op, &parent_keys, NULL);
1962 bch_btree_node_write(n3, &op->cl); 2098 bch_btree_node_write(n3, &cl);
1963 2099
1964 closure_sync(&op->cl); 2100 closure_sync(&cl);
1965 bch_btree_set_root(n3); 2101 bch_btree_set_root(n3);
1966 rw_unlock(true, n3); 2102 rw_unlock(true, n3);
1967 } else if (root) {
1968 op->keys.top = op->keys.bottom;
1969 closure_sync(&op->cl);
1970 bch_btree_set_root(n1);
1971 } else {
1972 unsigned i;
1973 2103
1974 bkey_copy(op->keys.top, &b->key); 2104 btree_node_free(b);
1975 bkey_copy_key(op->keys.top, &ZERO_KEY); 2105 } else if (!b->parent) {
2106 /* Root filled up but didn't need to be split */
2107 closure_sync(&cl);
2108 bch_btree_set_root(n1);
1976 2109
1977 for (i = 0; i < KEY_PTRS(&b->key); i++) { 2110 btree_node_free(b);
1978 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 2111 } else {
2112 /* Split a non root node */
2113 closure_sync(&cl);
2114 make_btree_freeing_key(b, parent_keys.top);
2115 bch_keylist_push(&parent_keys);
1979 2116
1980 SET_PTR_GEN(op->keys.top, i, g); 2117 btree_node_free(b);
1981 }
1982 2118
1983 bch_keylist_push(&op->keys); 2119 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
1984 closure_sync(&op->cl); 2120 BUG_ON(!bch_keylist_empty(&parent_keys));
1985 atomic_inc(&b->c->prio_blocked);
1986 } 2121 }
1987 2122
1988 rw_unlock(true, n1); 2123 rw_unlock(true, n1);
1989 btree_node_free(b, op);
1990 2124
1991 bch_time_stats_update(&b->c->btree_split_time, start_time); 2125 bch_time_stats_update(&b->c->btree_split_time, start_time);
1992 2126
1993 return 0; 2127 return 0;
1994err_free2: 2128err_free2:
1995 __bkey_put(n2->c, &n2->key); 2129 btree_node_free(n2);
1996 btree_node_free(n2, op);
1997 rw_unlock(true, n2); 2130 rw_unlock(true, n2);
1998err_free1: 2131err_free1:
1999 __bkey_put(n1->c, &n1->key); 2132 btree_node_free(n1);
2000 btree_node_free(n1, op);
2001 rw_unlock(true, n1); 2133 rw_unlock(true, n1);
2002err: 2134err:
2003 if (n3 == ERR_PTR(-EAGAIN) || 2135 if (n3 == ERR_PTR(-EAGAIN) ||
@@ -2009,116 +2141,126 @@ err:
2009 return -ENOMEM; 2141 return -ENOMEM;
2010} 2142}
2011 2143
2012static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2144static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2013 struct keylist *stack_keys) 2145 struct keylist *insert_keys,
2146 atomic_t *journal_ref,
2147 struct bkey *replace_key)
2014{ 2148{
2015 if (b->level) { 2149 BUG_ON(b->level && replace_key);
2016 int ret;
2017 struct bkey *insert = op->keys.bottom;
2018 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2019
2020 if (!k) {
2021 btree_bug(b, "no key to recurse on at level %i/%i",
2022 b->level, b->c->root->level);
2023 2150
2024 op->keys.top = op->keys.bottom; 2151 if (should_split(b)) {
2025 return -EIO; 2152 if (current->bio_list) {
2153 op->lock = b->c->root->level + 1;
2154 return -EAGAIN;
2155 } else if (op->lock <= b->c->root->level) {
2156 op->lock = b->c->root->level + 1;
2157 return -EINTR;
2158 } else {
2159 /* Invalidated all iterators */
2160 return btree_split(b, op, insert_keys, replace_key) ?:
2161 -EINTR;
2026 } 2162 }
2163 } else {
2164 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2027 2165
2028 if (bkey_cmp(insert, k) > 0) { 2166 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2029 unsigned i; 2167 if (!b->level)
2030 2168 bch_btree_leaf_dirty(b, journal_ref);
2031 if (op->type == BTREE_REPLACE) { 2169 else
2032 __bkey_put(b->c, insert); 2170 bch_btree_node_write_sync(b);
2033 op->keys.top = op->keys.bottom; 2171 }
2034 op->insert_collision = true;
2035 return 0;
2036 }
2037 2172
2038 for (i = 0; i < KEY_PTRS(insert); i++) 2173 return 0;
2039 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2174 }
2175}
2040 2176
2041 bkey_copy(stack_keys->top, insert); 2177int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2178 struct bkey *check_key)
2179{
2180 int ret = -EINTR;
2181 uint64_t btree_ptr = b->key.ptr[0];
2182 unsigned long seq = b->seq;
2183 struct keylist insert;
2184 bool upgrade = op->lock == -1;
2042 2185
2043 bch_cut_back(k, insert); 2186 bch_keylist_init(&insert);
2044 bch_cut_front(k, stack_keys->top);
2045 2187
2046 bch_keylist_push(stack_keys); 2188 if (upgrade) {
2047 } 2189 rw_unlock(false, b);
2190 rw_lock(true, b, b->level);
2048 2191
2049 ret = btree(insert_recurse, k, b, op, stack_keys); 2192 if (b->key.ptr[0] != btree_ptr ||
2050 if (ret) 2193 b->seq != seq + 1)
2051 return ret; 2194 goto out;
2052 } 2195 }
2053 2196
2054 if (!bch_keylist_empty(&op->keys)) { 2197 SET_KEY_PTRS(check_key, 1);
2055 if (should_split(b)) { 2198 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2056 if (op->lock <= b->c->root->level) {
2057 BUG_ON(b->level);
2058 op->lock = b->c->root->level + 1;
2059 return -EINTR;
2060 }
2061 return btree_split(b, op);
2062 }
2063 2199
2064 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2200 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2065 2201
2066 if (bch_btree_insert_keys(b, op)) { 2202 bch_keylist_add(&insert, check_key);
2067 if (!b->level)
2068 bch_btree_leaf_dirty(b, op);
2069 else
2070 bch_btree_node_write(b, &op->cl);
2071 }
2072 }
2073 2203
2074 return 0; 2204 ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2205
2206 BUG_ON(!ret && !bch_keylist_empty(&insert));
2207out:
2208 if (upgrade)
2209 downgrade_write(&b->lock);
2210 return ret;
2075} 2211}
2076 2212
2077int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2213struct btree_insert_op {
2214 struct btree_op op;
2215 struct keylist *keys;
2216 atomic_t *journal_ref;
2217 struct bkey *replace_key;
2218};
2219
2220int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2078{ 2221{
2079 int ret = 0; 2222 struct btree_insert_op *op = container_of(b_op,
2080 struct keylist stack_keys; 2223 struct btree_insert_op, op);
2081 2224
2082 /* 2225 int ret = bch_btree_insert_node(b, &op->op, op->keys,
2083 * Don't want to block with the btree locked unless we have to, 2226 op->journal_ref, op->replace_key);
2084 * otherwise we get deadlocks with try_harder and between split/gc 2227 if (ret && !bch_keylist_empty(op->keys))
2085 */ 2228 return ret;
2086 clear_closure_blocking(&op->cl); 2229 else
2087 2230 return MAP_DONE;
2088 BUG_ON(bch_keylist_empty(&op->keys)); 2231}
2089 bch_keylist_copy(&stack_keys, &op->keys);
2090 bch_keylist_init(&op->keys);
2091
2092 while (!bch_keylist_empty(&stack_keys) ||
2093 !bch_keylist_empty(&op->keys)) {
2094 if (bch_keylist_empty(&op->keys)) {
2095 bch_keylist_add(&op->keys,
2096 bch_keylist_pop(&stack_keys));
2097 op->lock = 0;
2098 }
2099 2232
2100 ret = btree_root(insert_recurse, c, op, &stack_keys); 2233int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2234 atomic_t *journal_ref, struct bkey *replace_key)
2235{
2236 struct btree_insert_op op;
2237 int ret = 0;
2101 2238
2102 if (ret == -EAGAIN) { 2239 BUG_ON(current->bio_list);
2103 ret = 0; 2240 BUG_ON(bch_keylist_empty(keys));
2104 closure_sync(&op->cl); 2241
2105 } else if (ret) { 2242 bch_btree_op_init(&op.op, 0);
2106 struct bkey *k; 2243 op.keys = keys;
2244 op.journal_ref = journal_ref;
2245 op.replace_key = replace_key;
2246
2247 while (!ret && !bch_keylist_empty(keys)) {
2248 op.op.lock = 0;
2249 ret = bch_btree_map_leaf_nodes(&op.op, c,
2250 &START_KEY(keys->keys),
2251 btree_insert_fn);
2252 }
2107 2253
2108 pr_err("error %i trying to insert key for %s", 2254 if (ret) {
2109 ret, op_type(op)); 2255 struct bkey *k;
2110 2256
2111 while ((k = bch_keylist_pop(&stack_keys) ?: 2257 pr_err("error %i", ret);
2112 bch_keylist_pop(&op->keys)))
2113 bkey_put(c, k, 0);
2114 }
2115 }
2116 2258
2117 bch_keylist_free(&stack_keys); 2259 while ((k = bch_keylist_pop(keys)))
2260 bkey_put(c, k);
2261 } else if (op.op.insert_collision)
2262 ret = -ESRCH;
2118 2263
2119 if (op->journal)
2120 atomic_dec_bug(op->journal);
2121 op->journal = NULL;
2122 return ret; 2264 return ret;
2123} 2265}
2124 2266
@@ -2141,132 +2283,81 @@ void bch_btree_set_root(struct btree *b)
2141 mutex_unlock(&b->c->bucket_lock); 2283 mutex_unlock(&b->c->bucket_lock);
2142 2284
2143 b->c->root = b; 2285 b->c->root = b;
2144 __bkey_put(b->c, &b->key);
2145 2286
2146 bch_journal_meta(b->c, &cl); 2287 bch_journal_meta(b->c, &cl);
2147 closure_sync(&cl); 2288 closure_sync(&cl);
2148} 2289}
2149 2290
2150/* Cache lookup */ 2291/* Map across nodes or keys */
2151 2292
2152static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2293static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2153 struct bkey *k) 2294 struct bkey *from,
2295 btree_map_nodes_fn *fn, int flags)
2154{ 2296{
2155 struct search *s = container_of(op, struct search, op); 2297 int ret = MAP_CONTINUE;
2156 struct bio *bio = &s->bio.bio; 2298
2157 int ret = 0; 2299 if (b->level) {
2300 struct bkey *k;
2301 struct btree_iter iter;
2158 2302
2159 while (!ret && 2303 bch_btree_iter_init(b, &iter, from);
2160 !op->lookup_done) {
2161 unsigned sectors = INT_MAX;
2162 2304
2163 if (KEY_INODE(k) == op->inode) { 2305 while ((k = bch_btree_iter_next_filter(&iter, b,
2164 if (KEY_START(k) <= bio->bi_sector) 2306 bch_ptr_bad))) {
2165 break; 2307 ret = btree(map_nodes_recurse, k, b,
2308 op, from, fn, flags);
2309 from = NULL;
2166 2310
2167 sectors = min_t(uint64_t, sectors, 2311 if (ret != MAP_CONTINUE)
2168 KEY_START(k) - bio->bi_sector); 2312 return ret;
2169 } 2313 }
2170
2171 ret = s->d->cache_miss(b, s, bio, sectors);
2172 } 2314 }
2173 2315
2316 if (!b->level || flags == MAP_ALL_NODES)
2317 ret = fn(op, b);
2318
2174 return ret; 2319 return ret;
2175} 2320}
2176 2321
2177/* 2322int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2178 * Read from a single key, handling the initial cache miss if the key starts in 2323 struct bkey *from, btree_map_nodes_fn *fn, int flags)
2179 * the middle of the bio
2180 */
2181static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2182 struct bkey *k)
2183{ 2324{
2184 struct search *s = container_of(op, struct search, op); 2325 return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2185 struct bio *bio = &s->bio.bio;
2186 unsigned ptr;
2187 struct bio *n;
2188
2189 int ret = submit_partial_cache_miss(b, op, k);
2190 if (ret || op->lookup_done)
2191 return ret;
2192
2193 /* XXX: figure out best pointer - for multiple cache devices */
2194 ptr = 0;
2195
2196 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2197
2198 while (!op->lookup_done &&
2199 KEY_INODE(k) == op->inode &&
2200 bio->bi_sector < KEY_OFFSET(k)) {
2201 struct bkey *bio_key;
2202 sector_t sector = PTR_OFFSET(k, ptr) +
2203 (bio->bi_sector - KEY_START(k));
2204 unsigned sectors = min_t(uint64_t, INT_MAX,
2205 KEY_OFFSET(k) - bio->bi_sector);
2206
2207 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2208 if (n == bio)
2209 op->lookup_done = true;
2210
2211 bio_key = &container_of(n, struct bbio, bio)->key;
2212
2213 /*
2214 * The bucket we're reading from might be reused while our bio
2215 * is in flight, and we could then end up reading the wrong
2216 * data.
2217 *
2218 * We guard against this by checking (in cache_read_endio()) if
2219 * the pointer is stale again; if so, we treat it as an error
2220 * and reread from the backing device (but we don't pass that
2221 * error up anywhere).
2222 */
2223
2224 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2225 SET_PTR_OFFSET(bio_key, 0, sector);
2226
2227 n->bi_end_io = bch_cache_read_endio;
2228 n->bi_private = &s->cl;
2229
2230 __bch_submit_bbio(n, b->c);
2231 }
2232
2233 return 0;
2234} 2326}
2235 2327
2236int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2328static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2329 struct bkey *from, btree_map_keys_fn *fn,
2330 int flags)
2237{ 2331{
2238 struct search *s = container_of(op, struct search, op); 2332 int ret = MAP_CONTINUE;
2239 struct bio *bio = &s->bio.bio;
2240
2241 int ret = 0;
2242 struct bkey *k; 2333 struct bkey *k;
2243 struct btree_iter iter; 2334 struct btree_iter iter;
2244 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2245 2335
2246 do { 2336 bch_btree_iter_init(b, &iter, from);
2247 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2248 if (!k) {
2249 /*
2250 * b->key would be exactly what we want, except that
2251 * pointers to btree nodes have nonzero size - we
2252 * wouldn't go far enough
2253 */
2254 2337
2255 ret = submit_partial_cache_miss(b, op, 2338 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
2256 &KEY(KEY_INODE(&b->key), 2339 ret = !b->level
2257 KEY_OFFSET(&b->key), 0)); 2340 ? fn(op, b, k)
2258 break; 2341 : btree(map_keys_recurse, k, b, op, from, fn, flags);
2259 } 2342 from = NULL;
2343
2344 if (ret != MAP_CONTINUE)
2345 return ret;
2346 }
2260 2347
2261 ret = b->level 2348 if (!b->level && (flags & MAP_END_KEY))
2262 ? btree(search_recurse, k, b, op) 2349 ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2263 : submit_partial_cache_hit(b, op, k); 2350 KEY_OFFSET(&b->key), 0));
2264 } while (!ret &&
2265 !op->lookup_done);
2266 2351
2267 return ret; 2352 return ret;
2268} 2353}
2269 2354
2355int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2356 struct bkey *from, btree_map_keys_fn *fn, int flags)
2357{
2358 return btree_root(map_keys_recurse, c, op, from, fn, flags);
2359}
2360
2270/* Keybuf code */ 2361/* Keybuf code */
2271 2362
2272static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 2363static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
@@ -2285,80 +2376,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2285 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2376 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2286} 2377}
2287 2378
2288static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2379struct refill {
2289 struct keybuf *buf, struct bkey *end, 2380 struct btree_op op;
2290 keybuf_pred_fn *pred) 2381 unsigned nr_found;
2291{ 2382 struct keybuf *buf;
2292 struct btree_iter iter; 2383 struct bkey *end;
2293 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2384 keybuf_pred_fn *pred;
2294 2385};
2295 while (!array_freelist_empty(&buf->freelist)) {
2296 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2297 bch_ptr_bad);
2298
2299 if (!b->level) {
2300 if (!k) {
2301 buf->last_scanned = b->key;
2302 break;
2303 }
2304 2386
2305 buf->last_scanned = *k; 2387static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2306 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2388 struct bkey *k)
2307 break; 2389{
2390 struct refill *refill = container_of(op, struct refill, op);
2391 struct keybuf *buf = refill->buf;
2392 int ret = MAP_CONTINUE;
2308 2393
2309 if (pred(buf, k)) { 2394 if (bkey_cmp(k, refill->end) >= 0) {
2310 struct keybuf_key *w; 2395 ret = MAP_DONE;
2396 goto out;
2397 }
2311 2398
2312 spin_lock(&buf->lock); 2399 if (!KEY_SIZE(k)) /* end key */
2400 goto out;
2313 2401
2314 w = array_alloc(&buf->freelist); 2402 if (refill->pred(buf, k)) {
2403 struct keybuf_key *w;
2315 2404
2316 w->private = NULL; 2405 spin_lock(&buf->lock);
2317 bkey_copy(&w->key, k);
2318 2406
2319 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2407 w = array_alloc(&buf->freelist);
2320 array_free(&buf->freelist, w); 2408 if (!w) {
2409 spin_unlock(&buf->lock);
2410 return MAP_DONE;
2411 }
2321 2412
2322 spin_unlock(&buf->lock); 2413 w->private = NULL;
2323 } 2414 bkey_copy(&w->key, k);
2324 } else {
2325 if (!k)
2326 break;
2327 2415
2328 btree(refill_keybuf, k, b, op, buf, end, pred); 2416 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2329 /* 2417 array_free(&buf->freelist, w);
2330 * Might get an error here, but can't really do anything 2418 else
2331 * and it'll get logged elsewhere. Just read what we 2419 refill->nr_found++;
2332 * can.
2333 */
2334 2420
2335 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2421 if (array_freelist_empty(&buf->freelist))
2336 break; 2422 ret = MAP_DONE;
2337 2423
2338 cond_resched(); 2424 spin_unlock(&buf->lock);
2339 }
2340 } 2425 }
2341 2426out:
2342 return 0; 2427 buf->last_scanned = *k;
2428 return ret;
2343} 2429}
2344 2430
2345void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2431void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2346 struct bkey *end, keybuf_pred_fn *pred) 2432 struct bkey *end, keybuf_pred_fn *pred)
2347{ 2433{
2348 struct bkey start = buf->last_scanned; 2434 struct bkey start = buf->last_scanned;
2349 struct btree_op op; 2435 struct refill refill;
2350 bch_btree_op_init_stack(&op);
2351 2436
2352 cond_resched(); 2437 cond_resched();
2353 2438
2354 btree_root(refill_keybuf, c, &op, buf, end, pred); 2439 bch_btree_op_init(&refill.op, -1);
2355 closure_sync(&op.cl); 2440 refill.nr_found = 0;
2441 refill.buf = buf;
2442 refill.end = end;
2443 refill.pred = pred;
2444
2445 bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2446 refill_keybuf_fn, MAP_END_KEY);
2356 2447
2357 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2448 trace_bcache_keyscan(refill.nr_found,
2358 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2449 KEY_INODE(&start), KEY_OFFSET(&start),
2359 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2450 KEY_INODE(&buf->last_scanned),
2360 KEY_INODE(&start), KEY_OFFSET(&start), 2451 KEY_OFFSET(&buf->last_scanned));
2361 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2362 2452
2363 spin_lock(&buf->lock); 2453 spin_lock(&buf->lock);
2364 2454
@@ -2436,9 +2526,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2436} 2526}
2437 2527
2438struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2528struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2439 struct keybuf *buf, 2529 struct keybuf *buf,
2440 struct bkey *end, 2530 struct bkey *end,
2441 keybuf_pred_fn *pred) 2531 keybuf_pred_fn *pred)
2442{ 2532{
2443 struct keybuf_key *ret; 2533 struct keybuf_key *ret;
2444 2534
@@ -2471,14 +2561,12 @@ void bch_btree_exit(void)
2471{ 2561{
2472 if (btree_io_wq) 2562 if (btree_io_wq)
2473 destroy_workqueue(btree_io_wq); 2563 destroy_workqueue(btree_io_wq);
2474 if (bch_gc_wq)
2475 destroy_workqueue(bch_gc_wq);
2476} 2564}
2477 2565
2478int __init bch_btree_init(void) 2566int __init bch_btree_init(void)
2479{ 2567{
2480 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2568 btree_io_wq = create_singlethread_workqueue("bch_btree_io");
2481 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2569 if (!btree_io_wq)
2482 return -ENOMEM; 2570 return -ENOMEM;
2483 2571
2484 return 0; 2572 return 0;
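
For readers following the interface change in btree.c: after this patch bch_btree_insert() no longer pulls keys out of a per-operation struct btree_op. The caller passes a keylist, an optional journal ref and an optional replace_key directly, and the insert is driven through bch_btree_map_leaf_nodes()/btree_insert_fn(). A minimal caller sketch, assuming only the post-patch signatures visible in the hunks above; example_insert_one_key() and its error handling are illustrative, not code from the patch:

static int example_insert_one_key(struct cache_set *c, struct bkey *example_key)
{
	struct keylist keys;
	int ret;

	bch_keylist_init(&keys);
	bch_keylist_add(&keys, example_key);	/* append the key to the list */

	/* NULL journal_ref and NULL replace_key: a plain insert, not a replace */
	ret = bch_btree_insert(c, &keys, NULL, NULL);

	/* -ESRCH is only returned when a replace_key was given and didn't match */
	if (ret)
		pr_err("example insert failed: %i", ret);

	return ret;
}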
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3333d3723633..767e75570896 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -125,6 +125,7 @@ struct btree {
125 unsigned long seq; 125 unsigned long seq;
126 struct rw_semaphore lock; 126 struct rw_semaphore lock;
127 struct cache_set *c; 127 struct cache_set *c;
128 struct btree *parent;
128 129
129 unsigned long flags; 130 unsigned long flags;
130 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
@@ -200,12 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k)
200 201
201static inline void set_gc_sectors(struct cache_set *c) 202static inline void set_gc_sectors(struct cache_set *c)
202{ 203{
203 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); 204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
204}
205
206static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
207{
208 return __bch_ptr_invalid(b->c, b->level, k);
209} 205}
210 206
211static inline struct bkey *bch_btree_iter_init(struct btree *b, 207static inline struct bkey *bch_btree_iter_init(struct btree *b,
@@ -215,6 +211,16 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
215 return __bch_btree_iter_init(b, iter, search, b->sets); 211 return __bch_btree_iter_init(b, iter, search, b->sets);
216} 212}
217 213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k);
223
218/* Looping macros */ 224/* Looping macros */
219 225
220#define for_each_cached_btree(b, c, iter) \ 226#define for_each_cached_btree(b, c, iter) \
@@ -234,51 +240,17 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
234/* Recursing down the btree */ 240/* Recursing down the btree */
235 241
236struct btree_op { 242struct btree_op {
237 struct closure cl;
238 struct cache_set *c;
239
240 /* Journal entry we have a refcount on */
241 atomic_t *journal;
242
243 /* Bio to be inserted into the cache */
244 struct bio *cache_bio;
245
246 unsigned inode;
247
248 uint16_t write_prio;
249
250 /* Btree level at which we start taking write locks */ 243 /* Btree level at which we start taking write locks */
251 short lock; 244 short lock;
252 245
253 /* Btree insertion type */
254 enum {
255 BTREE_INSERT,
256 BTREE_REPLACE
257 } type:8;
258
259 unsigned csum:1;
260 unsigned skip:1;
261 unsigned flush_journal:1;
262
263 unsigned insert_data_done:1;
264 unsigned lookup_done:1;
265 unsigned insert_collision:1; 246 unsigned insert_collision:1;
266
267 /* Anything after this point won't get zeroed in do_bio_hook() */
268
269 /* Keys to be inserted */
270 struct keylist keys;
271 BKEY_PADDED(replace);
272}; 247};
273 248
274enum { 249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
275 BTREE_INSERT_STATUS_INSERT, 250{
276 BTREE_INSERT_STATUS_BACK_MERGE, 251 memset(op, 0, sizeof(struct btree_op));
277 BTREE_INSERT_STATUS_OVERWROTE, 252 op->lock = write_lock_level;
278 BTREE_INSERT_STATUS_FRONT_MERGE, 253}
279};
280
281void bch_btree_op_init_stack(struct btree_op *);
282 254
283static inline void rw_lock(bool w, struct btree *b, int level) 255static inline void rw_lock(bool w, struct btree *b, int level)
284{ 256{
@@ -290,108 +262,71 @@ static inline void rw_lock(bool w, struct btree *b, int level)
290 262
291static inline void rw_unlock(bool w, struct btree *b) 263static inline void rw_unlock(bool w, struct btree *b)
292{ 264{
293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i;
295
296 if (w && b->key.ptr[0])
297 for (i = 0; i <= b->nsets; i++)
298 bch_check_key_order(b, b->sets[i].data);
299#endif
300
301 if (w) 265 if (w)
302 b->seq++; 266 b->seq++;
303 (w ? up_write : up_read)(&b->lock); 267 (w ? up_write : up_read)(&b->lock);
304} 268}
305 269
306#define insert_lock(s, b) ((b)->level <= (s)->lock) 270void bch_btree_node_read(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *);
307 272
308/* 273void bch_btree_set_root(struct btree *);
309 * These macros are for recursing down the btree - they handle the details of 274struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
310 * locking and looking up nodes in the cache for you. They're best treated as 275struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
311 * mere syntax when reading code that uses them.
312 *
313 * op->lock determines whether we take a read or a write lock at a given depth.
314 * If you've got a read lock and find that you need a write lock (i.e. you're
315 * going to have to split), set op->lock and return -EINTR; btree_root() will
316 * call you again and you'll have the correct lock.
317 */
318 276
319/** 277int bch_btree_insert_check_key(struct btree *, struct btree_op *,
320 * btree - recurse down the btree on a specified key 278 struct bkey *);
321 * @fn: function to call, which will be passed the child node 279int bch_btree_insert(struct cache_set *, struct keylist *,
322 * @key: key to recurse on 280 atomic_t *, struct bkey *);
323 * @b: parent btree node 281
324 * @op: pointer to struct btree_op 282int bch_gc_thread_start(struct cache_set *);
325 */ 283size_t bch_btree_gc_finish(struct cache_set *);
326#define btree(fn, key, b, op, ...) \ 284void bch_moving_gc(struct cache_set *);
327({ \ 285int bch_btree_check(struct cache_set *);
328 int _r, l = (b)->level - 1; \ 286uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
329 bool _w = l <= (op)->lock; \
330 struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \
331 if (!IS_ERR(_b)) { \
332 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
333 rw_unlock(_w, _b); \
334 } else \
335 _r = PTR_ERR(_b); \
336 _r; \
337})
338
339/**
340 * btree_root - call a function on the root of the btree
341 * @fn: function to call, which will be passed the child node
342 * @c: cache set
343 * @op: pointer to struct btree_op
344 */
345#define btree_root(fn, c, op, ...) \
346({ \
347 int _r = -EINTR; \
348 do { \
349 struct btree *_b = (c)->root; \
350 bool _w = insert_lock(op, _b); \
351 rw_lock(_w, _b, _b->level); \
352 if (_b == (c)->root && \
353 _w == insert_lock(op, _b)) \
354 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
355 rw_unlock(_w, _b); \
356 bch_cannibalize_unlock(c, &(op)->cl); \
357 } while (_r == -EINTR); \
358 \
359 _r; \
360})
361 287
362static inline bool should_split(struct btree *b) 288static inline void wake_up_gc(struct cache_set *c)
363{ 289{
364 struct bset *i = write_block(b); 290 if (c->gc_thread)
365 return b->written >= btree_blocks(b) || 291 wake_up_process(c->gc_thread);
366 (i->seq == b->sets[0].data->seq &&
367 b->written + __set_blocks(i, i->keys + 15, b->c)
368 > btree_blocks(b));
369} 292}
370 293
371void bch_btree_node_read(struct btree *); 294#define MAP_DONE 0
372void bch_btree_node_write(struct btree *, struct closure *); 295#define MAP_CONTINUE 1
373 296
374void bch_cannibalize_unlock(struct cache_set *, struct closure *); 297#define MAP_ALL_NODES 0
375void bch_btree_set_root(struct btree *); 298#define MAP_LEAF_NODES 1
376struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
377struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
378 int, struct btree_op *);
379 299
380bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 300#define MAP_END_KEY 1
381 struct bio *);
382int bch_btree_insert(struct btree_op *, struct cache_set *);
383 301
384int bch_btree_search_recurse(struct btree *, struct btree_op *); 302typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
303int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
304 struct bkey *, btree_map_nodes_fn *, int);
385 305
386void bch_queue_gc(struct cache_set *); 306static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
387size_t bch_btree_gc_finish(struct cache_set *); 307 struct bkey *from, btree_map_nodes_fn *fn)
388void bch_moving_gc(struct closure *); 308{
389int bch_btree_check(struct cache_set *, struct btree_op *); 309 return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
390uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 310}
311
312static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
313 struct cache_set *c,
314 struct bkey *from,
315 btree_map_nodes_fn *fn)
316{
317 return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
318}
319
320typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
321 struct bkey *);
322int bch_btree_map_keys(struct btree_op *, struct cache_set *,
323 struct bkey *, btree_map_keys_fn *, int);
324
325typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
391 326
392void bch_keybuf_init(struct keybuf *); 327void bch_keybuf_init(struct keybuf *);
393void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, 328void bch_refill_keybuf(struct cache_set *, struct keybuf *,
394 keybuf_pred_fn *); 329 struct bkey *, keybuf_pred_fn *);
395bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 330bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
396 struct bkey *); 331 struct bkey *);
397void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 332void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
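
The header changes above replace the documented btree()/btree_root() recursion macros with a small map interface (MAP_DONE/MAP_CONTINUE, bch_btree_map_nodes()/bch_btree_map_keys()). A sketch of what a new bch_btree_map_keys() user might look like, modelled on refill_keybuf_fn() in the btree.c hunk earlier; struct count_op, count_keys_fn() and count_extent_keys() are made-up names, only the declarations and the embedded-btree_op pattern come from the patch:

struct count_op {
	struct btree_op	op;	/* embedded, like struct refill and struct btree_insert_op */
	uint64_t	nr_keys;
};

static int count_keys_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
{
	struct count_op *c_op = container_of(b_op, struct count_op, op);

	c_op->nr_keys++;
	return MAP_CONTINUE;	/* keep iterating; MAP_DONE would stop the walk */
}

static uint64_t count_extent_keys(struct cache_set *c)
{
	struct count_op c_op;

	bch_btree_op_init(&c_op.op, -1);	/* -1: read locks only, as in bch_refill_keybuf() */
	c_op.nr_keys = 0;

	bch_btree_map_keys(&c_op.op, c, &ZERO_KEY, count_keys_fn, 0);
	return c_op.nr_keys;
}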
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 9aba2017f0d1..dfff2410322e 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,17 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14void closure_queue(struct closure *cl)
15{
16 struct workqueue_struct *wq = cl->wq;
17 if (wq) {
18 INIT_WORK(&cl->work, cl->work.func);
19 BUG_ON(!queue_work(wq, &cl->work));
20 } else
21 cl->fn(cl);
22}
23EXPORT_SYMBOL_GPL(closure_queue);
24
25#define CL_FIELD(type, field) \ 14#define CL_FIELD(type, field) \
26 case TYPE_ ## type: \ 15 case TYPE_ ## type: \
27 return &container_of(cl, struct type, cl)->field 16 return &container_of(cl, struct type, cl)->field
@@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl)
30{ 19{
31 switch (cl->type) { 20 switch (cl->type) {
32 CL_FIELD(closure_with_waitlist, wait); 21 CL_FIELD(closure_with_waitlist, wait);
33 CL_FIELD(closure_with_waitlist_and_timer, wait);
34 default:
35 return NULL;
36 }
37}
38
39static struct timer_list *closure_timer(struct closure *cl)
40{
41 switch (cl->type) {
42 CL_FIELD(closure_with_timer, timer);
43 CL_FIELD(closure_with_waitlist_and_timer, timer);
44 default: 22 default:
45 return NULL; 23 return NULL;
46 } 24 }
@@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
51 int r = flags & CLOSURE_REMAINING_MASK; 29 int r = flags & CLOSURE_REMAINING_MASK;
52 30
53 BUG_ON(flags & CLOSURE_GUARD_MASK); 31 BUG_ON(flags & CLOSURE_GUARD_MASK);
54 BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); 32 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
55 33
56 /* Must deliver precisely one wakeup */ 34 /* Must deliver precisely one wakeup */
57 if (r == 1 && (flags & CLOSURE_SLEEPING)) 35 if (r == 1 && (flags & CLOSURE_SLEEPING))
@@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
59 37
60 if (!r) { 38 if (!r) {
61 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { 39 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
62 /* CLOSURE_BLOCKING might be set - clear it */
63 atomic_set(&cl->remaining, 40 atomic_set(&cl->remaining,
64 CLOSURE_REMAINING_INITIALIZER); 41 CLOSURE_REMAINING_INITIALIZER);
65 closure_queue(cl); 42 closure_queue(cl);
@@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v)
90{ 67{
91 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); 68 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
92} 69}
93EXPORT_SYMBOL_GPL(closure_sub); 70EXPORT_SYMBOL(closure_sub);
94 71
95void closure_put(struct closure *cl) 72void closure_put(struct closure *cl)
96{ 73{
97 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
98} 75}
99EXPORT_SYMBOL_GPL(closure_put); 76EXPORT_SYMBOL(closure_put);
100 77
101static void set_waiting(struct closure *cl, unsigned long f) 78static void set_waiting(struct closure *cl, unsigned long f)
102{ 79{
@@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
133 closure_sub(cl, CLOSURE_WAITING + 1); 110 closure_sub(cl, CLOSURE_WAITING + 1);
134 } 111 }
135} 112}
136EXPORT_SYMBOL_GPL(__closure_wake_up); 113EXPORT_SYMBOL(__closure_wake_up);
137 114
138bool closure_wait(struct closure_waitlist *list, struct closure *cl) 115bool closure_wait(struct closure_waitlist *list, struct closure *cl)
139{ 116{
@@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl)
146 123
147 return true; 124 return true;
148} 125}
149EXPORT_SYMBOL_GPL(closure_wait); 126EXPORT_SYMBOL(closure_wait);
150 127
151/** 128/**
 152 * closure_sync() - sleep until a closure has nothing left to wait on 129 * closure_sync() - sleep until a closure has nothing left to wait on
@@ -169,7 +146,7 @@ void closure_sync(struct closure *cl)
169 146
170 __closure_end_sleep(cl); 147 __closure_end_sleep(cl);
171} 148}
172EXPORT_SYMBOL_GPL(closure_sync); 149EXPORT_SYMBOL(closure_sync);
173 150
174/** 151/**
175 * closure_trylock() - try to acquire the closure, without waiting 152 * closure_trylock() - try to acquire the closure, without waiting
@@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent)
183 CLOSURE_REMAINING_INITIALIZER) != -1) 160 CLOSURE_REMAINING_INITIALIZER) != -1)
184 return false; 161 return false;
185 162
186 closure_set_ret_ip(cl);
187
188 smp_mb(); 163 smp_mb();
164
189 cl->parent = parent; 165 cl->parent = parent;
190 if (parent) 166 if (parent)
191 closure_get(parent); 167 closure_get(parent);
192 168
169 closure_set_ret_ip(cl);
193 closure_debug_create(cl); 170 closure_debug_create(cl);
194 return true; 171 return true;
195} 172}
196EXPORT_SYMBOL_GPL(closure_trylock); 173EXPORT_SYMBOL(closure_trylock);
197 174
198void __closure_lock(struct closure *cl, struct closure *parent, 175void __closure_lock(struct closure *cl, struct closure *parent,
199 struct closure_waitlist *wait_list) 176 struct closure_waitlist *wait_list)
@@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent,
205 if (closure_trylock(cl, parent)) 182 if (closure_trylock(cl, parent))
206 return; 183 return;
207 184
208 closure_wait_event_sync(wait_list, &wait, 185 closure_wait_event(wait_list, &wait,
209 atomic_read(&cl->remaining) == -1); 186 atomic_read(&cl->remaining) == -1);
210 } 187 }
211} 188}
212EXPORT_SYMBOL_GPL(__closure_lock); 189EXPORT_SYMBOL(__closure_lock);
213
214static void closure_delay_timer_fn(unsigned long data)
215{
216 struct closure *cl = (struct closure *) data;
217 closure_sub(cl, CLOSURE_TIMER + 1);
218}
219
220void do_closure_timer_init(struct closure *cl)
221{
222 struct timer_list *timer = closure_timer(cl);
223
224 init_timer(timer);
225 timer->data = (unsigned long) cl;
226 timer->function = closure_delay_timer_fn;
227}
228EXPORT_SYMBOL_GPL(do_closure_timer_init);
229
230bool __closure_delay(struct closure *cl, unsigned long delay,
231 struct timer_list *timer)
232{
233 if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
234 return false;
235
236 BUG_ON(timer_pending(timer));
237
238 timer->expires = jiffies + delay;
239
240 atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
241 add_timer(timer);
242 return true;
243}
244EXPORT_SYMBOL_GPL(__closure_delay);
245
246void __closure_flush(struct closure *cl, struct timer_list *timer)
247{
248 if (del_timer(timer))
249 closure_sub(cl, CLOSURE_TIMER + 1);
250}
251EXPORT_SYMBOL_GPL(__closure_flush);
252
253void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
254{
255 if (del_timer_sync(timer))
256 closure_sub(cl, CLOSURE_TIMER + 1);
257}
258EXPORT_SYMBOL_GPL(__closure_flush_sync);
259 190
260#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
261 192
@@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl)
273 list_add(&cl->all, &closure_list); 204 list_add(&cl->all, &closure_list);
274 spin_unlock_irqrestore(&closure_list_lock, flags); 205 spin_unlock_irqrestore(&closure_list_lock, flags);
275} 206}
276EXPORT_SYMBOL_GPL(closure_debug_create); 207EXPORT_SYMBOL(closure_debug_create);
277 208
278void closure_debug_destroy(struct closure *cl) 209void closure_debug_destroy(struct closure *cl)
279{ 210{
@@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl)
286 list_del(&cl->all); 217 list_del(&cl->all);
287 spin_unlock_irqrestore(&closure_list_lock, flags); 218 spin_unlock_irqrestore(&closure_list_lock, flags);
288} 219}
289EXPORT_SYMBOL_GPL(closure_debug_destroy); 220EXPORT_SYMBOL(closure_debug_destroy);
290 221
291static struct dentry *debug; 222static struct dentry *debug;
292 223
@@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data)
304 cl, (void *) cl->ip, cl->fn, cl->parent, 235 cl, (void *) cl->ip, cl->fn, cl->parent,
305 r & CLOSURE_REMAINING_MASK); 236 r & CLOSURE_REMAINING_MASK);
306 237
307 seq_printf(f, "%s%s%s%s%s%s\n", 238 seq_printf(f, "%s%s%s%s\n",
308 test_bit(WORK_STRUCT_PENDING, 239 test_bit(WORK_STRUCT_PENDING,
309 work_data_bits(&cl->work)) ? "Q" : "", 240 work_data_bits(&cl->work)) ? "Q" : "",
310 r & CLOSURE_RUNNING ? "R" : "", 241 r & CLOSURE_RUNNING ? "R" : "",
311 r & CLOSURE_BLOCKING ? "B" : "",
312 r & CLOSURE_STACK ? "S" : "", 242 r & CLOSURE_STACK ? "S" : "",
313 r & CLOSURE_SLEEPING ? "Sl" : "", 243 r & CLOSURE_SLEEPING ? "Sl" : "");
314 r & CLOSURE_TIMER ? "T" : "");
315 244
316 if (r & CLOSURE_WAITING) 245 if (r & CLOSURE_WAITING)
317 seq_printf(f, " W %pF\n", 246 seq_printf(f, " W %pF\n",
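
The closure_put_after_sub() changes above hinge on one idea: a single atomic word carries both the reference count (low bits) and the state flags (high bits), so a put can inspect the count and the flags in one read. Below is a minimal user-space sketch of that packing — bit positions, names and the continuation hook are invented, and none of the kernel machinery is involved.

/*
 * mini_closure: refcount in the low bits, flags above REMAINING_BITS.
 * Purely illustrative; bit positions and names are made up.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define REMAINING_BITS  23
#define FLAG_DESTRUCTOR (1u << REMAINING_BITS)
#define REMAINING_MASK  (FLAG_DESTRUCTOR - 1)

struct mini_closure {
	atomic_uint remaining;                    /* count | flags */
	void (*fn)(struct mini_closure *);        /* continuation */
};

static void mini_put_after_sub(struct mini_closure *cl, unsigned v)
{
	unsigned r = v & REMAINING_MASK;

	assert(r != REMAINING_MASK);              /* put without a matching get */

	/* Count hit zero and we are not being torn down: run the next step. */
	if (!r && cl->fn && !(v & FLAG_DESTRUCTOR))
		cl->fn(cl);
}

static void mini_put(struct mini_closure *cl)
{
	mini_put_after_sub(cl, atomic_fetch_sub(&cl->remaining, 1) - 1);
}

static void done(struct mini_closure *cl)
{
	(void)cl;
	printf("last reference dropped, continuation runs\n");
}

int main(void)
{
	struct mini_closure cl = { .fn = done };

	atomic_store(&cl.remaining, 2);           /* two outstanding references */
	mini_put(&cl);                            /* 2 -> 1: nothing happens */
	mini_put(&cl);                            /* 1 -> 0: done() is called */
	return 0;
}
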
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 00039924ea9d..9762f1be3304 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -155,21 +155,6 @@
155 * delayed_work embeds a work item and a timer_list. The important thing is, use 155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically 156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you. 157 * handle everything for you.
158 *
159 * We've got closures that embed timers, too. They're called, appropriately
160 * enough:
161 * struct closure_with_timer;
162 *
163 * This gives you access to closure_delay(). It takes a refcount for a specified
164 * number of jiffies - you could then call closure_sync() (for a slightly
165 * convoluted version of msleep()) or continue_at() - which gives you the same
166 * effect as using a delayed work item, except you can reuse the work_struct
167 * already embedded in struct closure.
168 *
169 * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
170 * probably expect, if you happen to need the features of both. (You don't
171 * really want to know how all this is implemented, but if I've done my job
172 * right you shouldn't have to care).
173 */ 158 */
174 159
175struct closure; 160struct closure;
@@ -182,16 +167,11 @@ struct closure_waitlist {
182enum closure_type { 167enum closure_type {
183 TYPE_closure = 0, 168 TYPE_closure = 0,
184 TYPE_closure_with_waitlist = 1, 169 TYPE_closure_with_waitlist = 1,
185 TYPE_closure_with_timer = 2, 170 MAX_CLOSURE_TYPE = 1,
186 TYPE_closure_with_waitlist_and_timer = 3,
187 MAX_CLOSURE_TYPE = 3,
188}; 171};
189 172
190enum closure_state { 173enum closure_state {
191 /* 174 /*
192 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
193 * waiting asynchronously
194 *
195 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
196 * the thread that owns the closure, and cleared by the thread that's 176 * the thread that owns the closure, and cleared by the thread that's
197 * waking up the closure. 177 * waking up the closure.
@@ -200,10 +180,6 @@ enum closure_state {
200 * - indicates that cl->task is valid and closure_put() may wake it up. 180 * - indicates that cl->task is valid and closure_put() may wake it up.
201 * Only set or cleared by the thread that owns the closure. 181 * Only set or cleared by the thread that owns the closure.
202 * 182 *
203 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
204 * has an outstanding timer. Must be set by the thread that owns the
205 * closure, and cleared by the timer function when the timer goes off.
206 *
207 * The rest are for debugging and don't affect behaviour: 183 * The rest are for debugging and don't affect behaviour:
208 * 184 *
209 * CLOSURE_RUNNING: Set when a closure is running (i.e. by 185 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -218,19 +194,17 @@ enum closure_state {
218 * closure with this flag set 194 * closure with this flag set
219 */ 195 */
220 196
221 CLOSURE_BITS_START = (1 << 19), 197 CLOSURE_BITS_START = (1 << 23),
222 CLOSURE_DESTRUCTOR = (1 << 19), 198 CLOSURE_DESTRUCTOR = (1 << 23),
223 CLOSURE_BLOCKING = (1 << 21), 199 CLOSURE_WAITING = (1 << 25),
224 CLOSURE_WAITING = (1 << 23), 200 CLOSURE_SLEEPING = (1 << 27),
225 CLOSURE_SLEEPING = (1 << 25),
226 CLOSURE_TIMER = (1 << 27),
227 CLOSURE_RUNNING = (1 << 29), 201 CLOSURE_RUNNING = (1 << 29),
228 CLOSURE_STACK = (1 << 31), 202 CLOSURE_STACK = (1 << 31),
229}; 203};
230 204
231#define CLOSURE_GUARD_MASK \ 205#define CLOSURE_GUARD_MASK \
232 ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ 206 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
233 CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) 207 CLOSURE_RUNNING|CLOSURE_STACK) << 1)
234 208
235#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) 209#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
236#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) 210#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
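
CLOSURE_GUARD_MASK above is every flag shifted left by one: the flags are spaced two bits apart, so adding a flag that is already set carries into the otherwise unused bit above it, and the BUG_ON in closure_put_after_sub() trips. A standalone sketch of that trick, using two invented flags:

#include <stdio.h>

#define FLAG_WAITING   (1u << 25)
#define FLAG_SLEEPING  (1u << 27)
#define GUARD_MASK     ((FLAG_WAITING | FLAG_SLEEPING) << 1)

int main(void)
{
	unsigned v = 0;

	v += FLAG_WAITING;              /* legal: flag was clear */
	printf("guard tripped? %s\n", (v & GUARD_MASK) ? "yes" : "no");

	v += FLAG_WAITING;              /* bug: already set, carries upward */
	printf("guard tripped? %s\n", (v & GUARD_MASK) ? "yes" : "no");
	return 0;
}
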
@@ -268,17 +242,6 @@ struct closure_with_waitlist {
268 struct closure_waitlist wait; 242 struct closure_waitlist wait;
269}; 243};
270 244
271struct closure_with_timer {
272 struct closure cl;
273 struct timer_list timer;
274};
275
276struct closure_with_waitlist_and_timer {
277 struct closure cl;
278 struct closure_waitlist wait;
279 struct timer_list timer;
280};
281
282extern unsigned invalid_closure_type(void); 245extern unsigned invalid_closure_type(void);
283 246
284#define __CLOSURE_TYPE(cl, _t) \ 247#define __CLOSURE_TYPE(cl, _t) \
@@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void);
289( \ 252( \
290 __CLOSURE_TYPE(cl, closure) \ 253 __CLOSURE_TYPE(cl, closure) \
291 __CLOSURE_TYPE(cl, closure_with_waitlist) \ 254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
292 __CLOSURE_TYPE(cl, closure_with_timer) \
293 __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \
294 invalid_closure_type() \ 255 invalid_closure_type() \
295) 256)
296 257
297void closure_sub(struct closure *cl, int v); 258void closure_sub(struct closure *cl, int v);
298void closure_put(struct closure *cl); 259void closure_put(struct closure *cl);
299void closure_queue(struct closure *cl);
300void __closure_wake_up(struct closure_waitlist *list); 260void __closure_wake_up(struct closure_waitlist *list);
301bool closure_wait(struct closure_waitlist *list, struct closure *cl); 261bool closure_wait(struct closure_waitlist *list, struct closure *cl);
302void closure_sync(struct closure *cl); 262void closure_sync(struct closure *cl);
@@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent);
305void __closure_lock(struct closure *cl, struct closure *parent, 265void __closure_lock(struct closure *cl, struct closure *parent,
306 struct closure_waitlist *wait_list); 266 struct closure_waitlist *wait_list);
307 267
308void do_closure_timer_init(struct closure *cl);
309bool __closure_delay(struct closure *cl, unsigned long delay,
310 struct timer_list *timer);
311void __closure_flush(struct closure *cl, struct timer_list *timer);
312void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
313
314#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
315 269
316void closure_debug_init(void); 270void closure_debug_init(void);
@@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl)
354 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 308 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
355} 309}
356 310
357static inline bool closure_is_stopped(struct closure *cl)
358{
359 return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
360}
361
362static inline bool closure_is_unlocked(struct closure *cl) 311static inline bool closure_is_unlocked(struct closure *cl)
363{ 312{
364 return atomic_read(&cl->remaining) == -1; 313 return atomic_read(&cl->remaining) == -1;
@@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl)
367static inline void do_closure_init(struct closure *cl, struct closure *parent, 316static inline void do_closure_init(struct closure *cl, struct closure *parent,
368 bool running) 317 bool running)
369{ 318{
370 switch (cl->type) {
371 case TYPE_closure_with_timer:
372 case TYPE_closure_with_waitlist_and_timer:
373 do_closure_timer_init(cl);
374 default:
375 break;
376 }
377
378 cl->parent = parent; 319 cl->parent = parent;
379 if (parent) 320 if (parent)
380 closure_get(parent); 321 closure_get(parent);
@@ -429,8 +370,7 @@ do { \
429static inline void closure_init_stack(struct closure *cl) 370static inline void closure_init_stack(struct closure *cl)
430{ 371{
431 memset(cl, 0, sizeof(struct closure)); 372 memset(cl, 0, sizeof(struct closure));
432 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| 373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
433 CLOSURE_BLOCKING|CLOSURE_STACK);
434} 374}
435 375
436/** 376/**
@@ -461,24 +401,6 @@ do { \
461#define closure_lock(cl, parent) \ 401#define closure_lock(cl, parent) \
462 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) 402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
463 403
464/**
465 * closure_delay() - delay some number of jiffies
466 * @cl: the closure that will sleep
467 * @delay: the delay in jiffies
468 *
469 * Takes a refcount on @cl which will be released after @delay jiffies; this may
470 * be used to have a function run after a delay with continue_at(), or
471 * closure_sync() may be used for a convoluted version of msleep().
472 */
473#define closure_delay(cl, delay) \
474 __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
475
476#define closure_flush(cl) \
477 __closure_flush(__to_internal_closure(cl), &(cl)->timer)
478
479#define closure_flush_sync(cl) \
480 __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
481
482static inline void __closure_end_sleep(struct closure *cl) 404static inline void __closure_end_sleep(struct closure *cl)
483{ 405{
484 __set_current_state(TASK_RUNNING); 406 __set_current_state(TASK_RUNNING);
@@ -498,40 +420,6 @@ static inline void __closure_start_sleep(struct closure *cl)
498} 420}
499 421
500/** 422/**
501 * closure_blocking() - returns true if the closure is in blocking mode.
502 *
503 * If a closure is in blocking mode, closure_wait_event() will sleep until the
504 * condition is true instead of waiting asynchronously.
505 */
506static inline bool closure_blocking(struct closure *cl)
507{
508 return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
509}
510
511/**
512 * set_closure_blocking() - put a closure in blocking mode.
513 *
514 * If a closure is in blocking mode, closure_wait_event() will sleep until the
515 * condition is true instead of waiting asynchronously.
516 *
517 * Not thread safe - can only be called by the thread running the closure.
518 */
519static inline void set_closure_blocking(struct closure *cl)
520{
521 if (!closure_blocking(cl))
522 atomic_add(CLOSURE_BLOCKING, &cl->remaining);
523}
524
525/*
526 * Not thread safe - can only be called by the thread running the closure.
527 */
528static inline void clear_closure_blocking(struct closure *cl)
529{
530 if (closure_blocking(cl))
531 atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
532}
533
534/**
535 * closure_wake_up() - wake up all closures on a wait list. 423 * closure_wake_up() - wake up all closures on a wait list.
536 */ 424 */
537static inline void closure_wake_up(struct closure_waitlist *list) 425static inline void closure_wake_up(struct closure_waitlist *list)
@@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list)
561 * refcount on our closure. If this was a stack allocated closure, that would be 449 * refcount on our closure. If this was a stack allocated closure, that would be
562 * bad. 450 * bad.
563 */ 451 */
564#define __closure_wait_event(list, cl, condition, _block) \ 452#define closure_wait_event(list, cl, condition) \
565({ \ 453({ \
566 bool block = _block; \
567 typeof(condition) ret; \ 454 typeof(condition) ret; \
568 \ 455 \
569 while (1) { \ 456 while (1) { \
570 ret = (condition); \ 457 ret = (condition); \
571 if (ret) { \ 458 if (ret) { \
572 __closure_wake_up(list); \ 459 __closure_wake_up(list); \
573 if (block) \ 460 closure_sync(cl); \
574 closure_sync(cl); \
575 \
576 break; \ 461 break; \
577 } \ 462 } \
578 \ 463 \
579 if (block) \ 464 __closure_start_sleep(cl); \
580 __closure_start_sleep(cl); \
581 \
582 if (!closure_wait(list, cl)) { \
583 if (!block) \
584 break; \
585 \ 465 \
466 if (!closure_wait(list, cl)) \
586 schedule(); \ 467 schedule(); \
587 } \
588 } \ 468 } \
589 \ 469 \
590 ret; \ 470 ret; \
591}) 471})
592 472
593/** 473static inline void closure_queue(struct closure *cl)
594 * closure_wait_event() - wait on a condition, synchronously or asynchronously. 474{
595 * @list: the wait list to wait on 475 struct workqueue_struct *wq = cl->wq;
596 * @cl: the closure that is doing the waiting 476 if (wq) {
597 * @condition: a C expression for the event to wait for 477 INIT_WORK(&cl->work, cl->work.func);
598 * 478 BUG_ON(!queue_work(wq, &cl->work));
599 * If the closure is in blocking mode, sleeps until the @condition evaluates to 479 } else
600 * true - exactly like wait_event(). 480 cl->fn(cl);
601 * 481}
602 * If the closure is not in blocking mode, waits asynchronously; if the
603 * condition is currently false the @cl is put onto @list and returns. @list
604 * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
605 * wait for another thread to wake up @list, which drops the refcount on @cl.
606 *
607 * Returns the value of @condition; @cl will be on @list iff @condition was
608 * false.
609 *
610 * closure_wake_up(@list) must be called after changing any variable that could
611 * cause @condition to become true.
612 */
613#define closure_wait_event(list, cl, condition) \
614 __closure_wait_event(list, cl, condition, closure_blocking(cl))
615
616#define closure_wait_event_async(list, cl, condition) \
617 __closure_wait_event(list, cl, condition, false)
618
619#define closure_wait_event_sync(list, cl, condition) \
620 __closure_wait_event(list, cl, condition, true)
621 482
622static inline void set_closure_fn(struct closure *cl, closure_fn *fn, 483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
623 struct workqueue_struct *wq) 484 struct workqueue_struct *wq)
@@ -642,7 +503,7 @@ do { \
642#define continue_at_nobarrier(_cl, _fn, _wq) \ 503#define continue_at_nobarrier(_cl, _fn, _wq) \
643do { \ 504do { \
644 set_closure_fn(_cl, _fn, _wq); \ 505 set_closure_fn(_cl, _fn, _wq); \
645 closure_queue(cl); \ 506 closure_queue(_cl); \
646 return; \ 507 return; \
647} while (0) 508} while (0)
648 509
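
With the blocking and timer variants gone, the header reduces continue_at() to: record the next function and an optional workqueue, then closure_queue() either hands the work item to that queue or calls the function directly. Below is a rough stand-alone sketch of that dispatch; the queue type is a stub and all names are invented, so this is only the shape of the mechanism, not the kernel API.

/*
 * tiny_closure: a continuation plus an optional queue. A real workqueue
 * would run the function on a worker thread; the stub just logs and calls it.
 */
#include <stddef.h>
#include <stdio.h>

struct tiny_closure;
typedef void (closure_fn)(struct tiny_closure *);

struct work_queue { const char *name; };      /* stand-in for a workqueue */

struct tiny_closure {
	closure_fn *fn;
	struct work_queue *wq;
};

static void queue_work_stub(struct work_queue *wq, struct tiny_closure *cl)
{
	printf("queued on %s\n", wq->name);       /* pretend deferral */
	cl->fn(cl);
}

static void tiny_closure_queue(struct tiny_closure *cl)
{
	if (cl->wq)
		queue_work_stub(cl->wq, cl);      /* deferred execution */
	else
		cl->fn(cl);                       /* run in the caller's context */
}

/* continue_at_nobarrier() analogue: set the next step, then queue it. */
static void tiny_continue_at(struct tiny_closure *cl, closure_fn *fn,
			     struct work_queue *wq)
{
	cl->fn = fn;
	cl->wq = wq;
	tiny_closure_queue(cl);
}

static void step2(struct tiny_closure *cl)
{
	(void)cl;
	printf("step 2\n");
}

static void step1(struct tiny_closure *cl)
{
	printf("step 1\n");
	tiny_continue_at(cl, step2, cl->wq);
}

int main(void)
{
	struct work_queue wq = { "demo_wq" };
	struct tiny_closure cl = { .fn = step1, .wq = NULL };

	tiny_closure_queue(&cl);                  /* runs inline: no queue set */

	cl.fn = step1;
	cl.wq = &wq;
	tiny_closure_queue(&cl);                  /* "queued", then run */
	return 0;
}
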
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 88e6411eab4f..264fcfbd6290 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,7 +8,6 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "request.h"
12 11
13#include <linux/console.h> 12#include <linux/console.h>
14#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -77,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
77 return out - buf; 76 return out - buf;
78} 77}
79 78
80int bch_btree_to_text(char *buf, size_t size, const struct btree *b) 79#ifdef CONFIG_BCACHE_DEBUG
81{
82 return scnprintf(buf, size, "%zu level %i/%i",
83 PTR_BUCKET_NR(b->c, &b->key, 0),
84 b->level, b->c->root ? b->c->root->level : -1);
85}
86
87#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
88
89static bool skipped_backwards(struct btree *b, struct bkey *k)
90{
91 return bkey_cmp(k, (!b->level)
92 ? &START_KEY(bkey_next(k))
93 : bkey_next(k)) > 0;
94}
95 80
96static void dump_bset(struct btree *b, struct bset *i) 81static void dump_bset(struct btree *b, struct bset *i)
97{ 82{
98 struct bkey *k; 83 struct bkey *k, *next;
99 unsigned j; 84 unsigned j;
100 char buf[80]; 85 char buf[80];
101 86
102 for (k = i->start; k < end(i); k = bkey_next(k)) { 87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
103 bch_bkey_to_text(buf, sizeof(buf), k); 90 bch_bkey_to_text(buf, sizeof(buf), k);
104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
105 (uint64_t *) k - i->d, i->keys, buf); 92 (uint64_t *) k - i->d, i->keys, buf);
@@ -115,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i)
115 102
116 printk(" %s\n", bch_ptr_status(b->c, k)); 103 printk(" %s\n", bch_ptr_status(b->c, k));
117 104
118 if (bkey_next(k) < end(i) && 105 if (next < end(i) &&
119 skipped_backwards(b, k)) 106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
120 printk(KERN_ERR "Key skipped backwards\n"); 107 printk(KERN_ERR "Key skipped backwards\n");
121 } 108 }
122} 109}
123 110
124#endif 111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
125 114
126#ifdef CONFIG_BCACHE_DEBUG 115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
127 120
128void bch_btree_verify(struct btree *b, struct bset *new) 121void bch_btree_verify(struct btree *b, struct bset *new)
129{ 122{
@@ -176,66 +169,44 @@ void bch_btree_verify(struct btree *b, struct bset *new)
176 mutex_unlock(&b->c->verify_lock); 169 mutex_unlock(&b->c->verify_lock);
177} 170}
178 171
179static void data_verify_endio(struct bio *bio, int error) 172void bch_data_verify(struct cached_dev *dc, struct bio *bio)
180{
181 struct closure *cl = bio->bi_private;
182 closure_put(cl);
183}
184
185void bch_data_verify(struct search *s)
186{ 173{
187 char name[BDEVNAME_SIZE]; 174 char name[BDEVNAME_SIZE];
188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
189 struct closure *cl = &s->cl;
190 struct bio *check; 175 struct bio *check;
191 struct bio_vec *bv; 176 struct bio_vec *bv;
192 int i; 177 int i;
193 178
194 if (!s->unaligned_bvec) 179 check = bio_clone(bio, GFP_NOIO);
195 bio_for_each_segment(bv, s->orig_bio, i)
196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
197
198 check = bio_clone(s->orig_bio, GFP_NOIO);
199 if (!check) 180 if (!check)
200 return; 181 return;
201 182
202 if (bio_alloc_pages(check, GFP_NOIO)) 183 if (bio_alloc_pages(check, GFP_NOIO))
203 goto out_put; 184 goto out_put;
204 185
205 check->bi_rw = READ_SYNC; 186 submit_bio_wait(READ_SYNC, check);
206 check->bi_private = cl;
207 check->bi_end_io = data_verify_endio;
208
209 closure_bio_submit(check, cl, &dc->disk);
210 closure_sync(cl);
211 187
212 bio_for_each_segment(bv, s->orig_bio, i) { 188 bio_for_each_segment(bv, bio, i) {
213 void *p1 = kmap(bv->bv_page); 189 void *p1 = kmap_atomic(bv->bv_page);
214 void *p2 = kmap(check->bi_io_vec[i].bv_page); 190 void *p2 = page_address(check->bi_io_vec[i].bv_page);
215 191
216 if (memcmp(p1 + bv->bv_offset, 192 cache_set_err_on(memcmp(p1 + bv->bv_offset,
217 p2 + bv->bv_offset, 193 p2 + bv->bv_offset,
218 bv->bv_len)) 194 bv->bv_len),
219 printk(KERN_ERR 195 dc->disk.c,
220 "bcache (%s): verify failed at sector %llu\n", 196 "verify failed at dev %s sector %llu",
221 bdevname(dc->bdev, name), 197 bdevname(dc->bdev, name),
222 (uint64_t) s->orig_bio->bi_sector); 198 (uint64_t) bio->bi_sector);
223 199
224 kunmap(bv->bv_page); 200 kunmap_atomic(p1);
225 kunmap(check->bi_io_vec[i].bv_page);
226 } 201 }
227 202
228 __bio_for_each_segment(bv, check, i, 0) 203 bio_for_each_segment_all(bv, check, i)
229 __free_page(bv->bv_page); 204 __free_page(bv->bv_page);
230out_put: 205out_put:
231 bio_put(check); 206 bio_put(check);
232} 207}
233 208
234#endif 209int __bch_count_data(struct btree *b)
235
236#ifdef CONFIG_BCACHE_EDEBUG
237
238unsigned bch_count_data(struct btree *b)
239{ 210{
240 unsigned ret = 0; 211 unsigned ret = 0;
241 struct btree_iter iter; 212 struct btree_iter iter;
@@ -247,72 +218,60 @@ unsigned bch_count_data(struct btree *b)
247 return ret; 218 return ret;
248} 219}
249 220
250static void vdump_bucket_and_panic(struct btree *b, const char *fmt, 221void __bch_check_keys(struct btree *b, const char *fmt, ...)
251 va_list args)
252{
253 unsigned i;
254 char buf[80];
255
256 console_lock();
257
258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data);
260
261 vprintk(fmt, args);
262
263 console_unlock();
264
265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf);
267}
268
269void bch_check_key_order_msg(struct btree *b, struct bset *i,
270 const char *fmt, ...)
271{
272 struct bkey *k;
273
274 if (!i->keys)
275 return;
276
277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
278 if (skipped_backwards(b, k)) {
279 va_list args;
280 va_start(args, fmt);
281
282 vdump_bucket_and_panic(b, fmt, args);
283 va_end(args);
284 }
285}
286
287void bch_check_keys(struct btree *b, const char *fmt, ...)
288{ 222{
289 va_list args; 223 va_list args;
290 struct bkey *k, *p = NULL; 224 struct bkey *k, *p = NULL;
291 struct btree_iter iter; 225 struct btree_iter iter;
292 226 const char *err;
293 if (b->level)
294 return;
295 227
296 for_each_key(b, k, &iter) { 228 for_each_key(b, k, &iter) {
297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { 229 if (!b->level) {
298 printk(KERN_ERR "Keys out of order:\n"); 230 err = "Keys out of order";
299 goto bug; 231 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
300 } 232 goto bug;
301 233
302 if (bch_ptr_invalid(b, k)) 234 if (bch_ptr_invalid(b, k))
303 continue; 235 continue;
304 236
305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) { 237 err = "Overlapping keys";
306 printk(KERN_ERR "Overlapping keys:\n"); 238 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
307 goto bug; 239 goto bug;
240 } else {
241 if (bch_ptr_bad(b, k))
242 continue;
243
244 err = "Duplicate keys";
245 if (p && !bkey_cmp(p, k))
246 goto bug;
308 } 247 }
309 p = k; 248 p = k;
310 } 249 }
250
251 err = "Key larger than btree node key";
252 if (p && bkey_cmp(p, &b->key) > 0)
253 goto bug;
254
311 return; 255 return;
312bug: 256bug:
257 bch_dump_bucket(b);
258
313 va_start(args, fmt); 259 va_start(args, fmt);
314 vdump_bucket_and_panic(b, fmt, args); 260 vprintk(fmt, args);
315 va_end(args); 261 va_end(args);
262
263 panic("bcache error: %s:\n", err);
264}
265
266void bch_btree_iter_next_check(struct btree_iter *iter)
267{
268 struct bkey *k = iter->data->k, *next = bkey_next(k);
269
270 if (next < iter->data->end &&
271 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
272 bch_dump_bucket(iter->b);
273 panic("Key skipped backwards\n");
274 }
316} 275}
317 276
318#endif 277#endif
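
__bch_check_keys() above walks a node's keys in iterator order and panics on the first ordering violation it finds. The same shape of check, reduced to a user-space walk over (start, end) extents with invented types — not the bkey format — looks like this:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start;     /* START_KEY(k) analogue */
	uint64_t end;       /* keys sort by their end offset */
};

static const char *check_extents(const struct extent *k, size_t n)
{
	for (size_t i = 1; i < n; i++) {
		if (k[i].start < k[i - 1].start)
			return "Keys out of order";
		if (k[i].start < k[i - 1].end)
			return "Overlapping keys";
	}
	return NULL;
}

int main(void)
{
	const struct extent good[] = { {0, 8}, {8, 16}, {32, 40} };
	const struct extent bad[]  = { {0, 8}, {4, 16} };   /* overlap */
	const char *err;

	err = check_extents(good, 3);
	printf("good set: %s\n", err ? err : "ok");

	err = check_extents(bad, 2);
	printf("bad set:  %s\n", err ? err : "ok");
	return 0;
}
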
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 1c39b5a2489b..2ede60e31874 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -4,40 +4,44 @@
4/* Btree/bkey debug printing */ 4/* Btree/bkey debug printing */
5 5
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
7int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
8
9#ifdef CONFIG_BCACHE_EDEBUG
10
11unsigned bch_count_data(struct btree *);
12void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
13void bch_check_keys(struct btree *, const char *, ...);
14
15#define bch_check_key_order(b, i) \
16 bch_check_key_order_msg(b, i, "keys out of order")
17#define EBUG_ON(cond) BUG_ON(cond)
18
19#else /* EDEBUG */
20
21#define bch_count_data(b) 0
22#define bch_check_key_order(b, i) do {} while (0)
23#define bch_check_key_order_msg(b, i, ...) do {} while (0)
24#define bch_check_keys(b, ...) do {} while (0)
25#define EBUG_ON(cond) do {} while (0)
26
27#endif
28 7
29#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
30 9
31void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *, struct bset *);
32void bch_data_verify(struct search *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test)
33 20
34#else /* DEBUG */ 21#else /* DEBUG */
35 22
36static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 23static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
37static inline void bch_data_verify(struct search *s) {}; 24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0
38 33
39#endif 34#endif
40 35
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
41#ifdef CONFIG_DEBUG_FS 45#ifdef CONFIG_DEBUG_FS
42void bch_debug_init_cache_set(struct cache_set *); 46void bch_debug_init_cache_set(struct cache_set *);
43#else 47#else
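
The new debug.h keeps one pattern throughout: under CONFIG_BCACHE_DEBUG a wrapper macro tests a runtime flag before calling the expensive helper, and without it the macro collapses to a constant. A small sketch of that pattern with invented names (the flag and helper here are stand-ins, not the bcache ones):

#include <stdbool.h>
#include <stdio.h>

#define SKETCH_DEBUG 1                 /* analogue of the Kconfig option */

struct cache_like {
	bool expensive_debug_checks;   /* runtime toggle, e.g. via sysfs */
};

#if SKETCH_DEBUG
static int __count_data(const struct cache_like *c)
{
	(void)c;
	return 42;                     /* pretend we walked the whole tree */
}
#define count_data(c) \
	((c)->expensive_debug_checks ? __count_data(c) : -1)
#else
#define count_data(c) (-1)             /* compiled out entirely */
#endif

int main(void)
{
	struct cache_like c = { .expensive_debug_checks = true };

	printf("count_data = %d\n", count_data(&c));
	c.expensive_debug_checks = false;
	printf("count_data = %d\n", count_data(&c));
	return 0;
}
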
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8435f81e5d85..ecdaa671bd50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,7 +7,6 @@
7#include "bcache.h" 7#include "bcache.h"
8#include "btree.h" 8#include "btree.h"
9#include "debug.h" 9#include "debug.h"
10#include "request.h"
11 10
12#include <trace/events/bcache.h> 11#include <trace/events/bcache.h>
13 12
@@ -31,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error)
31} 30}
32 31
33static int journal_read_bucket(struct cache *ca, struct list_head *list, 32static int journal_read_bucket(struct cache *ca, struct list_head *list,
34 struct btree_op *op, unsigned bucket_index) 33 unsigned bucket_index)
35{ 34{
36 struct journal_device *ja = &ca->journal; 35 struct journal_device *ja = &ca->journal;
37 struct bio *bio = &ja->bio; 36 struct bio *bio = &ja->bio;
38 37
39 struct journal_replay *i; 38 struct journal_replay *i;
40 struct jset *j, *data = ca->set->journal.w[0].data; 39 struct jset *j, *data = ca->set->journal.w[0].data;
40 struct closure cl;
41 unsigned len, left, offset = 0; 41 unsigned len, left, offset = 0;
42 int ret = 0; 42 int ret = 0;
43 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); 43 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
44 44
45 closure_init_stack(&cl);
46
45 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %llu", (uint64_t) bucket);
46 48
47 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
@@ -55,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset;
55 bio->bi_size = len << 9; 57 bio->bi_size = len << 9;
56 58
57 bio->bi_end_io = journal_read_endio; 59 bio->bi_end_io = journal_read_endio;
58 bio->bi_private = &op->cl; 60 bio->bi_private = &cl;
59 bch_bio_map(bio, data); 61 bch_bio_map(bio, data);
60 62
61 closure_bio_submit(bio, &op->cl, ca); 63 closure_bio_submit(bio, &cl, ca);
62 closure_sync(&op->cl); 64 closure_sync(&cl);
63 65
64 /* This function could be simpler now since we no longer write 66 /* This function could be simpler now since we no longer write
65 * journal entries that overlap bucket boundaries; this means 67 * journal entries that overlap bucket boundaries; this means
@@ -72,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset;
72 struct list_head *where; 74 struct list_head *where;
73 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
74 76
75 if (j->magic != jset_magic(ca->set)) 77 if (j->magic != jset_magic(&ca->sb))
76 return ret; 78 return ret;
77 79
78 if (bytes > left << 9) 80 if (bytes > left << 9)
@@ -129,12 +131,11 @@ next_set:
129 return ret; 131 return ret;
130} 132}
131 133
132int bch_journal_read(struct cache_set *c, struct list_head *list, 134int bch_journal_read(struct cache_set *c, struct list_head *list)
133 struct btree_op *op)
134{ 135{
135#define read_bucket(b) \ 136#define read_bucket(b) \
136 ({ \ 137 ({ \
137 int ret = journal_read_bucket(ca, list, op, b); \ 138 int ret = journal_read_bucket(ca, list, b); \
138 __set_bit(b, bitmap); \ 139 __set_bit(b, bitmap); \
139 if (ret < 0) \ 140 if (ret < 0) \
140 return ret; \ 141 return ret; \
@@ -292,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
292 } 293 }
293} 294}
294 295
295int bch_journal_replay(struct cache_set *s, struct list_head *list, 296int bch_journal_replay(struct cache_set *s, struct list_head *list)
296 struct btree_op *op)
297{ 297{
298 int ret = 0, keys = 0, entries = 0; 298 int ret = 0, keys = 0, entries = 0;
299 struct bkey *k; 299 struct bkey *k;
@@ -301,31 +301,30 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
301 list_entry(list->prev, struct journal_replay, list); 301 list_entry(list->prev, struct journal_replay, list);
302 302
303 uint64_t start = i->j.last_seq, end = i->j.seq, n = start; 303 uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
304 struct keylist keylist;
305
306 bch_keylist_init(&keylist);
304 307
305 list_for_each_entry(i, list, list) { 308 list_for_each_entry(i, list, list) {
306 BUG_ON(i->pin && atomic_read(i->pin) != 1); 309 BUG_ON(i->pin && atomic_read(i->pin) != 1);
307 310
308 if (n != i->j.seq) 311 cache_set_err_on(n != i->j.seq, s,
309 pr_err( 312"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
310 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", 313 n, i->j.seq - 1, start, end);
311 n, i->j.seq - 1, start, end);
312 314
313 for (k = i->j.start; 315 for (k = i->j.start;
314 k < end(&i->j); 316 k < end(&i->j);
315 k = bkey_next(k)) { 317 k = bkey_next(k)) {
316 trace_bcache_journal_replay_key(k); 318 trace_bcache_journal_replay_key(k);
317 319
318 bkey_copy(op->keys.top, k); 320 bkey_copy(keylist.top, k);
319 bch_keylist_push(&op->keys); 321 bch_keylist_push(&keylist);
320
321 op->journal = i->pin;
322 atomic_inc(op->journal);
323 322
324 ret = bch_btree_insert(op, s); 323 ret = bch_btree_insert(s, &keylist, i->pin, NULL);
325 if (ret) 324 if (ret)
326 goto err; 325 goto err;
327 326
328 BUG_ON(!bch_keylist_empty(&op->keys)); 327 BUG_ON(!bch_keylist_empty(&keylist));
329 keys++; 328 keys++;
330 329
331 cond_resched(); 330 cond_resched();
@@ -339,14 +338,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
339 338
340 pr_info("journal replay done, %i keys in %i entries, seq %llu", 339 pr_info("journal replay done, %i keys in %i entries, seq %llu",
341 keys, entries, end); 340 keys, entries, end);
342 341err:
343 while (!list_empty(list)) { 342 while (!list_empty(list)) {
344 i = list_first_entry(list, struct journal_replay, list); 343 i = list_first_entry(list, struct journal_replay, list);
345 list_del(&i->list); 344 list_del(&i->list);
346 kfree(i); 345 kfree(i);
347 } 346 }
348err: 347
349 closure_sync(&op->cl);
350 return ret; 348 return ret;
351} 349}
352 350
@@ -358,48 +356,35 @@ static void btree_flush_write(struct cache_set *c)
358 * Try to find the btree node that references the oldest journal 356
359 * entry, best is our current candidate and is locked if non NULL: 357 * entry, best is our current candidate and is locked if non NULL:
360 */ 358 */
361 struct btree *b, *best = NULL; 359 struct btree *b, *best;
362 unsigned iter; 360 unsigned i;
361retry:
362 best = NULL;
363
364 for_each_cached_btree(b, c, i)
365 if (btree_current_write(b)->journal) {
366 if (!best)
367 best = b;
368 else if (journal_pin_cmp(c,
369 btree_current_write(best)->journal,
370 btree_current_write(b)->journal)) {
371 best = b;
372 }
373 }
363 374
364 for_each_cached_btree(b, c, iter) { 375 b = best;
365 if (!down_write_trylock(&b->lock)) 376 if (b) {
366 continue; 377 rw_lock(true, b, b->level);
367 378
368 if (!btree_node_dirty(b) || 379 if (!btree_current_write(b)->journal) {
369 !btree_current_write(b)->journal) {
370 rw_unlock(true, b); 380 rw_unlock(true, b);
371 continue; 381 /* We raced */
382 goto retry;
372 } 383 }
373 384
374 if (!best) 385 bch_btree_node_write(b, NULL);
375 best = b; 386 rw_unlock(true, b);
376 else if (journal_pin_cmp(c,
377 btree_current_write(best),
378 btree_current_write(b))) {
379 rw_unlock(true, best);
380 best = b;
381 } else
382 rw_unlock(true, b);
383 } 387 }
384
385 if (best)
386 goto out;
387
388 /* We can't find the best btree node, just pick the first */
389 list_for_each_entry(b, &c->btree_cache, list)
390 if (!b->level && btree_node_dirty(b)) {
391 best = b;
392 rw_lock(true, best, best->level);
393 goto found;
394 }
395
396out:
397 if (!best)
398 return;
399found:
400 if (btree_node_dirty(best))
401 bch_btree_node_write(best, NULL);
402 rw_unlock(true, best);
403} 388}
404 389
405#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) 390#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -495,7 +480,7 @@ static void journal_reclaim(struct cache_set *c)
495 do_journal_discard(ca); 480 do_journal_discard(ca);
496 481
497 if (c->journal.blocks_free) 482 if (c->journal.blocks_free)
498 return; 483 goto out;
499 484
500 /* 485 /*
501 * Allocate: 486 * Allocate:
@@ -521,7 +506,7 @@ static void journal_reclaim(struct cache_set *c)
521 506
522 if (n) 507 if (n)
523 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; 508 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
524 509out:
525 if (!journal_full(&c->journal)) 510 if (!journal_full(&c->journal))
526 __closure_wake_up(&c->journal.wait); 511 __closure_wake_up(&c->journal.wait);
527} 512}
@@ -554,32 +539,26 @@ static void journal_write_endio(struct bio *bio, int error)
554 struct journal_write *w = bio->bi_private; 539 struct journal_write *w = bio->bi_private;
555 540
556 cache_set_err_on(error, w->c, "journal io error"); 541 cache_set_err_on(error, w->c, "journal io error");
557 closure_put(&w->c->journal.io.cl); 542 closure_put(&w->c->journal.io);
558} 543}
559 544
560static void journal_write(struct closure *); 545static void journal_write(struct closure *);
561 546
562static void journal_write_done(struct closure *cl) 547static void journal_write_done(struct closure *cl)
563{ 548{
564 struct journal *j = container_of(cl, struct journal, io.cl); 549 struct journal *j = container_of(cl, struct journal, io);
565 struct cache_set *c = container_of(j, struct cache_set, journal);
566
567 struct journal_write *w = (j->cur == j->w) 550 struct journal_write *w = (j->cur == j->w)
568 ? &j->w[1] 551 ? &j->w[1]
569 : &j->w[0]; 552 : &j->w[0];
570 553
571 __closure_wake_up(&w->wait); 554 __closure_wake_up(&w->wait);
572 555 continue_at_nobarrier(cl, journal_write, system_wq);
573 if (c->journal_delay_ms)
574 closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
575
576 continue_at(cl, journal_write, system_wq);
577} 556}
578 557
579static void journal_write_unlocked(struct closure *cl) 558static void journal_write_unlocked(struct closure *cl)
580 __releases(c->journal.lock) 559 __releases(c->journal.lock)
581{ 560{
582 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 561 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
583 struct cache *ca; 562 struct cache *ca;
584 struct journal_write *w = c->journal.cur; 563 struct journal_write *w = c->journal.cur;
585 struct bkey *k = &c->journal.key; 564 struct bkey *k = &c->journal.key;
@@ -617,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
617 for_each_cache(ca, c, i) 596 for_each_cache(ca, c, i)
618 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; 597 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
619 598
620 w->data->magic = jset_magic(c); 599 w->data->magic = jset_magic(&c->sb);
621 w->data->version = BCACHE_JSET_VERSION; 600 w->data->version = BCACHE_JSET_VERSION;
622 w->data->last_seq = last_seq(&c->journal); 601 w->data->last_seq = last_seq(&c->journal);
623 w->data->csum = csum_set(w->data); 602 w->data->csum = csum_set(w->data);
@@ -660,121 +639,134 @@ static void journal_write_unlocked(struct closure *cl)
660 639
661static void journal_write(struct closure *cl) 640static void journal_write(struct closure *cl)
662{ 641{
663 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 642 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
664 643
665 spin_lock(&c->journal.lock); 644 spin_lock(&c->journal.lock);
666 journal_write_unlocked(cl); 645 journal_write_unlocked(cl);
667} 646}
668 647
669static void __journal_try_write(struct cache_set *c, bool noflush) 648static void journal_try_write(struct cache_set *c)
670 __releases(c->journal.lock) 649 __releases(c->journal.lock)
671{ 650{
672 struct closure *cl = &c->journal.io.cl; 651 struct closure *cl = &c->journal.io;
652 struct journal_write *w = c->journal.cur;
673 653
674 if (!closure_trylock(cl, &c->cl)) 654 w->need_write = true;
675 spin_unlock(&c->journal.lock); 655
676 else if (noflush && journal_full(&c->journal)) { 656 if (closure_trylock(cl, &c->cl))
677 spin_unlock(&c->journal.lock);
678 continue_at(cl, journal_write, system_wq);
679 } else
680 journal_write_unlocked(cl); 657 journal_write_unlocked(cl);
658 else
659 spin_unlock(&c->journal.lock);
681} 660}
682 661
683#define journal_try_write(c) __journal_try_write(c, false) 662static struct journal_write *journal_wait_for_write(struct cache_set *c,
684 663 unsigned nkeys)
685void bch_journal_meta(struct cache_set *c, struct closure *cl)
686{ 664{
687 struct journal_write *w; 665 size_t sectors;
666 struct closure cl;
688 667
689 if (CACHE_SYNC(&c->sb)) { 668 closure_init_stack(&cl);
690 spin_lock(&c->journal.lock); 669
670 spin_lock(&c->journal.lock);
691 671
692 w = c->journal.cur; 672 while (1) {
693 w->need_write = true; 673 struct journal_write *w = c->journal.cur;
694 674
695 if (cl) 675 sectors = __set_blocks(w->data, w->data->keys + nkeys,
696 BUG_ON(!closure_wait(&w->wait, cl)); 676 c) * c->sb.block_size;
697 677
698 closure_flush(&c->journal.io); 678 if (sectors <= min_t(size_t,
699 __journal_try_write(c, true); 679 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS))
681 return w;
682
683 /* XXX: tracepoint */
684 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c);
686
687 /*
688 * XXX: If we were inserting so many keys that they
689 * won't fit in an _empty_ journal write, we'll
690 * deadlock. For now, handle this in
691 * bch_keylist_realloc() - but something to think about.
692 */
693 BUG_ON(!w->data->keys);
694
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */
697 } else {
698 trace_bcache_journal_full(c);
699
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c);
702 spin_unlock(&c->journal.lock);
703
704 btree_flush_write(c);
705 }
706
707 closure_sync(&cl);
708 spin_lock(&c->journal.lock);
700 } 709 }
701} 710}
702 711
712static void journal_write_work(struct work_struct *work)
713{
714 struct cache_set *c = container_of(to_delayed_work(work),
715 struct cache_set,
716 journal.work);
717 spin_lock(&c->journal.lock);
718 journal_try_write(c);
719}
720
703/* 721/*
704 * Entry point to the journalling code - bio_insert() and btree_invalidate() 722 * Entry point to the journalling code - bio_insert() and btree_invalidate()
705 * pass bch_journal() a list of keys to be journalled, and then 723 * pass bch_journal() a list of keys to be journalled, and then
706 * bch_journal() hands those same keys off to btree_insert_async() 724 * bch_journal() hands those same keys off to btree_insert_async()
707 */ 725 */
708 726
709void bch_journal(struct closure *cl) 727atomic_t *bch_journal(struct cache_set *c,
728 struct keylist *keys,
729 struct closure *parent)
710{ 730{
711 struct btree_op *op = container_of(cl, struct btree_op, cl);
712 struct cache_set *c = op->c;
713 struct journal_write *w; 731 struct journal_write *w;
714 size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; 732 atomic_t *ret;
715
716 if (op->type != BTREE_INSERT ||
717 !CACHE_SYNC(&c->sb))
718 goto out;
719 733
720 /* 734 if (!CACHE_SYNC(&c->sb))
721 * If we're looping because we errored, might already be waiting on 735 return NULL;
722 * another journal write:
723 */
724 while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
725 closure_sync(cl->parent);
726 736
727 spin_lock(&c->journal.lock); 737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
728 738
729 if (journal_full(&c->journal)) { 739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
730 trace_bcache_journal_full(c); 740 w->data->keys += bch_keylist_nkeys(keys);
731 741
732 closure_wait(&c->journal.wait, cl); 742 ret = &fifo_back(&c->journal.pin);
743 atomic_inc(ret);
733 744
734 journal_reclaim(c); 745 if (parent) {
746 closure_wait(&w->wait, parent);
747 journal_try_write(c);
748 } else if (!w->need_write) {
749 schedule_delayed_work(&c->journal.work,
750 msecs_to_jiffies(c->journal_delay_ms));
751 spin_unlock(&c->journal.lock);
752 } else {
735 spin_unlock(&c->journal.lock); 753 spin_unlock(&c->journal.lock);
736
737 btree_flush_write(c);
738 continue_at(cl, bch_journal, bcache_wq);
739 } 754 }
740 755
741 w = c->journal.cur;
742 w->need_write = true;
743 b = __set_blocks(w->data, w->data->keys + n, c);
744
745 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
746 b > c->journal.blocks_free) {
747 trace_bcache_journal_entry_full(c);
748
749 /*
750 * XXX: If we were inserting so many keys that they won't fit in
751 * an _empty_ journal write, we'll deadlock. For now, handle
752 * this in bch_keylist_realloc() - but something to think about.
753 */
754 BUG_ON(!w->data->keys);
755
756 BUG_ON(!closure_wait(&w->wait, cl));
757
758 closure_flush(&c->journal.io);
759 756
760 journal_try_write(c); 757 return ret;
761 continue_at(cl, bch_journal, bcache_wq); 758}
762 }
763
764 memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
765 w->data->keys += n;
766 759
767 op->journal = &fifo_back(&c->journal.pin); 760void bch_journal_meta(struct cache_set *c, struct closure *cl)
768 atomic_inc(op->journal); 761{
762 struct keylist keys;
763 atomic_t *ref;
769 764
770 if (op->flush_journal) { 765 bch_keylist_init(&keys);
771 closure_flush(&c->journal.io);
772 closure_wait(&w->wait, cl->parent);
773 }
774 766
775 journal_try_write(c); 767 ref = bch_journal(c, &keys, cl);
776out: 768 if (ref)
777 bch_btree_insert_async(cl); 769 atomic_dec_bug(ref);
778} 770}
779 771
780void bch_journal_free(struct cache_set *c) 772void bch_journal_free(struct cache_set *c)
@@ -790,6 +782,7 @@ int bch_journal_alloc(struct cache_set *c)
790 782
791 closure_init_unlocked(&j->io); 783 closure_init_unlocked(&j->io);
792 spin_lock_init(&j->lock); 784 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work);
793 786
794 c->journal_delay_ms = 100; 787 c->journal_delay_ms = 100;
795 788
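
The reworked bch_journal() batches keys into the open journal entry, flushes immediately when a parent closure asks for durability, and otherwise leaves the flush to the new delayed work item. Below is a single-threaded, user-space sketch of that decision; the capacity, names and pin counter are invented, and there is no real I/O or timer.

/*
 * A toy "journal": keys accumulate in an open entry; a sync caller forces
 * the write out, an async caller leaves it for a (notional) delayed flush.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ENTRY_CAPACITY 8                      /* keys per entry, made up */

struct journal_entry {
	uint64_t keys[ENTRY_CAPACITY];
	size_t nkeys;
	unsigned pin;                         /* refs held until btree flush */
};

static struct journal_entry cur;

static void flush_entry(void)
{
	printf("journal write: %zu keys (pin=%u)\n", cur.nkeys, cur.pin);
	cur.nkeys = 0;
	cur.pin = 0;
}

static void journal_add(const uint64_t *keys, size_t n, bool sync)
{
	if (cur.nkeys + n > ENTRY_CAPACITY)   /* no room: write what we have */
		flush_entry();

	memcpy(cur.keys + cur.nkeys, keys, n * sizeof(*keys));
	cur.nkeys += n;
	cur.pin++;          /* the real code hands this ref back to the caller */

	if (sync)
		flush_entry();                /* caller wants durability now */
	else
		printf("flush deferred (delayed work would fire later)\n");
}

int main(void)
{
	uint64_t a[] = { 1, 2, 3 };
	uint64_t b[] = { 4, 5, 6, 7, 8, 9 };

	journal_add(a, 3, false);             /* batched */
	journal_add(b, 6, true);              /* overflows, then syncs */
	return 0;
}
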
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 3d7851274b04..a6472fda94b2 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -75,43 +75,6 @@
75 * nodes that are pinning the oldest journal entries first. 75 * nodes that are pinning the oldest journal entries first.
76 */ 76 */
77 77
78#define BCACHE_JSET_VERSION_UUIDv1 1
79/* Always latest UUID format */
80#define BCACHE_JSET_VERSION_UUID 1
81#define BCACHE_JSET_VERSION 1
82
83/*
84 * On disk format for a journal entry:
85 * seq is monotonically increasing; every journal entry has its own unique
86 * sequence number.
87 *
88 * last_seq is the oldest journal entry that still has keys the btree hasn't
89 * flushed to disk yet.
90 *
91 * version is for on disk format changes.
92 */
93struct jset {
94 uint64_t csum;
95 uint64_t magic;
96 uint64_t seq;
97 uint32_t version;
98 uint32_t keys;
99
100 uint64_t last_seq;
101
102 BKEY_PADDED(uuid_bucket);
103 BKEY_PADDED(btree_root);
104 uint16_t btree_level;
105 uint16_t pad[3];
106
107 uint64_t prio_bucket[MAX_CACHES_PER_SET];
108
109 union {
110 struct bkey start[0];
111 uint64_t d[0];
112 };
113};
114
115/* 78/*
116 * Only used for holding the journal entries we read in btree_journal_read() 79 * Only used for holding the journal entries we read in btree_journal_read()
117 * during cache_registration 80 * during cache_registration
@@ -140,7 +103,8 @@ struct journal {
140 spinlock_t lock; 103 spinlock_t lock;
141 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
142 struct closure_waitlist wait; 105 struct closure_waitlist wait;
143 struct closure_with_timer io; 106 struct closure io;
107 struct delayed_work work;
144 108
145 /* Number of blocks free in the bucket(s) we're currently writing to */ 109 /* Number of blocks free in the bucket(s) we're currently writing to */
146 unsigned blocks_free; 110 unsigned blocks_free;
@@ -188,8 +152,7 @@ struct journal_device {
188}; 152};
189 153
190#define journal_pin_cmp(c, l, r) \ 154#define journal_pin_cmp(c, l, r) \
191 (fifo_idx(&(c)->journal.pin, (l)->journal) > \ 155 (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
192 fifo_idx(&(c)->journal.pin, (r)->journal))
193 156
194#define JOURNAL_PIN 20000 157#define JOURNAL_PIN 20000
195 158
@@ -199,15 +162,14 @@ struct journal_device {
199struct closure; 162struct closure;
200struct cache_set; 163struct cache_set;
201struct btree_op; 164struct btree_op;
165struct keylist;
202 166
203void bch_journal(struct closure *); 167atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
204void bch_journal_next(struct journal *); 168void bch_journal_next(struct journal *);
205void bch_journal_mark(struct cache_set *, struct list_head *); 169void bch_journal_mark(struct cache_set *, struct list_head *);
206void bch_journal_meta(struct cache_set *, struct closure *); 170void bch_journal_meta(struct cache_set *, struct closure *);
207int bch_journal_read(struct cache_set *, struct list_head *, 171int bch_journal_read(struct cache_set *, struct list_head *);
208 struct btree_op *); 172int bch_journal_replay(struct cache_set *, struct list_head *);
209int bch_journal_replay(struct cache_set *, struct list_head *,
210 struct btree_op *);
211 173
212void bch_journal_free(struct cache_set *); 174void bch_journal_free(struct cache_set *);
213int bch_journal_alloc(struct cache_set *); 175int bch_journal_alloc(struct cache_set *);
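
The journal pin FIFO that journal_pin_cmp() indexes into works roughly like this: every journal entry carries a refcount, btree nodes hold a reference on the entry whose keys they have not yet written out, and reclaim can only advance past a prefix of entries whose count has dropped to zero. An illustrative sketch with simplified, invented types:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PIN_FIFO_SIZE 8

struct pin_fifo {
	unsigned pins[PIN_FIFO_SIZE];
	uint64_t first_seq;                   /* seq of pins[0] */
	size_t used;
};

/* First sequence number that is still needed, i.e. the oldest pinned entry. */
static uint64_t oldest_pinned_seq(const struct pin_fifo *j)
{
	for (size_t i = 0; i < j->used; i++)
		if (j->pins[i])
			return j->first_seq + i;
	return j->first_seq + j->used;        /* nothing pinned at all */
}

int main(void)
{
	struct pin_fifo j = { .first_seq = 100, .used = 4 };

	j.pins[1] = 2;                        /* two nodes still pin seq 101 */
	j.pins[3] = 1;                        /* one node pins seq 103 */
	printf("oldest pinned seq: %llu\n",
	       (unsigned long long)oldest_pinned_seq(&j));

	j.pins[1] = 0;                        /* those nodes were written out */
	printf("oldest pinned seq: %llu\n",
	       (unsigned long long)oldest_pinned_seq(&j));
	return 0;
}
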
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1a3b4f4786c3..7c1275e66025 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -12,8 +12,9 @@
12#include <trace/events/bcache.h> 12#include <trace/events/bcache.h>
13 13
14struct moving_io { 14struct moving_io {
15 struct closure cl;
15 struct keybuf_key *w; 16 struct keybuf_key *w;
16 struct search s; 17 struct data_insert_op op;
17 struct bbio bio; 18 struct bbio bio;
18}; 19};
19 20
@@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
38 39
39static void moving_io_destructor(struct closure *cl) 40static void moving_io_destructor(struct closure *cl)
40{ 41{
41 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 42 struct moving_io *io = container_of(cl, struct moving_io, cl);
42 kfree(io); 43 kfree(io);
43} 44}
44 45
45static void write_moving_finish(struct closure *cl) 46static void write_moving_finish(struct closure *cl)
46{ 47{
47 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 48 struct moving_io *io = container_of(cl, struct moving_io, cl);
48 struct bio *bio = &io->bio.bio; 49 struct bio *bio = &io->bio.bio;
49 struct bio_vec *bv; 50 struct bio_vec *bv;
50 int i; 51 int i;
@@ -52,13 +53,12 @@ static void write_moving_finish(struct closure *cl)
52 bio_for_each_segment_all(bv, bio, i) 53 bio_for_each_segment_all(bv, bio, i)
53 __free_page(bv->bv_page); 54 __free_page(bv->bv_page);
54 55
55 if (io->s.op.insert_collision) 56 if (io->op.replace_collision)
56 trace_bcache_gc_copy_collision(&io->w->key); 57 trace_bcache_gc_copy_collision(&io->w->key);
57 58
58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 59 bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
59 60
60 atomic_dec_bug(&io->s.op.c->in_flight); 61 up(&io->op.c->moving_in_flight);
61 closure_wake_up(&io->s.op.c->moving_gc_wait);
62 62
63 closure_return_with_destructor(cl, moving_io_destructor); 63 closure_return_with_destructor(cl, moving_io_destructor);
64} 64}
@@ -66,12 +66,12 @@ static void write_moving_finish(struct closure *cl)
66static void read_moving_endio(struct bio *bio, int error) 66static void read_moving_endio(struct bio *bio, int error)
67{ 67{
68 struct moving_io *io = container_of(bio->bi_private, 68 struct moving_io *io = container_of(bio->bi_private,
69 struct moving_io, s.cl); 69 struct moving_io, cl);
70 70
71 if (error) 71 if (error)
72 io->s.error = error; 72 io->op.error = error;
73 73
74 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); 74 bch_bbio_endio(io->op.c, bio, error, "reading data to move");
75} 75}
76 76
77static void moving_init(struct moving_io *io) 77static void moving_init(struct moving_io *io)
@@ -85,54 +85,53 @@ static void moving_init(struct moving_io *io)
85 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 85 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
87 PAGE_SECTORS); 87 PAGE_SECTORS);
88 bio->bi_private = &io->s.cl; 88 bio->bi_private = &io->cl;
89 bio->bi_io_vec = bio->bi_inline_vecs; 89 bio->bi_io_vec = bio->bi_inline_vecs;
90 bch_bio_map(bio, NULL); 90 bch_bio_map(bio, NULL);
91} 91}
92 92
93static void write_moving(struct closure *cl) 93static void write_moving(struct closure *cl)
94{ 94{
95 struct search *s = container_of(cl, struct search, cl); 95 struct moving_io *io = container_of(cl, struct moving_io, cl);
96 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct data_insert_op *op = &io->op;
97 97
98 if (!s->error) { 98 if (!op->error) {
99 moving_init(io); 99 moving_init(io);
100 100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
102 s->op.lock = -1; 102 op->write_prio = 1;
103 s->op.write_prio = 1; 103 op->bio = &io->bio.bio;
104 s->op.cache_bio = &io->bio.bio;
105 104
106 s->writeback = KEY_DIRTY(&io->w->key); 105 op->writeback = KEY_DIRTY(&io->w->key);
107 s->op.csum = KEY_CSUM(&io->w->key); 106 op->csum = KEY_CSUM(&io->w->key);
108 107
109 s->op.type = BTREE_REPLACE; 108 bkey_copy(&op->replace_key, &io->w->key);
110 bkey_copy(&s->op.replace, &io->w->key); 109 op->replace = true;
111 110
112 closure_init(&s->op.cl, cl); 111 closure_call(&op->cl, bch_data_insert, NULL, cl);
113 bch_insert_data(&s->op.cl);
114 } 112 }
115 113
116 continue_at(cl, write_moving_finish, NULL); 114 continue_at(cl, write_moving_finish, system_wq);
117} 115}
118 116
119static void read_moving_submit(struct closure *cl) 117static void read_moving_submit(struct closure *cl)
120{ 118{
121 struct search *s = container_of(cl, struct search, cl); 119 struct moving_io *io = container_of(cl, struct moving_io, cl);
122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio; 120 struct bio *bio = &io->bio.bio;
124 121
125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 122 bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
126 123
127 continue_at(cl, write_moving, bch_gc_wq); 124 continue_at(cl, write_moving, system_wq);
128} 125}
129 126
130static void read_moving(struct closure *cl) 127static void read_moving(struct cache_set *c)
131{ 128{
132 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
133 struct keybuf_key *w; 129 struct keybuf_key *w;
134 struct moving_io *io; 130 struct moving_io *io;
135 struct bio *bio; 131 struct bio *bio;
132 struct closure cl;
133
134 closure_init_stack(&cl);
136 135
137 /* XXX: if we error, background writeback could stall indefinitely */ 136 /* XXX: if we error, background writeback could stall indefinitely */
138 137
@@ -150,8 +149,8 @@ static void read_moving(struct closure *cl)
150 149
151 w->private = io; 150 w->private = io;
152 io->w = w; 151 io->w = w;
153 io->s.op.inode = KEY_INODE(&w->key); 152 io->op.inode = KEY_INODE(&w->key);
154 io->s.op.c = c; 153 io->op.c = c;
155 154
156 moving_init(io); 155 moving_init(io);
157 bio = &io->bio.bio; 156 bio = &io->bio.bio;
@@ -164,13 +163,8 @@ static void read_moving(struct closure *cl)
164 163
165 trace_bcache_gc_copy(&w->key); 164 trace_bcache_gc_copy(&w->key);
166 165
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 166 down(&c->moving_in_flight);
168 167 closure_call(&io->cl, read_moving_submit, NULL, &cl);
169 if (atomic_inc_return(&c->in_flight) >= 64) {
170 closure_wait_event(&c->moving_gc_wait, cl,
171 atomic_read(&c->in_flight) < 64);
172 continue_at(cl, read_moving, bch_gc_wq);
173 }
174 } 168 }
175 169
176 if (0) { 170 if (0) {
@@ -180,7 +174,7 @@ err: if (!IS_ERR_OR_NULL(w->private))
180 bch_keybuf_del(&c->moving_gc_keys, w); 174 bch_keybuf_del(&c->moving_gc_keys, w);
181 } 175 }
182 176
183 closure_return(cl); 177 closure_sync(&cl);
184} 178}
185 179
186static bool bucket_cmp(struct bucket *l, struct bucket *r) 180static bool bucket_cmp(struct bucket *l, struct bucket *r)
@@ -193,15 +187,14 @@ static unsigned bucket_heap_top(struct cache *ca)
193 return GC_SECTORS_USED(heap_peek(&ca->heap)); 187 return GC_SECTORS_USED(heap_peek(&ca->heap));
194} 188}
195 189
196void bch_moving_gc(struct closure *cl) 190void bch_moving_gc(struct cache_set *c)
197{ 191{
198 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
199 struct cache *ca; 192 struct cache *ca;
200 struct bucket *b; 193 struct bucket *b;
201 unsigned i; 194 unsigned i;
202 195
203 if (!c->copy_gc_enabled) 196 if (!c->copy_gc_enabled)
204 closure_return(cl); 197 return;
205 198
206 mutex_lock(&c->bucket_lock); 199 mutex_lock(&c->bucket_lock);
207 200
@@ -242,13 +235,11 @@ void bch_moving_gc(struct closure *cl)
242 235
243 c->moving_gc_keys.last_scanned = ZERO_KEY; 236 c->moving_gc_keys.last_scanned = ZERO_KEY;
244 237
245 closure_init(&c->moving_gc, cl); 238 read_moving(c);
246 read_moving(&c->moving_gc);
247
248 closure_return(cl);
249} 239}
250 240
251void bch_moving_init_cache_set(struct cache_set *c) 241void bch_moving_init_cache_set(struct cache_set *c)
252{ 242{
253 bch_keybuf_init(&c->moving_gc_keys); 243 bch_keybuf_init(&c->moving_gc_keys);
244 sema_init(&c->moving_in_flight, 64);
254} 245}
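
The movinggc.c changes above drop the old closure-based accounting of in-flight moving IOs (an atomic counter capped at 64 plus closure_wait_event()) in favour of a plain counting semaphore: bch_moving_init_cache_set() now does sema_init(&c->moving_in_flight, 64) and read_moving() calls down() before launching each copy. A minimal sketch of that throttling pattern using only the generic semaphore API; the helper names below (init_throttle, start_one_io, one_io_done) are illustrative, not bcache functions:

#include <linux/semaphore.h>

static struct semaphore in_flight;

static void init_throttle(void)
{
	/* permit at most 64 outstanding IOs, as bch_moving_init_cache_set() does */
	sema_init(&in_flight, 64);
}

static void start_one_io(void)
{
	/* sleeps here once 64 IOs are already in flight */
	down(&in_flight);

	/* ...submit the IO; its completion path must call one_io_done()... */
}

static void one_io_done(void)
{
	/* wakes the next submitter blocked in down() */
	up(&in_flight);
}
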
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 2a7f0dd6abab..fbcc851ed5a5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@
25 25
26struct kmem_cache *bch_search_cache; 26struct kmem_cache *bch_search_cache;
27 27
28static void check_should_skip(struct cached_dev *, struct search *); 28static void bch_data_insert_start(struct closure *);
29 29
30/* Cgroup interface */ 30/* Cgroup interface */
31 31
@@ -213,221 +213,79 @@ static void bio_csum(struct bio *bio, struct bkey *k)
213 213
214/* Insert data into cache */ 214/* Insert data into cache */
215 215
216static void bio_invalidate(struct closure *cl) 216static void bch_data_insert_keys(struct closure *cl)
217{ 217{
218 struct btree_op *op = container_of(cl, struct btree_op, cl); 218 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
219 struct bio *bio = op->cache_bio; 219 atomic_t *journal_ref = NULL;
220 220 struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
221 pr_debug("invalidating %i sectors from %llu", 221 int ret;
222 bio_sectors(bio), (uint64_t) bio->bi_sector);
223
224 while (bio_sectors(bio)) {
225 unsigned len = min(bio_sectors(bio), 1U << 14);
226
227 if (bch_keylist_realloc(&op->keys, 0, op->c))
228 goto out;
229
230 bio->bi_sector += len;
231 bio->bi_size -= len << 9;
232
233 bch_keylist_add(&op->keys,
234 &KEY(op->inode, bio->bi_sector, len));
235 }
236
237 op->insert_data_done = true;
238 bio_put(bio);
239out:
240 continue_at(cl, bch_journal, bcache_wq);
241}
242
243struct open_bucket {
244 struct list_head list;
245 struct task_struct *last;
246 unsigned sectors_free;
247 BKEY_PADDED(key);
248};
249
250void bch_open_buckets_free(struct cache_set *c)
251{
252 struct open_bucket *b;
253
254 while (!list_empty(&c->data_buckets)) {
255 b = list_first_entry(&c->data_buckets,
256 struct open_bucket, list);
257 list_del(&b->list);
258 kfree(b);
259 }
260}
261
262int bch_open_buckets_alloc(struct cache_set *c)
263{
264 int i;
265
266 spin_lock_init(&c->data_bucket_lock);
267
268 for (i = 0; i < 6; i++) {
269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
270 if (!b)
271 return -ENOMEM;
272
273 list_add(&b->list, &c->data_buckets);
274 }
275
276 return 0;
277}
278
279/*
280 * We keep multiple buckets open for writes, and try to segregate different
281 * write streams for better cache utilization: first we look for a bucket where
282 * the last write to it was sequential with the current write, and failing that
283 * we look for a bucket that was last used by the same task.
284 *
285 * The idea is if you've got multiple tasks pulling data into the cache at the
286 * same time, you'll get better cache utilization if you try to segregate their
287 * data and preserve locality.
288 *
289 * For example, say you're starting Firefox at the same time you're copying a
290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
291 * cache awhile, but the data you copied might not be; if you wrote all that
292 * data to the same buckets it'd get invalidated at the same time.
293 *
294 * Both of those tasks will be doing fairly random IO so we can't rely on
295 * detecting sequential IO to segregate their data, but going off of the task
296 * should be a sane heuristic.
297 */
298static struct open_bucket *pick_data_bucket(struct cache_set *c,
299 const struct bkey *search,
300 struct task_struct *task,
301 struct bkey *alloc)
302{
303 struct open_bucket *ret, *ret_task = NULL;
304
305 list_for_each_entry_reverse(ret, &c->data_buckets, list)
306 if (!bkey_cmp(&ret->key, search))
307 goto found;
308 else if (ret->last == task)
309 ret_task = ret;
310
311 ret = ret_task ?: list_first_entry(&c->data_buckets,
312 struct open_bucket, list);
313found:
314 if (!ret->sectors_free && KEY_PTRS(alloc)) {
315 ret->sectors_free = c->sb.bucket_size;
316 bkey_copy(&ret->key, alloc);
317 bkey_init(alloc);
318 }
319
320 if (!ret->sectors_free)
321 ret = NULL;
322
323 return ret;
324}
325
326/*
327 * Allocates some space in the cache to write to, and sets k to point to the newly
328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
329 * end of the newly allocated space).
330 *
331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
332 * sectors were actually allocated.
333 *
334 * If s->writeback is true, will not fail.
335 */
336static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
337 struct search *s)
338{
339 struct cache_set *c = s->op.c;
340 struct open_bucket *b;
341 BKEY_PADDED(key) alloc;
342 struct closure cl, *w = NULL;
343 unsigned i;
344
345 if (s->writeback) {
346 closure_init_stack(&cl);
347 w = &cl;
348 }
349 222
350 /* 223 /*
351 * We might have to allocate a new bucket, which we can't do with a 224 * If we're looping, we might already be waiting on
352 * spinlock held. So if we have to allocate, we drop the lock, allocate 225 * another journal write - can't wait on more than one journal write at
353 * and then retry. KEY_PTRS() indicates whether alloc points to 226 * a time
354 * allocated bucket(s). 227 *
228 * XXX: this looks wrong
355 */ 229 */
230#if 0
231 while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
232 closure_sync(&s->cl);
233#endif
356 234
357 bkey_init(&alloc.key); 235 if (!op->replace)
358 spin_lock(&c->data_bucket_lock); 236 journal_ref = bch_journal(op->c, &op->insert_keys,
359 237 op->flush_journal ? cl : NULL);
360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
361 unsigned watermark = s->op.write_prio
362 ? WATERMARK_MOVINGGC
363 : WATERMARK_NONE;
364
365 spin_unlock(&c->data_bucket_lock);
366
367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
368 return false;
369 238
370 spin_lock(&c->data_bucket_lock); 239 ret = bch_btree_insert(op->c, &op->insert_keys,
240 journal_ref, replace_key);
241 if (ret == -ESRCH) {
242 op->replace_collision = true;
243 } else if (ret) {
244 op->error = -ENOMEM;
245 op->insert_data_done = true;
371 } 246 }
372 247
373 /* 248 if (journal_ref)
374 * If we had to allocate, we might race and not need to allocate the 249 atomic_dec_bug(journal_ref);
375 * second time we call find_data_bucket(). If we allocated a bucket but
376 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
377 */
378 if (KEY_PTRS(&alloc.key))
379 __bkey_put(c, &alloc.key);
380
381 for (i = 0; i < KEY_PTRS(&b->key); i++)
382 EBUG_ON(ptr_stale(c, &b->key, i));
383 250
384 /* Set up the pointer to the space we're allocating: */ 251 if (!op->insert_data_done)
252 continue_at(cl, bch_data_insert_start, bcache_wq);
385 253
386 for (i = 0; i < KEY_PTRS(&b->key); i++) 254 bch_keylist_free(&op->insert_keys);
387 k->ptr[i] = b->key.ptr[i]; 255 closure_return(cl);
256}
388 257
389 sectors = min(sectors, b->sectors_free); 258static void bch_data_invalidate(struct closure *cl)
259{
260 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
261 struct bio *bio = op->bio;
390 262
391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 263 pr_debug("invalidating %i sectors from %llu",
392 SET_KEY_SIZE(k, sectors); 264 bio_sectors(bio), (uint64_t) bio->bi_sector);
393 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
394 265
395 /* 266 while (bio_sectors(bio)) {
396 * Move b to the end of the lru, and keep track of what this bucket was 267 unsigned sectors = min(bio_sectors(bio),
397 * last used for: 268 1U << (KEY_SIZE_BITS - 1));
398 */
399 list_move_tail(&b->list, &c->data_buckets);
400 bkey_copy_key(&b->key, k);
401 b->last = s->task;
402 269
403 b->sectors_free -= sectors; 270 if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
271 goto out;
404 272
405 for (i = 0; i < KEY_PTRS(&b->key); i++) { 273 bio->bi_sector += sectors;
406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 274 bio->bi_size -= sectors << 9;
407 275
408 atomic_long_add(sectors, 276 bch_keylist_add(&op->insert_keys,
409 &PTR_CACHE(c, &b->key, i)->sectors_written); 277 &KEY(op->inode, bio->bi_sector, sectors));
410 } 278 }
411 279
412 if (b->sectors_free < c->sb.block_size) 280 op->insert_data_done = true;
413 b->sectors_free = 0; 281 bio_put(bio);
414 282out:
415 /* 283 continue_at(cl, bch_data_insert_keys, bcache_wq);
416 * k takes refcounts on the buckets it points to until it's inserted
417 * into the btree, but if we're done with this bucket we just transfer
418 * get_data_bucket()'s refcount.
419 */
420 if (b->sectors_free)
421 for (i = 0; i < KEY_PTRS(&b->key); i++)
422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
423
424 spin_unlock(&c->data_bucket_lock);
425 return true;
426} 284}
427 285
428static void bch_insert_data_error(struct closure *cl) 286static void bch_data_insert_error(struct closure *cl)
429{ 287{
430 struct btree_op *op = container_of(cl, struct btree_op, cl); 288 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
431 289
432 /* 290 /*
433 * Our data write just errored, which means we've got a bunch of keys to 291 * Our data write just errored, which means we've got a bunch of keys to
@@ -438,35 +296,34 @@ static void bch_insert_data_error(struct closure *cl)
438 * from the keys we'll accomplish just that. 296 * from the keys we'll accomplish just that.
439 */ 297 */
440 298
441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; 299 struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
442 300
443 while (src != op->keys.top) { 301 while (src != op->insert_keys.top) {
444 struct bkey *n = bkey_next(src); 302 struct bkey *n = bkey_next(src);
445 303
446 SET_KEY_PTRS(src, 0); 304 SET_KEY_PTRS(src, 0);
447 bkey_copy(dst, src); 305 memmove(dst, src, bkey_bytes(src));
448 306
449 dst = bkey_next(dst); 307 dst = bkey_next(dst);
450 src = n; 308 src = n;
451 } 309 }
452 310
453 op->keys.top = dst; 311 op->insert_keys.top = dst;
454 312
455 bch_journal(cl); 313 bch_data_insert_keys(cl);
456} 314}
457 315
458static void bch_insert_data_endio(struct bio *bio, int error) 316static void bch_data_insert_endio(struct bio *bio, int error)
459{ 317{
460 struct closure *cl = bio->bi_private; 318 struct closure *cl = bio->bi_private;
461 struct btree_op *op = container_of(cl, struct btree_op, cl); 319 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
462 struct search *s = container_of(op, struct search, op);
463 320
464 if (error) { 321 if (error) {
465 /* TODO: We could try to recover from this. */ 322 /* TODO: We could try to recover from this. */
466 if (s->writeback) 323 if (op->writeback)
467 s->error = error; 324 op->error = error;
468 else if (s->write) 325 else if (!op->replace)
469 set_closure_fn(cl, bch_insert_data_error, bcache_wq); 326 set_closure_fn(cl, bch_data_insert_error, bcache_wq);
470 else 327 else
471 set_closure_fn(cl, NULL, NULL); 328 set_closure_fn(cl, NULL, NULL);
472 } 329 }
@@ -474,18 +331,17 @@ static void bch_insert_data_endio(struct bio *bio, int error)
474 bch_bbio_endio(op->c, bio, error, "writing data to cache"); 331 bch_bbio_endio(op->c, bio, error, "writing data to cache");
475} 332}
476 333
477static void bch_insert_data_loop(struct closure *cl) 334static void bch_data_insert_start(struct closure *cl)
478{ 335{
479 struct btree_op *op = container_of(cl, struct btree_op, cl); 336 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
480 struct search *s = container_of(op, struct search, op); 337 struct bio *bio = op->bio, *n;
481 struct bio *bio = op->cache_bio, *n;
482 338
483 if (op->skip) 339 if (op->bypass)
484 return bio_invalidate(cl); 340 return bch_data_invalidate(cl);
485 341
486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 342 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
487 set_gc_sectors(op->c); 343 set_gc_sectors(op->c);
488 bch_queue_gc(op->c); 344 wake_up_gc(op->c);
489 } 345 }
490 346
491 /* 347 /*
@@ -497,29 +353,30 @@ static void bch_insert_data_loop(struct closure *cl)
497 do { 353 do {
498 unsigned i; 354 unsigned i;
499 struct bkey *k; 355 struct bkey *k;
500 struct bio_set *split = s->d 356 struct bio_set *split = op->c->bio_split;
501 ? s->d->bio_split : op->c->bio_split;
502 357
503 /* 1 for the device pointer and 1 for the chksum */ 358 /* 1 for the device pointer and 1 for the chksum */
504 if (bch_keylist_realloc(&op->keys, 359 if (bch_keylist_realloc(&op->insert_keys,
505 1 + (op->csum ? 1 : 0), 360 1 + (op->csum ? 1 : 0),
506 op->c)) 361 op->c))
507 continue_at(cl, bch_journal, bcache_wq); 362 continue_at(cl, bch_data_insert_keys, bcache_wq);
508 363
509 k = op->keys.top; 364 k = op->insert_keys.top;
510 bkey_init(k); 365 bkey_init(k);
511 SET_KEY_INODE(k, op->inode); 366 SET_KEY_INODE(k, op->inode);
512 SET_KEY_OFFSET(k, bio->bi_sector); 367 SET_KEY_OFFSET(k, bio->bi_sector);
513 368
514 if (!bch_alloc_sectors(k, bio_sectors(bio), s)) 369 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
370 op->write_point, op->write_prio,
371 op->writeback))
515 goto err; 372 goto err;
516 373
517 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 374 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
518 375
519 n->bi_end_io = bch_insert_data_endio; 376 n->bi_end_io = bch_data_insert_endio;
520 n->bi_private = cl; 377 n->bi_private = cl;
521 378
522 if (s->writeback) { 379 if (op->writeback) {
523 SET_KEY_DIRTY(k, true); 380 SET_KEY_DIRTY(k, true);
524 381
525 for (i = 0; i < KEY_PTRS(k); i++) 382 for (i = 0; i < KEY_PTRS(k); i++)
@@ -532,17 +389,17 @@ static void bch_insert_data_loop(struct closure *cl)
532 bio_csum(n, k); 389 bio_csum(n, k);
533 390
534 trace_bcache_cache_insert(k); 391 trace_bcache_cache_insert(k);
535 bch_keylist_push(&op->keys); 392 bch_keylist_push(&op->insert_keys);
536 393
537 n->bi_rw |= REQ_WRITE; 394 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0); 395 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio); 396 } while (n != bio);
540 397
541 op->insert_data_done = true; 398 op->insert_data_done = true;
542 continue_at(cl, bch_journal, bcache_wq); 399 continue_at(cl, bch_data_insert_keys, bcache_wq);
543err: 400err:
544 /* bch_alloc_sectors() blocks if s->writeback = true */ 401 /* bch_alloc_sectors() blocks if s->writeback = true */
545 BUG_ON(s->writeback); 402 BUG_ON(op->writeback);
546 403
547 /* 404 /*
548 * But if it's not a writeback write we'd rather just bail out if 405 * But if it's not a writeback write we'd rather just bail out if
@@ -550,15 +407,15 @@ err:
550 * we might be starving btree writes for gc or something. 407 * we might be starving btree writes for gc or something.
551 */ 408 */
552 409
553 if (s->write) { 410 if (!op->replace) {
554 /* 411 /*
555 * Writethrough write: We can't complete the write until we've 412 * Writethrough write: We can't complete the write until we've
556 * updated the index. But we don't want to delay the write while 413 * updated the index. But we don't want to delay the write while
557 * we wait for buckets to be freed up, so just invalidate the 414 * we wait for buckets to be freed up, so just invalidate the
558 * rest of the write. 415 * rest of the write.
559 */ 416 */
560 op->skip = true; 417 op->bypass = true;
561 return bio_invalidate(cl); 418 return bch_data_invalidate(cl);
562 } else { 419 } else {
563 /* 420 /*
564 * From a cache miss, we can just insert the keys for the data 421 * From a cache miss, we can just insert the keys for the data
@@ -567,15 +424,15 @@ err:
567 op->insert_data_done = true; 424 op->insert_data_done = true;
568 bio_put(bio); 425 bio_put(bio);
569 426
570 if (!bch_keylist_empty(&op->keys)) 427 if (!bch_keylist_empty(&op->insert_keys))
571 continue_at(cl, bch_journal, bcache_wq); 428 continue_at(cl, bch_data_insert_keys, bcache_wq);
572 else 429 else
573 closure_return(cl); 430 closure_return(cl);
574 } 431 }
575} 432}
576 433
577/** 434/**
578 * bch_insert_data - stick some data in the cache 435 * bch_data_insert - stick some data in the cache
579 * 436 *
580 * This is the starting point for any data to end up in a cache device; it could 437 * This is the starting point for any data to end up in a cache device; it could
581 * be from a normal write, or a writeback write, or a write to a flash only 438 * be from a normal write, or a writeback write, or a write to a flash only
@@ -587,56 +444,179 @@ err:
587 * data is written it calls bch_journal, and after the keys have been added to 444 * data is written it calls bch_journal, and after the keys have been added to
588 * the next journal write they're inserted into the btree. 445 * the next journal write they're inserted into the btree.
589 * 446 *
590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset, 447 * It inserts the data in op->bio; bi_sector is used for the key offset,
591 * and op->inode is used for the key inode. 448 * and op->inode is used for the key inode.
592 * 449 *
593 * If op->skip is true, instead of inserting the data it invalidates the region 450 * If op->bypass is true, instead of inserting the data it invalidates the
594 * of the cache represented by op->cache_bio and op->inode. 451 * region of the cache represented by op->bio and op->inode.
595 */ 452 */
596void bch_insert_data(struct closure *cl) 453void bch_data_insert(struct closure *cl)
597{ 454{
598 struct btree_op *op = container_of(cl, struct btree_op, cl); 455 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
456
457 trace_bcache_write(op->bio, op->writeback, op->bypass);
599 458
600 bch_keylist_init(&op->keys); 459 bch_keylist_init(&op->insert_keys);
601 bio_get(op->cache_bio); 460 bio_get(op->bio);
602 bch_insert_data_loop(cl); 461 bch_data_insert_start(cl);
603} 462}
604 463
605void bch_btree_insert_async(struct closure *cl) 464/* Congested? */
465
466unsigned bch_get_congested(struct cache_set *c)
606{ 467{
607 struct btree_op *op = container_of(cl, struct btree_op, cl); 468 int i;
608 struct search *s = container_of(op, struct search, op); 469 long rand;
609 470
610 if (bch_btree_insert(op, op->c)) { 471 if (!c->congested_read_threshold_us &&
611 s->error = -ENOMEM; 472 !c->congested_write_threshold_us)
612 op->insert_data_done = true; 473 return 0;
613 } 474
475 i = (local_clock_us() - c->congested_last_us) / 1024;
476 if (i < 0)
477 return 0;
478
479 i += atomic_read(&c->congested);
480 if (i >= 0)
481 return 0;
614 482
615 if (op->insert_data_done) { 483 i += CONGESTED_MAX;
616 bch_keylist_free(&op->keys); 484
617 closure_return(cl); 485 if (i > 0)
618 } else 486 i = fract_exp_two(i, 6);
619 continue_at(cl, bch_insert_data_loop, bcache_wq); 487
488 rand = get_random_int();
489 i -= bitmap_weight(&rand, BITS_PER_LONG);
490
491 return i > 0 ? i : 1;
620} 492}
621 493
622/* Common code for the make_request functions */ 494static void add_sequential(struct task_struct *t)
495{
496 ewma_add(t->sequential_io_avg,
497 t->sequential_io, 8, 0);
623 498
624static void request_endio(struct bio *bio, int error) 499 t->sequential_io = 0;
500}
501
502static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
625{ 503{
626 struct closure *cl = bio->bi_private; 504 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
505}
627 506
628 if (error) { 507static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
629 struct search *s = container_of(cl, struct search, cl); 508{
630 s->error = error; 509 struct cache_set *c = dc->disk.c;
631 /* Only cache read errors are recoverable */ 510 unsigned mode = cache_mode(dc, bio);
632 s->recoverable = false; 511 unsigned sectors, congested = bch_get_congested(c);
512 struct task_struct *task = current;
513 struct io *i;
514
515 if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
516 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
517 (bio->bi_rw & REQ_DISCARD))
518 goto skip;
519
520 if (mode == CACHE_MODE_NONE ||
521 (mode == CACHE_MODE_WRITEAROUND &&
522 (bio->bi_rw & REQ_WRITE)))
523 goto skip;
524
525 if (bio->bi_sector & (c->sb.block_size - 1) ||
526 bio_sectors(bio) & (c->sb.block_size - 1)) {
527 pr_debug("skipping unaligned io");
528 goto skip;
633 } 529 }
634 530
635 bio_put(bio); 531 if (bypass_torture_test(dc)) {
636 closure_put(cl); 532 if ((get_random_int() & 3) == 3)
533 goto skip;
534 else
535 goto rescale;
536 }
537
538 if (!congested && !dc->sequential_cutoff)
539 goto rescale;
540
541 if (!congested &&
542 mode == CACHE_MODE_WRITEBACK &&
543 (bio->bi_rw & REQ_WRITE) &&
544 (bio->bi_rw & REQ_SYNC))
545 goto rescale;
546
547 spin_lock(&dc->io_lock);
548
549 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
550 if (i->last == bio->bi_sector &&
551 time_before(jiffies, i->jiffies))
552 goto found;
553
554 i = list_first_entry(&dc->io_lru, struct io, lru);
555
556 add_sequential(task);
557 i->sequential = 0;
558found:
559 if (i->sequential + bio->bi_size > i->sequential)
560 i->sequential += bio->bi_size;
561
562 i->last = bio_end_sector(bio);
563 i->jiffies = jiffies + msecs_to_jiffies(5000);
564 task->sequential_io = i->sequential;
565
566 hlist_del(&i->hash);
567 hlist_add_head(&i->hash, iohash(dc, i->last));
568 list_move_tail(&i->lru, &dc->io_lru);
569
570 spin_unlock(&dc->io_lock);
571
572 sectors = max(task->sequential_io,
573 task->sequential_io_avg) >> 9;
574
575 if (dc->sequential_cutoff &&
576 sectors >= dc->sequential_cutoff >> 9) {
577 trace_bcache_bypass_sequential(bio);
578 goto skip;
579 }
580
581 if (congested && sectors >= congested) {
582 trace_bcache_bypass_congested(bio);
583 goto skip;
584 }
585
586rescale:
587 bch_rescale_priorities(c, bio_sectors(bio));
588 return false;
589skip:
590 bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
591 return true;
637} 592}
638 593
639void bch_cache_read_endio(struct bio *bio, int error) 594/* Cache lookup */
595
596struct search {
597 /* Stack frame for bio_complete */
598 struct closure cl;
599
600 struct bcache_device *d;
601
602 struct bbio bio;
603 struct bio *orig_bio;
604 struct bio *cache_miss;
605
606 unsigned insert_bio_sectors;
607
608 unsigned recoverable:1;
609 unsigned unaligned_bvec:1;
610 unsigned write:1;
611 unsigned read_dirty_data:1;
612
613 unsigned long start_time;
614
615 struct btree_op op;
616 struct data_insert_op iop;
617};
618
619static void bch_cache_read_endio(struct bio *bio, int error)
640{ 620{
641 struct bbio *b = container_of(bio, struct bbio, bio); 621 struct bbio *b = container_of(bio, struct bbio, bio);
642 struct closure *cl = bio->bi_private; 622 struct closure *cl = bio->bi_private;
@@ -650,13 +630,113 @@ void bch_cache_read_endio(struct bio *bio, int error)
650 */ 630 */
651 631
652 if (error) 632 if (error)
653 s->error = error; 633 s->iop.error = error;
654 else if (ptr_stale(s->op.c, &b->key, 0)) { 634 else if (ptr_stale(s->iop.c, &b->key, 0)) {
655 atomic_long_inc(&s->op.c->cache_read_races); 635 atomic_long_inc(&s->iop.c->cache_read_races);
656 s->error = -EINTR; 636 s->iop.error = -EINTR;
657 } 637 }
658 638
659 bch_bbio_endio(s->op.c, bio, error, "reading from cache"); 639 bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
640}
641
642/*
643 * Read from a single key, handling the initial cache miss if the key starts in
644 * the middle of the bio
645 */
646static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
647{
648 struct search *s = container_of(op, struct search, op);
649 struct bio *n, *bio = &s->bio.bio;
650 struct bkey *bio_key;
651 unsigned ptr;
652
653 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
654 return MAP_CONTINUE;
655
656 if (KEY_INODE(k) != s->iop.inode ||
657 KEY_START(k) > bio->bi_sector) {
658 unsigned bio_sectors = bio_sectors(bio);
659 unsigned sectors = KEY_INODE(k) == s->iop.inode
660 ? min_t(uint64_t, INT_MAX,
661 KEY_START(k) - bio->bi_sector)
662 : INT_MAX;
663
664 int ret = s->d->cache_miss(b, s, bio, sectors);
665 if (ret != MAP_CONTINUE)
666 return ret;
667
668 /* if this was a complete miss we shouldn't get here */
669 BUG_ON(bio_sectors <= sectors);
670 }
671
672 if (!KEY_SIZE(k))
673 return MAP_CONTINUE;
674
675 /* XXX: figure out best pointer - for multiple cache devices */
676 ptr = 0;
677
678 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
679
680 if (KEY_DIRTY(k))
681 s->read_dirty_data = true;
682
683 n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
684 KEY_OFFSET(k) - bio->bi_sector),
685 GFP_NOIO, s->d->bio_split);
686
687 bio_key = &container_of(n, struct bbio, bio)->key;
688 bch_bkey_copy_single_ptr(bio_key, k, ptr);
689
690 bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
691 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
692
693 n->bi_end_io = bch_cache_read_endio;
694 n->bi_private = &s->cl;
695
696 /*
697 * The bucket we're reading from might be reused while our bio
698 * is in flight, and we could then end up reading the wrong
699 * data.
700 *
701 * We guard against this by checking (in cache_read_endio()) if
702 * the pointer is stale again; if so, we treat it as an error
703 * and reread from the backing device (but we don't pass that
704 * error up anywhere).
705 */
706
707 __bch_submit_bbio(n, b->c);
708 return n == bio ? MAP_DONE : MAP_CONTINUE;
709}
710
711static void cache_lookup(struct closure *cl)
712{
713 struct search *s = container_of(cl, struct search, iop.cl);
714 struct bio *bio = &s->bio.bio;
715
716 int ret = bch_btree_map_keys(&s->op, s->iop.c,
717 &KEY(s->iop.inode, bio->bi_sector, 0),
718 cache_lookup_fn, MAP_END_KEY);
719 if (ret == -EAGAIN)
720 continue_at(cl, cache_lookup, bcache_wq);
721
722 closure_return(cl);
723}
724
725/* Common code for the make_request functions */
726
727static void request_endio(struct bio *bio, int error)
728{
729 struct closure *cl = bio->bi_private;
730
731 if (error) {
732 struct search *s = container_of(cl, struct search, cl);
733 s->iop.error = error;
734 /* Only cache read errors are recoverable */
735 s->recoverable = false;
736 }
737
738 bio_put(bio);
739 closure_put(cl);
660} 740}
661 741
662static void bio_complete(struct search *s) 742static void bio_complete(struct search *s)
@@ -670,8 +750,8 @@ static void bio_complete(struct search *s)
670 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 750 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
671 part_stat_unlock(); 751 part_stat_unlock();
672 752
673 trace_bcache_request_end(s, s->orig_bio); 753 trace_bcache_request_end(s->d, s->orig_bio);
674 bio_endio(s->orig_bio, s->error); 754 bio_endio(s->orig_bio, s->iop.error);
675 s->orig_bio = NULL; 755 s->orig_bio = NULL;
676 } 756 }
677} 757}
@@ -691,8 +771,8 @@ static void search_free(struct closure *cl)
691 struct search *s = container_of(cl, struct search, cl); 771 struct search *s = container_of(cl, struct search, cl);
692 bio_complete(s); 772 bio_complete(s);
693 773
694 if (s->op.cache_bio) 774 if (s->iop.bio)
695 bio_put(s->op.cache_bio); 775 bio_put(s->iop.bio);
696 776
697 if (s->unaligned_bvec) 777 if (s->unaligned_bvec)
698 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 778 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
@@ -703,21 +783,22 @@ static void search_free(struct closure *cl)
703 783
704static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 784static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
705{ 785{
786 struct search *s;
706 struct bio_vec *bv; 787 struct bio_vec *bv;
707 struct search *s = mempool_alloc(d->c->search, GFP_NOIO); 788
708 memset(s, 0, offsetof(struct search, op.keys)); 789 s = mempool_alloc(d->c->search, GFP_NOIO);
790 memset(s, 0, offsetof(struct search, iop.insert_keys));
709 791
710 __closure_init(&s->cl, NULL); 792 __closure_init(&s->cl, NULL);
711 793
712 s->op.inode = d->id; 794 s->iop.inode = d->id;
713 s->op.c = d->c; 795 s->iop.c = d->c;
714 s->d = d; 796 s->d = d;
715 s->op.lock = -1; 797 s->op.lock = -1;
716 s->task = current; 798 s->iop.write_point = hash_long((unsigned long) current, 16);
717 s->orig_bio = bio; 799 s->orig_bio = bio;
718 s->write = (bio->bi_rw & REQ_WRITE) != 0; 800 s->write = (bio->bi_rw & REQ_WRITE) != 0;
719 s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 801 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
721 s->recoverable = 1; 802 s->recoverable = 1;
722 s->start_time = jiffies; 803 s->start_time = jiffies;
723 do_bio_hook(s); 804 do_bio_hook(s);
@@ -734,18 +815,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
734 return s; 815 return s;
735} 816}
736 817
737static void btree_read_async(struct closure *cl)
738{
739 struct btree_op *op = container_of(cl, struct btree_op, cl);
740
741 int ret = btree_root(search_recurse, op->c, op);
742
743 if (ret == -EAGAIN)
744 continue_at(cl, btree_read_async, bcache_wq);
745
746 closure_return(cl);
747}
748
749/* Cached devices */ 818/* Cached devices */
750 819
751static void cached_dev_bio_complete(struct closure *cl) 820static void cached_dev_bio_complete(struct closure *cl)
@@ -759,27 +828,28 @@ static void cached_dev_bio_complete(struct closure *cl)
759 828
760/* Process reads */ 829/* Process reads */
761 830
762static void cached_dev_read_complete(struct closure *cl) 831static void cached_dev_cache_miss_done(struct closure *cl)
763{ 832{
764 struct search *s = container_of(cl, struct search, cl); 833 struct search *s = container_of(cl, struct search, cl);
765 834
766 if (s->op.insert_collision) 835 if (s->iop.replace_collision)
767 bch_mark_cache_miss_collision(s); 836 bch_mark_cache_miss_collision(s->iop.c, s->d);
768 837
769 if (s->op.cache_bio) { 838 if (s->iop.bio) {
770 int i; 839 int i;
771 struct bio_vec *bv; 840 struct bio_vec *bv;
772 841
773 __bio_for_each_segment(bv, s->op.cache_bio, i, 0) 842 bio_for_each_segment_all(bv, s->iop.bio, i)
774 __free_page(bv->bv_page); 843 __free_page(bv->bv_page);
775 } 844 }
776 845
777 cached_dev_bio_complete(cl); 846 cached_dev_bio_complete(cl);
778} 847}
779 848
780static void request_read_error(struct closure *cl) 849static void cached_dev_read_error(struct closure *cl)
781{ 850{
782 struct search *s = container_of(cl, struct search, cl); 851 struct search *s = container_of(cl, struct search, cl);
852 struct bio *bio = &s->bio.bio;
783 struct bio_vec *bv; 853 struct bio_vec *bv;
784 int i; 854 int i;
785 855
@@ -787,7 +857,7 @@ static void request_read_error(struct closure *cl)
787 /* Retry from the backing device: */ 857 /* Retry from the backing device: */
788 trace_bcache_read_retry(s->orig_bio); 858 trace_bcache_read_retry(s->orig_bio);
789 859
790 s->error = 0; 860 s->iop.error = 0;
791 bv = s->bio.bio.bi_io_vec; 861 bv = s->bio.bio.bi_io_vec;
792 do_bio_hook(s); 862 do_bio_hook(s);
793 s->bio.bio.bi_io_vec = bv; 863 s->bio.bio.bi_io_vec = bv;
@@ -803,146 +873,148 @@ static void request_read_error(struct closure *cl)
803 873
804 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
805 875
806 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 876 closure_bio_submit(bio, cl, s->d);
807 } 877 }
808 878
809 continue_at(cl, cached_dev_read_complete, NULL); 879 continue_at(cl, cached_dev_cache_miss_done, NULL);
810} 880}
811 881
812static void request_read_done(struct closure *cl) 882static void cached_dev_read_done(struct closure *cl)
813{ 883{
814 struct search *s = container_of(cl, struct search, cl); 884 struct search *s = container_of(cl, struct search, cl);
815 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 885 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
816 886
817 /* 887 /*
818 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now 888 * We had a cache miss; cache_bio now contains data ready to be inserted
819 * contains data ready to be inserted into the cache. 889 * into the cache.
820 * 890 *
821 * First, we copy the data we just read from cache_bio's bounce buffers 891 * First, we copy the data we just read from cache_bio's bounce buffers
822 * to the buffers the original bio pointed to: 892 * to the buffers the original bio pointed to:
823 */ 893 */
824 894
825 if (s->op.cache_bio) { 895 if (s->iop.bio) {
826 bio_reset(s->op.cache_bio); 896 bio_reset(s->iop.bio);
827 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 897 s->iop.bio->bi_sector = s->cache_miss->bi_sector;
828 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
829 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 899 s->iop.bio->bi_size = s->insert_bio_sectors << 9;
830 bch_bio_map(s->op.cache_bio, NULL); 900 bch_bio_map(s->iop.bio, NULL);
831 901
832 bio_copy_data(s->cache_miss, s->op.cache_bio); 902 bio_copy_data(s->cache_miss, s->iop.bio);
833 903
834 bio_put(s->cache_miss); 904 bio_put(s->cache_miss);
835 s->cache_miss = NULL; 905 s->cache_miss = NULL;
836 } 906 }
837 907
838 if (verify(dc, &s->bio.bio) && s->recoverable) 908 if (verify(dc, &s->bio.bio) && s->recoverable &&
839 bch_data_verify(s); 909 !s->unaligned_bvec && !s->read_dirty_data)
910 bch_data_verify(dc, s->orig_bio);
840 911
841 bio_complete(s); 912 bio_complete(s);
842 913
843 if (s->op.cache_bio && 914 if (s->iop.bio &&
844 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { 915 !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
845 s->op.type = BTREE_REPLACE; 916 BUG_ON(!s->iop.replace);
846 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 917 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
847 } 918 }
848 919
849 continue_at(cl, cached_dev_read_complete, NULL); 920 continue_at(cl, cached_dev_cache_miss_done, NULL);
850} 921}
851 922
852static void request_read_done_bh(struct closure *cl) 923static void cached_dev_read_done_bh(struct closure *cl)
853{ 924{
854 struct search *s = container_of(cl, struct search, cl); 925 struct search *s = container_of(cl, struct search, cl);
855 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 926 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
856 927
857 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 928 bch_mark_cache_accounting(s->iop.c, s->d,
858 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); 929 !s->cache_miss, s->iop.bypass);
930 trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
859 931
860 if (s->error) 932 if (s->iop.error)
861 continue_at_nobarrier(cl, request_read_error, bcache_wq); 933 continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
862 else if (s->op.cache_bio || verify(dc, &s->bio.bio)) 934 else if (s->iop.bio || verify(dc, &s->bio.bio))
863 continue_at_nobarrier(cl, request_read_done, bcache_wq); 935 continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
864 else 936 else
865 continue_at_nobarrier(cl, cached_dev_read_complete, NULL); 937 continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
866} 938}
867 939
868static int cached_dev_cache_miss(struct btree *b, struct search *s, 940static int cached_dev_cache_miss(struct btree *b, struct search *s,
869 struct bio *bio, unsigned sectors) 941 struct bio *bio, unsigned sectors)
870{ 942{
871 int ret = 0; 943 int ret = MAP_CONTINUE;
872 unsigned reada; 944 unsigned reada = 0;
873 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 945 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
874 struct bio *miss; 946 struct bio *miss, *cache_bio;
875
876 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
877 if (miss == bio)
878 s->op.lookup_done = true;
879 947
880 miss->bi_end_io = request_endio; 948 if (s->cache_miss || s->iop.bypass) {
881 miss->bi_private = &s->cl; 949 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
882 950 ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
883 if (s->cache_miss || s->op.skip)
884 goto out_submit; 951 goto out_submit;
885
886 if (miss != bio ||
887 (bio->bi_rw & REQ_RAHEAD) ||
888 (bio->bi_rw & REQ_META) ||
889 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
890 reada = 0;
891 else {
892 reada = min(dc->readahead >> 9,
893 sectors - bio_sectors(miss));
894
895 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
896 reada = bdev_sectors(miss->bi_bdev) -
897 bio_end_sector(miss);
898 } 952 }
899 953
900 s->cache_bio_sectors = bio_sectors(miss) + reada; 954 if (!(bio->bi_rw & REQ_RAHEAD) &&
901 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, 955 !(bio->bi_rw & REQ_META) &&
902 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), 956 s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
903 dc->disk.bio_split); 957 reada = min_t(sector_t, dc->readahead >> 9,
958 bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
904 959
905 if (!s->op.cache_bio) 960 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
906 goto out_submit;
907 961
908 s->op.cache_bio->bi_sector = miss->bi_sector; 962 s->iop.replace_key = KEY(s->iop.inode,
909 s->op.cache_bio->bi_bdev = miss->bi_bdev; 963 bio->bi_sector + s->insert_bio_sectors,
910 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 964 s->insert_bio_sectors);
911 965
912 s->op.cache_bio->bi_end_io = request_endio; 966 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
913 s->op.cache_bio->bi_private = &s->cl; 967 if (ret)
968 return ret;
969
970 s->iop.replace = true;
971
972 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
914 973
915 /* btree_search_recurse()'s btree iterator is no good anymore */ 974 /* btree_search_recurse()'s btree iterator is no good anymore */
916 ret = -EINTR; 975 ret = miss == bio ? MAP_DONE : -EINTR;
917 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) 976
918 goto out_put; 977 cache_bio = bio_alloc_bioset(GFP_NOWAIT,
978 DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
979 dc->disk.bio_split);
980 if (!cache_bio)
981 goto out_submit;
982
983 cache_bio->bi_sector = miss->bi_sector;
984 cache_bio->bi_bdev = miss->bi_bdev;
985 cache_bio->bi_size = s->insert_bio_sectors << 9;
986
987 cache_bio->bi_end_io = request_endio;
988 cache_bio->bi_private = &s->cl;
919 989
920 bch_bio_map(s->op.cache_bio, NULL); 990 bch_bio_map(cache_bio, NULL);
921 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 991 if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
922 goto out_put; 992 goto out_put;
923 993
924 s->cache_miss = miss; 994 if (reada)
925 bio_get(s->op.cache_bio); 995 bch_mark_cache_readahead(s->iop.c, s->d);
926 996
927 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 997 s->cache_miss = miss;
998 s->iop.bio = cache_bio;
999 bio_get(cache_bio);
1000 closure_bio_submit(cache_bio, &s->cl, s->d);
928 1001
929 return ret; 1002 return ret;
930out_put: 1003out_put:
931 bio_put(s->op.cache_bio); 1004 bio_put(cache_bio);
932 s->op.cache_bio = NULL;
933out_submit: 1005out_submit:
1006 miss->bi_end_io = request_endio;
1007 miss->bi_private = &s->cl;
934 closure_bio_submit(miss, &s->cl, s->d); 1008 closure_bio_submit(miss, &s->cl, s->d);
935 return ret; 1009 return ret;
936} 1010}
937 1011
938static void request_read(struct cached_dev *dc, struct search *s) 1012static void cached_dev_read(struct cached_dev *dc, struct search *s)
939{ 1013{
940 struct closure *cl = &s->cl; 1014 struct closure *cl = &s->cl;
941 1015
942 check_should_skip(dc, s); 1016 closure_call(&s->iop.cl, cache_lookup, NULL, cl);
943 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1017 continue_at(cl, cached_dev_read_done_bh, NULL);
944
945 continue_at(cl, request_read_done_bh, NULL);
946} 1018}
947 1019
948/* Process writes */ 1020/* Process writes */
@@ -956,47 +1028,52 @@ static void cached_dev_write_complete(struct closure *cl)
956 cached_dev_bio_complete(cl); 1028 cached_dev_bio_complete(cl);
957} 1029}
958 1030
959static void request_write(struct cached_dev *dc, struct search *s) 1031static void cached_dev_write(struct cached_dev *dc, struct search *s)
960{ 1032{
961 struct closure *cl = &s->cl; 1033 struct closure *cl = &s->cl;
962 struct bio *bio = &s->bio.bio; 1034 struct bio *bio = &s->bio.bio;
963 struct bkey start, end; 1035 struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
964 start = KEY(dc->disk.id, bio->bi_sector, 0); 1036 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
965 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
966 1037
967 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 1038 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
968 1039
969 check_should_skip(dc, s);
970 down_read_non_owner(&dc->writeback_lock); 1040 down_read_non_owner(&dc->writeback_lock);
971
972 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 1041 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
973 s->op.skip = false; 1042 /*
974 s->writeback = true; 1043 * We overlap with some dirty data undergoing background
1044 * writeback, force this write to writeback
1045 */
1046 s->iop.bypass = false;
1047 s->iop.writeback = true;
975 } 1048 }
976 1049
1050 /*
1051 * Discards aren't _required_ to do anything, so skipping if
1052 * check_overlapping returned true is ok
1053 *
1054 * But check_overlapping drops dirty keys for which io hasn't started,
1055 * so we still want to call it.
1056 */
977 if (bio->bi_rw & REQ_DISCARD) 1057 if (bio->bi_rw & REQ_DISCARD)
978 goto skip; 1058 s->iop.bypass = true;
979 1059
980 if (should_writeback(dc, s->orig_bio, 1060 if (should_writeback(dc, s->orig_bio,
981 cache_mode(dc, bio), 1061 cache_mode(dc, bio),
982 s->op.skip)) { 1062 s->iop.bypass)) {
983 s->op.skip = false; 1063 s->iop.bypass = false;
984 s->writeback = true; 1064 s->iop.writeback = true;
985 } 1065 }
986 1066
987 if (s->op.skip) 1067 if (s->iop.bypass) {
988 goto skip; 1068 s->iop.bio = s->orig_bio;
989 1069 bio_get(s->iop.bio);
990 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
991 1070
992 if (!s->writeback) { 1071 if (!(bio->bi_rw & REQ_DISCARD) ||
993 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 1072 blk_queue_discard(bdev_get_queue(dc->bdev)))
994 dc->disk.bio_split); 1073 closure_bio_submit(bio, cl, s->d);
995 1074 } else if (s->iop.writeback) {
996 closure_bio_submit(bio, cl, s->d);
997 } else {
998 bch_writeback_add(dc); 1075 bch_writeback_add(dc);
999 s->op.cache_bio = bio; 1076 s->iop.bio = bio;
1000 1077
1001 if (bio->bi_rw & REQ_FLUSH) { 1078 if (bio->bi_rw & REQ_FLUSH) {
1002 /* Also need to send a flush to the backing device */ 1079 /* Also need to send a flush to the backing device */
@@ -1010,36 +1087,26 @@ static void request_write(struct cached_dev *dc, struct search *s)
1010 1087
1011 closure_bio_submit(flush, cl, s->d); 1088 closure_bio_submit(flush, cl, s->d);
1012 } 1089 }
1013 } 1090 } else {
1014out: 1091 s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
1015 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1092 dc->disk.bio_split);
1016 continue_at(cl, cached_dev_write_complete, NULL);
1017skip:
1018 s->op.skip = true;
1019 s->op.cache_bio = s->orig_bio;
1020 bio_get(s->op.cache_bio);
1021 1093
1022 if ((bio->bi_rw & REQ_DISCARD) && 1094 closure_bio_submit(bio, cl, s->d);
1023 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1095 }
1024 goto out;
1025 1096
1026 closure_bio_submit(bio, cl, s->d); 1097 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
1027 goto out; 1098 continue_at(cl, cached_dev_write_complete, NULL);
1028} 1099}
1029 1100
1030static void request_nodata(struct cached_dev *dc, struct search *s) 1101static void cached_dev_nodata(struct closure *cl)
1031{ 1102{
1032 struct closure *cl = &s->cl; 1103 struct search *s = container_of(cl, struct search, cl);
1033 struct bio *bio = &s->bio.bio; 1104 struct bio *bio = &s->bio.bio;
1034 1105
1035 if (bio->bi_rw & REQ_DISCARD) { 1106 if (s->iop.flush_journal)
1036 request_write(dc, s); 1107 bch_journal_meta(s->iop.c, cl);
1037 return;
1038 }
1039
1040 if (s->op.flush_journal)
1041 bch_journal_meta(s->op.c, cl);
1042 1108
1109 /* If it's a flush, we send the flush to the backing device too */
1043 closure_bio_submit(bio, cl, s->d); 1110 closure_bio_submit(bio, cl, s->d);
1044 1111
1045 continue_at(cl, cached_dev_bio_complete, NULL); 1112 continue_at(cl, cached_dev_bio_complete, NULL);
@@ -1047,134 +1114,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
1047 1114
1048/* Cached devices - read & write stuff */ 1115/* Cached devices - read & write stuff */
1049 1116
1050unsigned bch_get_congested(struct cache_set *c)
1051{
1052 int i;
1053 long rand;
1054
1055 if (!c->congested_read_threshold_us &&
1056 !c->congested_write_threshold_us)
1057 return 0;
1058
1059 i = (local_clock_us() - c->congested_last_us) / 1024;
1060 if (i < 0)
1061 return 0;
1062
1063 i += atomic_read(&c->congested);
1064 if (i >= 0)
1065 return 0;
1066
1067 i += CONGESTED_MAX;
1068
1069 if (i > 0)
1070 i = fract_exp_two(i, 6);
1071
1072 rand = get_random_int();
1073 i -= bitmap_weight(&rand, BITS_PER_LONG);
1074
1075 return i > 0 ? i : 1;
1076}
1077
1078static void add_sequential(struct task_struct *t)
1079{
1080 ewma_add(t->sequential_io_avg,
1081 t->sequential_io, 8, 0);
1082
1083 t->sequential_io = 0;
1084}
1085
1086static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1087{
1088 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1089}
1090
1091static void check_should_skip(struct cached_dev *dc, struct search *s)
1092{
1093 struct cache_set *c = s->op.c;
1094 struct bio *bio = &s->bio.bio;
1095 unsigned mode = cache_mode(dc, bio);
1096 unsigned sectors, congested = bch_get_congested(c);
1097
1098 if (atomic_read(&dc->disk.detaching) ||
1099 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1100 (bio->bi_rw & REQ_DISCARD))
1101 goto skip;
1102
1103 if (mode == CACHE_MODE_NONE ||
1104 (mode == CACHE_MODE_WRITEAROUND &&
1105 (bio->bi_rw & REQ_WRITE)))
1106 goto skip;
1107
1108 if (bio->bi_sector & (c->sb.block_size - 1) ||
1109 bio_sectors(bio) & (c->sb.block_size - 1)) {
1110 pr_debug("skipping unaligned io");
1111 goto skip;
1112 }
1113
1114 if (!congested && !dc->sequential_cutoff)
1115 goto rescale;
1116
1117 if (!congested &&
1118 mode == CACHE_MODE_WRITEBACK &&
1119 (bio->bi_rw & REQ_WRITE) &&
1120 (bio->bi_rw & REQ_SYNC))
1121 goto rescale;
1122
1123 if (dc->sequential_merge) {
1124 struct io *i;
1125
1126 spin_lock(&dc->io_lock);
1127
1128 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1129 if (i->last == bio->bi_sector &&
1130 time_before(jiffies, i->jiffies))
1131 goto found;
1132
1133 i = list_first_entry(&dc->io_lru, struct io, lru);
1134
1135 add_sequential(s->task);
1136 i->sequential = 0;
1137found:
1138 if (i->sequential + bio->bi_size > i->sequential)
1139 i->sequential += bio->bi_size;
1140
1141 i->last = bio_end_sector(bio);
1142 i->jiffies = jiffies + msecs_to_jiffies(5000);
1143 s->task->sequential_io = i->sequential;
1144
1145 hlist_del(&i->hash);
1146 hlist_add_head(&i->hash, iohash(dc, i->last));
1147 list_move_tail(&i->lru, &dc->io_lru);
1148
1149 spin_unlock(&dc->io_lock);
1150 } else {
1151 s->task->sequential_io = bio->bi_size;
1152
1153 add_sequential(s->task);
1154 }
1155
1156 sectors = max(s->task->sequential_io,
1157 s->task->sequential_io_avg) >> 9;
1158
1159 if (dc->sequential_cutoff &&
1160 sectors >= dc->sequential_cutoff >> 9) {
1161 trace_bcache_bypass_sequential(s->orig_bio);
1162 goto skip;
1163 }
1164
1165 if (congested && sectors >= congested) {
1166 trace_bcache_bypass_congested(s->orig_bio);
1167 goto skip;
1168 }
1169
1170rescale:
1171 bch_rescale_priorities(c, bio_sectors(bio));
1172 return;
1173skip:
1174 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1175 s->op.skip = true;
1176}
1177
1178static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 1117static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1179{ 1118{
1180 struct search *s; 1119 struct search *s;
@@ -1192,14 +1131,24 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1192 1131
1193 if (cached_dev_get(dc)) { 1132 if (cached_dev_get(dc)) {
1194 s = search_alloc(bio, d); 1133 s = search_alloc(bio, d);
1195 trace_bcache_request_start(s, bio); 1134 trace_bcache_request_start(s->d, bio);
1196 1135
1197 if (!bio_has_data(bio)) 1136 if (!bio->bi_size) {
1198 request_nodata(dc, s); 1137 /*
1199 else if (rw) 1138 * can't call bch_journal_meta from under
1200 request_write(dc, s); 1139 * generic_make_request
1201 else 1140 */
1202 request_read(dc, s); 1141 continue_at_nobarrier(&s->cl,
1142 cached_dev_nodata,
1143 bcache_wq);
1144 } else {
1145 s->iop.bypass = check_should_bypass(dc, bio);
1146
1147 if (rw)
1148 cached_dev_write(dc, s);
1149 else
1150 cached_dev_read(dc, s);
1151 }
1203 } else { 1152 } else {
1204 if ((bio->bi_rw & REQ_DISCARD) && 1153 if ((bio->bi_rw & REQ_DISCARD) &&
1205 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1154 !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1274,9 +1223,19 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
1274 bio_advance(bio, min(sectors << 9, bio->bi_size)); 1223 bio_advance(bio, min(sectors << 9, bio->bi_size));
1275 1224
1276 if (!bio->bi_size) 1225 if (!bio->bi_size)
1277 s->op.lookup_done = true; 1226 return MAP_DONE;
1278 1227
1279 return 0; 1228 return MAP_CONTINUE;
1229}
1230
1231static void flash_dev_nodata(struct closure *cl)
1232{
1233 struct search *s = container_of(cl, struct search, cl);
1234
1235 if (s->iop.flush_journal)
1236 bch_journal_meta(s->iop.c, cl);
1237
1238 continue_at(cl, search_free, NULL);
1280} 1239}
1281 1240
1282static void flash_dev_make_request(struct request_queue *q, struct bio *bio) 1241static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
@@ -1295,23 +1254,28 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1295 cl = &s->cl; 1254 cl = &s->cl;
1296 bio = &s->bio.bio; 1255 bio = &s->bio.bio;
1297 1256
1298 trace_bcache_request_start(s, bio); 1257 trace_bcache_request_start(s->d, bio);
1299 1258
1300 if (bio_has_data(bio) && !rw) { 1259 if (!bio->bi_size) {
1301 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1260 /*
1302 } else if (bio_has_data(bio) || s->op.skip) { 1261 * can't call bch_journal_meta from under
1303 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1262 * generic_make_request
1263 */
1264 continue_at_nobarrier(&s->cl,
1265 flash_dev_nodata,
1266 bcache_wq);
1267 } else if (rw) {
1268 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
1304 &KEY(d->id, bio->bi_sector, 0), 1269 &KEY(d->id, bio->bi_sector, 0),
1305 &KEY(d->id, bio_end_sector(bio), 0)); 1270 &KEY(d->id, bio_end_sector(bio), 0));
1306 1271
1307 s->writeback = true; 1272 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
1308 s->op.cache_bio = bio; 1273 s->iop.writeback = true;
1274 s->iop.bio = bio;
1309 1275
1310 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1276 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
1311 } else { 1277 } else {
1312 /* No data - probably a cache flush */ 1278 closure_call(&s->iop.cl, cache_lookup, NULL, cl);
1313 if (s->op.flush_journal)
1314 bch_journal_meta(s->op.c, cl);
1315 } 1279 }
1316 1280
1317 continue_at(cl, search_free, NULL); 1281 continue_at(cl, search_free, NULL);
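
The read path in the request.c hunks above no longer walks the btree by hand; cache_lookup() hands a callback to bch_btree_map_keys(), which invokes it once per key and interprets the return value: MAP_CONTINUE asks for the next key, MAP_DONE ends the walk, and -EAGAIN from the walk makes cache_lookup() requeue itself on bcache_wq and retry. A stripped-down sketch of that contract, assuming the signatures shown above; count_keys_fn/count_keys and the stopping condition are illustrative, not the real lookup logic:

static int count_keys_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);

	/* stop once keys reach the end of the region this search covers */
	if (bkey_cmp(k, &KEY(s->iop.inode, bio_end_sector(&s->bio.bio), 0)) >= 0)
		return MAP_DONE;

	/* ...process k; the real callback splits and submits a bio here... */

	return MAP_CONTINUE;	/* have bch_btree_map_keys() hand us the next key */
}

static void count_keys(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, iop.cl);

	int ret = bch_btree_map_keys(&s->op, s->iop.c,
				     &KEY(s->iop.inode, s->bio.bio.bi_sector, 0),
				     count_keys_fn, MAP_END_KEY);
	if (ret == -EAGAIN)
		continue_at(cl, count_keys, bcache_wq);	/* walk couldn't finish; retry */

	closure_return(cl);
}
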
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 57dc4784f4f4..2cd65bf073c2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -3,40 +3,33 @@
3 3
4#include <linux/cgroup.h> 4#include <linux/cgroup.h>
5 5
6struct search { 6struct data_insert_op {
7 /* Stack frame for bio_complete */
8 struct closure cl; 7 struct closure cl;
8 struct cache_set *c;
9 struct bio *bio;
9 10
10 struct bcache_device *d; 11 unsigned inode;
11 struct task_struct *task; 12 uint16_t write_point;
12 13 uint16_t write_prio;
13 struct bbio bio; 14 short error;
14 struct bio *orig_bio;
15 struct bio *cache_miss;
16 unsigned cache_bio_sectors;
17
18 unsigned recoverable:1;
19 unsigned unaligned_bvec:1;
20 15
21 unsigned write:1; 16 unsigned bypass:1;
22 unsigned writeback:1; 17 unsigned writeback:1;
18 unsigned flush_journal:1;
19 unsigned csum:1;
23 20
24 /* IO error returned to s->bio */ 21 unsigned replace:1;
25 short error; 22 unsigned replace_collision:1;
26 unsigned long start_time; 23
24 unsigned insert_data_done:1;
27 25
28 /* Anything past op->keys won't get zeroed in do_bio_hook */ 26 /* Anything past this point won't get zeroed in search_alloc() */
29 struct btree_op op; 27 struct keylist insert_keys;
28 BKEY_PADDED(replace_key);
30}; 29};
31 30
32void bch_cache_read_endio(struct bio *, int);
33unsigned bch_get_congested(struct cache_set *); 31unsigned bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl); 32void bch_data_insert(struct closure *cl);
35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int);
37
38void bch_open_buckets_free(struct cache_set *);
39int bch_open_buckets_alloc(struct cache_set *);
40 33
41void bch_cached_dev_request_init(struct cached_dev *dc); 34void bch_cached_dev_request_init(struct cached_dev *dc);
42void bch_flash_dev_request_init(struct bcache_device *d); 35void bch_flash_dev_request_init(struct bcache_device *d);
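
With struct search now private to request.c, struct data_insert_op above is the whole interface other code needs to put data into the cache: fill in the fields, then fire bch_data_insert() through the embedded closure, exactly as write_moving() does in the movinggc.c hunk. A hedged sketch of such a caller; insert_dirty_bio() is illustrative and assumes *op was allocated zeroed (so bypass, replace, error and insert_data_done all start out clear):

static void insert_dirty_bio(struct data_insert_op *op, struct cache_set *c,
			     struct bio *bio, unsigned inode,
			     struct closure *parent)
{
	op->c		= c;
	op->bio		= bio;
	op->inode	= inode;
	op->write_prio	= 1;		/* the priority write_moving() uses */
	op->writeback	= true;		/* insert the data as dirty */

	/* bch_data_insert() takes ownership of op->bio; it runs under op->cl
	 * and drops back to @parent once the keys have been inserted. */
	closure_call(&op->cl, bch_data_insert, NULL, parent);
}
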
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index b8730e714d69..84d0782f702e 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -7,7 +7,6 @@
7#include "bcache.h" 7#include "bcache.h"
8#include "stats.h" 8#include "stats.h"
9#include "btree.h" 9#include "btree.h"
10#include "request.h"
11#include "sysfs.h" 10#include "sysfs.h"
12 11
13/* 12/*
@@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats,
196 atomic_inc(&stats->cache_bypass_misses); 195 atomic_inc(&stats->cache_bypass_misses);
197} 196}
198 197
199void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) 198void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
199 bool hit, bool bypass)
200{ 200{
201 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 201 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
202 mark_cache_stats(&dc->accounting.collector, hit, bypass); 202 mark_cache_stats(&dc->accounting.collector, hit, bypass);
203 mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); 203 mark_cache_stats(&c->accounting.collector, hit, bypass);
204#ifdef CONFIG_CGROUP_BCACHE 204#ifdef CONFIG_CGROUP_BCACHE
205 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); 205 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
206#endif 206#endif
207} 207}
208 208
209void bch_mark_cache_readahead(struct search *s) 209void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
210{ 210{
211 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 211 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
212 atomic_inc(&dc->accounting.collector.cache_readaheads); 212 atomic_inc(&dc->accounting.collector.cache_readaheads);
213 atomic_inc(&s->op.c->accounting.collector.cache_readaheads); 213 atomic_inc(&c->accounting.collector.cache_readaheads);
214} 214}
215 215
216void bch_mark_cache_miss_collision(struct search *s) 216void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
217{ 217{
218 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 218 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
219 atomic_inc(&dc->accounting.collector.cache_miss_collisions); 219 atomic_inc(&dc->accounting.collector.cache_miss_collisions);
220 atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); 220 atomic_inc(&c->accounting.collector.cache_miss_collisions);
221} 221}
222 222
223void bch_mark_sectors_bypassed(struct search *s, int sectors) 223void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
224 int sectors)
224{ 225{
225 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
226 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); 226 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
227 atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); 227 atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
228} 228}
229 229
230void bch_cache_accounting_init(struct cache_accounting *acc, 230void bch_cache_accounting_init(struct cache_accounting *acc,
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index c7c7a8fd29fe..adbff141c887 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -38,7 +38,9 @@ struct cache_accounting {
38 struct cache_stats day; 38 struct cache_stats day;
39}; 39};
40 40
41struct search; 41struct cache_set;
42struct cached_dev;
43struct bcache_device;
42 44
43void bch_cache_accounting_init(struct cache_accounting *acc, 45void bch_cache_accounting_init(struct cache_accounting *acc,
44 struct closure *parent); 46 struct closure *parent);
@@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc);
50 52
51void bch_cache_accounting_destroy(struct cache_accounting *acc); 53void bch_cache_accounting_destroy(struct cache_accounting *acc);
52 54
53void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); 55void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *,
54void bch_mark_cache_readahead(struct search *s); 56 bool, bool);
55void bch_mark_cache_miss_collision(struct search *s); 57void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *);
56void bch_mark_sectors_bypassed(struct search *s, int sectors); 58void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *);
59void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int);
57 60
58#endif /* _BCACHE_STATS_H_ */ 61#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 547c4c57b052..dec15cd2d797 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -16,6 +16,7 @@
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/genhd.h> 18#include <linux/genhd.h>
19#include <linux/idr.h>
19#include <linux/kthread.h> 20#include <linux/kthread.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/random.h> 22#include <linux/random.h>
@@ -45,21 +46,13 @@ const char * const bch_cache_modes[] = {
45 NULL 46 NULL
46}; 47};
47 48
48struct uuid_entry_v0 {
49 uint8_t uuid[16];
50 uint8_t label[32];
51 uint32_t first_reg;
52 uint32_t last_reg;
53 uint32_t invalidated;
54 uint32_t pad;
55};
56
57static struct kobject *bcache_kobj; 49static struct kobject *bcache_kobj;
58struct mutex bch_register_lock; 50struct mutex bch_register_lock;
59LIST_HEAD(bch_cache_sets); 51LIST_HEAD(bch_cache_sets);
60static LIST_HEAD(uncached_devices); 52static LIST_HEAD(uncached_devices);
61 53
62static int bcache_major, bcache_minor; 54static int bcache_major;
55static DEFINE_IDA(bcache_minor);
63static wait_queue_head_t unregister_wait; 56static wait_queue_head_t unregister_wait;
64struct workqueue_struct *bcache_wq; 57struct workqueue_struct *bcache_wq;
65 58
@@ -382,7 +375,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
382{ 375{
383 struct bkey *k = &j->uuid_bucket; 376 struct bkey *k = &j->uuid_bucket;
384 377
385 if (__bch_ptr_invalid(c, 1, k)) 378 if (bch_btree_ptr_invalid(c, k))
386 return "bad uuid pointer"; 379 return "bad uuid pointer";
387 380
388 bkey_copy(&c->uuid_bucket, k); 381 bkey_copy(&c->uuid_bucket, k);
@@ -427,7 +420,7 @@ static int __uuid_write(struct cache_set *c)
427 420
428 lockdep_assert_held(&bch_register_lock); 421 lockdep_assert_held(&bch_register_lock);
429 422
430 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) 423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
431 return 1; 424 return 1;
432 425
433 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 426 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -435,7 +428,7 @@ static int __uuid_write(struct cache_set *c)
435 closure_sync(&cl); 428 closure_sync(&cl);
436 429
437 bkey_copy(&c->uuid_bucket, &k.key); 430 bkey_copy(&c->uuid_bucket, &k.key);
438 __bkey_put(c, &k.key); 431 bkey_put(c, &k.key);
439 return 0; 432 return 0;
440} 433}
441 434
@@ -562,10 +555,10 @@ void bch_prio_write(struct cache *ca)
562 } 555 }
563 556
564 p->next_bucket = ca->prio_buckets[i + 1]; 557 p->next_bucket = ca->prio_buckets[i + 1];
565 p->magic = pset_magic(ca); 558 p->magic = pset_magic(&ca->sb);
566 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
567 560
568 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); 561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
569 BUG_ON(bucket == -1); 562 BUG_ON(bucket == -1);
570 563
571 mutex_unlock(&ca->set->bucket_lock); 564 mutex_unlock(&ca->set->bucket_lock);
@@ -613,7 +606,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
613 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) 606 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
614 pr_warn("bad csum reading priorities"); 607 pr_warn("bad csum reading priorities");
615 608
616 if (p->magic != pset_magic(ca)) 609 if (p->magic != pset_magic(&ca->sb))
617 pr_warn("bad magic reading priorities"); 610 pr_warn("bad magic reading priorities");
618 611
619 bucket = p->next_bucket; 612 bucket = p->next_bucket;
@@ -630,7 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
630static int open_dev(struct block_device *b, fmode_t mode) 623static int open_dev(struct block_device *b, fmode_t mode)
631{ 624{
632 struct bcache_device *d = b->bd_disk->private_data; 625 struct bcache_device *d = b->bd_disk->private_data;
633 if (atomic_read(&d->closing)) 626 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
634 return -ENXIO; 627 return -ENXIO;
635 628
636 closure_get(&d->cl); 629 closure_get(&d->cl);
@@ -659,20 +652,24 @@ static const struct block_device_operations bcache_ops = {
659 652
660void bcache_device_stop(struct bcache_device *d) 653void bcache_device_stop(struct bcache_device *d)
661{ 654{
662 if (!atomic_xchg(&d->closing, 1)) 655 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
663 closure_queue(&d->cl); 656 closure_queue(&d->cl);
664} 657}
665 658
666static void bcache_device_unlink(struct bcache_device *d) 659static void bcache_device_unlink(struct bcache_device *d)
667{ 660{
668 unsigned i; 661 lockdep_assert_held(&bch_register_lock);
669 struct cache *ca;
670 662
671 sysfs_remove_link(&d->c->kobj, d->name); 663 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
672 sysfs_remove_link(&d->kobj, "cache"); 664 unsigned i;
665 struct cache *ca;
673 666
674 for_each_cache(ca, d->c, i) 667 sysfs_remove_link(&d->c->kobj, d->name);
675 bd_unlink_disk_holder(ca->bdev, d->disk); 668 sysfs_remove_link(&d->kobj, "cache");
669
670 for_each_cache(ca, d->c, i)
671 bd_unlink_disk_holder(ca->bdev, d->disk);
672 }
676} 673}
677 674
678static void bcache_device_link(struct bcache_device *d, struct cache_set *c, 675static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
@@ -696,19 +693,16 @@ static void bcache_device_detach(struct bcache_device *d)
696{ 693{
697 lockdep_assert_held(&bch_register_lock); 694 lockdep_assert_held(&bch_register_lock);
698 695
699 if (atomic_read(&d->detaching)) { 696 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
700 struct uuid_entry *u = d->c->uuids + d->id; 697 struct uuid_entry *u = d->c->uuids + d->id;
701 698
702 SET_UUID_FLASH_ONLY(u, 0); 699 SET_UUID_FLASH_ONLY(u, 0);
703 memcpy(u->uuid, invalid_uuid, 16); 700 memcpy(u->uuid, invalid_uuid, 16);
704 u->invalidated = cpu_to_le32(get_seconds()); 701 u->invalidated = cpu_to_le32(get_seconds());
705 bch_uuid_write(d->c); 702 bch_uuid_write(d->c);
706
707 atomic_set(&d->detaching, 0);
708 } 703 }
709 704
710 if (!d->flush_done) 705 bcache_device_unlink(d);
711 bcache_device_unlink(d);
712 706
713 d->c->devices[d->id] = NULL; 707 d->c->devices[d->id] = NULL;
714 closure_put(&d->c->caching); 708 closure_put(&d->c->caching);
@@ -739,14 +733,20 @@ static void bcache_device_free(struct bcache_device *d)
739 del_gendisk(d->disk); 733 del_gendisk(d->disk);
740 if (d->disk && d->disk->queue) 734 if (d->disk && d->disk->queue)
741 blk_cleanup_queue(d->disk->queue); 735 blk_cleanup_queue(d->disk->queue);
742 if (d->disk) 736 if (d->disk) {
737 ida_simple_remove(&bcache_minor, d->disk->first_minor);
743 put_disk(d->disk); 738 put_disk(d->disk);
739 }
744 740
745 bio_split_pool_free(&d->bio_split_hook); 741 bio_split_pool_free(&d->bio_split_hook);
746 if (d->unaligned_bvec) 742 if (d->unaligned_bvec)
747 mempool_destroy(d->unaligned_bvec); 743 mempool_destroy(d->unaligned_bvec);
748 if (d->bio_split) 744 if (d->bio_split)
749 bioset_free(d->bio_split); 745 bioset_free(d->bio_split);
746 if (is_vmalloc_addr(d->full_dirty_stripes))
747 vfree(d->full_dirty_stripes);
748 else
749 kfree(d->full_dirty_stripes);
750 if (is_vmalloc_addr(d->stripe_sectors_dirty)) 750 if (is_vmalloc_addr(d->stripe_sectors_dirty))
751 vfree(d->stripe_sectors_dirty); 751 vfree(d->stripe_sectors_dirty);
752 else 752 else
@@ -760,15 +760,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
760{ 760{
761 struct request_queue *q; 761 struct request_queue *q;
762 size_t n; 762 size_t n;
763 int minor;
763 764
764 if (!d->stripe_size_bits) 765 if (!d->stripe_size)
765 d->stripe_size_bits = 31; 766 d->stripe_size = 1 << 31;
766 767
767 d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> 768 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
768 d->stripe_size_bits;
769 769
770 if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) 770 if (!d->nr_stripes ||
771 d->nr_stripes > INT_MAX ||
772 d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
773 pr_err("nr_stripes too large");
771 return -ENOMEM; 774 return -ENOMEM;
775 }
772 776
773 n = d->nr_stripes * sizeof(atomic_t); 777 n = d->nr_stripes * sizeof(atomic_t);
774 d->stripe_sectors_dirty = n < PAGE_SIZE << 6 778 d->stripe_sectors_dirty = n < PAGE_SIZE << 6
@@ -777,22 +781,38 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
777 if (!d->stripe_sectors_dirty) 781 if (!d->stripe_sectors_dirty)
778 return -ENOMEM; 782 return -ENOMEM;
779 783
784 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
785 d->full_dirty_stripes = n < PAGE_SIZE << 6
786 ? kzalloc(n, GFP_KERNEL)
787 : vzalloc(n);
788 if (!d->full_dirty_stripes)
789 return -ENOMEM;
790
791 minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
792 if (minor < 0)
793 return minor;
794
780 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 795 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
781 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 796 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
782 sizeof(struct bio_vec) * BIO_MAX_PAGES)) || 797 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
783 bio_split_pool_init(&d->bio_split_hook) || 798 bio_split_pool_init(&d->bio_split_hook) ||
784 !(d->disk = alloc_disk(1)) || 799 !(d->disk = alloc_disk(1))) {
785 !(q = blk_alloc_queue(GFP_KERNEL))) 800 ida_simple_remove(&bcache_minor, minor);
786 return -ENOMEM; 801 return -ENOMEM;
802 }
787 803
788 set_capacity(d->disk, sectors); 804 set_capacity(d->disk, sectors);
789 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 805 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
790 806
791 d->disk->major = bcache_major; 807 d->disk->major = bcache_major;
792 d->disk->first_minor = bcache_minor++; 808 d->disk->first_minor = minor;
793 d->disk->fops = &bcache_ops; 809 d->disk->fops = &bcache_ops;
794 d->disk->private_data = d; 810 d->disk->private_data = d;
795 811
812 q = blk_alloc_queue(GFP_KERNEL);
813 if (!q)
814 return -ENOMEM;
815
796 blk_queue_make_request(q, NULL); 816 blk_queue_make_request(q, NULL);
797 d->disk->queue = q; 817 d->disk->queue = q;
798 q->queuedata = d; 818 q->queuedata = d;
@@ -874,7 +894,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
874 struct closure cl; 894 struct closure cl;
875 closure_init_stack(&cl); 895 closure_init_stack(&cl);
876 896
877 BUG_ON(!atomic_read(&dc->disk.detaching)); 897 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
878 BUG_ON(atomic_read(&dc->count)); 898 BUG_ON(atomic_read(&dc->count));
879 899
880 mutex_lock(&bch_register_lock); 900 mutex_lock(&bch_register_lock);
@@ -888,6 +908,8 @@ static void cached_dev_detach_finish(struct work_struct *w)
888 bcache_device_detach(&dc->disk); 908 bcache_device_detach(&dc->disk);
889 list_move(&dc->list, &uncached_devices); 909 list_move(&dc->list, &uncached_devices);
890 910
911 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
912
891 mutex_unlock(&bch_register_lock); 913 mutex_unlock(&bch_register_lock);
892 914
893 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); 915 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
@@ -900,10 +922,10 @@ void bch_cached_dev_detach(struct cached_dev *dc)
900{ 922{
901 lockdep_assert_held(&bch_register_lock); 923 lockdep_assert_held(&bch_register_lock);
902 924
903 if (atomic_read(&dc->disk.closing)) 925 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
904 return; 926 return;
905 927
906 if (atomic_xchg(&dc->disk.detaching, 1)) 928 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
907 return; 929 return;
908 930
909 /* 931 /*
@@ -1030,6 +1052,7 @@ static void cached_dev_free(struct closure *cl)
1030 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1052 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1031 1053
1032 cancel_delayed_work_sync(&dc->writeback_rate_update); 1054 cancel_delayed_work_sync(&dc->writeback_rate_update);
1055 kthread_stop(dc->writeback_thread);
1033 1056
1034 mutex_lock(&bch_register_lock); 1057 mutex_lock(&bch_register_lock);
1035 1058
@@ -1058,11 +1081,7 @@ static void cached_dev_flush(struct closure *cl)
1058 struct bcache_device *d = &dc->disk; 1081 struct bcache_device *d = &dc->disk;
1059 1082
1060 mutex_lock(&bch_register_lock); 1083 mutex_lock(&bch_register_lock);
1061 d->flush_done = 1; 1084 bcache_device_unlink(d);
1062
1063 if (d->c)
1064 bcache_device_unlink(d);
1065
1066 mutex_unlock(&bch_register_lock); 1085 mutex_unlock(&bch_register_lock);
1067 1086
1068 bch_cache_accounting_destroy(&dc->accounting); 1087 bch_cache_accounting_destroy(&dc->accounting);
@@ -1088,7 +1107,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1088 spin_lock_init(&dc->io_lock); 1107 spin_lock_init(&dc->io_lock);
1089 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1108 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1090 1109
1091 dc->sequential_merge = true;
1092 dc->sequential_cutoff = 4 << 20; 1110 dc->sequential_cutoff = 4 << 20;
1093 1111
1094 for (io = dc->io; io < dc->io + RECENT_IO; io++) { 1112 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
@@ -1260,7 +1278,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1260{ 1278{
1261 va_list args; 1279 va_list args;
1262 1280
1263 if (test_bit(CACHE_SET_STOPPING, &c->flags)) 1281 if (c->on_error != ON_ERROR_PANIC &&
1282 test_bit(CACHE_SET_STOPPING, &c->flags))
1264 return false; 1283 return false;
1265 1284
1266 /* XXX: we can be called from atomic context 1285 /* XXX: we can be called from atomic context
@@ -1275,6 +1294,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1275 1294
1276 printk(", disabling caching\n"); 1295 printk(", disabling caching\n");
1277 1296
1297 if (c->on_error == ON_ERROR_PANIC)
1298 panic("panic forced after error\n");
1299
1278 bch_cache_set_unregister(c); 1300 bch_cache_set_unregister(c);
1279 return true; 1301 return true;
1280} 1302}
@@ -1339,6 +1361,9 @@ static void cache_set_flush(struct closure *cl)
1339 kobject_put(&c->internal); 1361 kobject_put(&c->internal);
1340 kobject_del(&c->kobj); 1362 kobject_del(&c->kobj);
1341 1363
1364 if (c->gc_thread)
1365 kthread_stop(c->gc_thread);
1366
1342 if (!IS_ERR_OR_NULL(c->root)) 1367 if (!IS_ERR_OR_NULL(c->root))
1343 list_add(&c->root->list, &c->btree_cache); 1368 list_add(&c->root->list, &c->btree_cache);
1344 1369
@@ -1433,12 +1458,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1433 1458
1434 c->sort_crit_factor = int_sqrt(c->btree_pages); 1459 c->sort_crit_factor = int_sqrt(c->btree_pages);
1435 1460
1436 mutex_init(&c->bucket_lock);
1437 mutex_init(&c->sort_lock);
1438 spin_lock_init(&c->sort_time_lock);
1439 closure_init_unlocked(&c->sb_write); 1461 closure_init_unlocked(&c->sb_write);
1462 mutex_init(&c->bucket_lock);
1463 init_waitqueue_head(&c->try_wait);
1464 init_waitqueue_head(&c->bucket_wait);
1440 closure_init_unlocked(&c->uuid_write); 1465 closure_init_unlocked(&c->uuid_write);
1441 spin_lock_init(&c->btree_read_time_lock); 1466 mutex_init(&c->sort_lock);
1467
1468 spin_lock_init(&c->sort_time.lock);
1469 spin_lock_init(&c->btree_gc_time.lock);
1470 spin_lock_init(&c->btree_split_time.lock);
1471 spin_lock_init(&c->btree_read_time.lock);
1472 spin_lock_init(&c->try_harder_time.lock);
1473
1442 bch_moving_init_cache_set(c); 1474 bch_moving_init_cache_set(c);
1443 1475
1444 INIT_LIST_HEAD(&c->list); 1476 INIT_LIST_HEAD(&c->list);
@@ -1483,11 +1515,10 @@ static void run_cache_set(struct cache_set *c)
1483 const char *err = "cannot allocate memory"; 1515 const char *err = "cannot allocate memory";
1484 struct cached_dev *dc, *t; 1516 struct cached_dev *dc, *t;
1485 struct cache *ca; 1517 struct cache *ca;
1518 struct closure cl;
1486 unsigned i; 1519 unsigned i;
1487 1520
1488 struct btree_op op; 1521 closure_init_stack(&cl);
1489 bch_btree_op_init_stack(&op);
1490 op.lock = SHRT_MAX;
1491 1522
1492 for_each_cache(ca, c, i) 1523 for_each_cache(ca, c, i)
1493 c->nbuckets += ca->sb.nbuckets; 1524 c->nbuckets += ca->sb.nbuckets;
@@ -1498,7 +1529,7 @@ static void run_cache_set(struct cache_set *c)
1498 struct jset *j; 1529 struct jset *j;
1499 1530
1500 err = "cannot allocate memory for journal"; 1531 err = "cannot allocate memory for journal";
1501 if (bch_journal_read(c, &journal, &op)) 1532 if (bch_journal_read(c, &journal))
1502 goto err; 1533 goto err;
1503 1534
1504 pr_debug("btree_journal_read() done"); 1535 pr_debug("btree_journal_read() done");
@@ -1522,23 +1553,23 @@ static void run_cache_set(struct cache_set *c)
1522 k = &j->btree_root; 1553 k = &j->btree_root;
1523 1554
1524 err = "bad btree root"; 1555 err = "bad btree root";
1525 if (__bch_ptr_invalid(c, j->btree_level + 1, k)) 1556 if (bch_btree_ptr_invalid(c, k))
1526 goto err; 1557 goto err;
1527 1558
1528 err = "error reading btree root"; 1559 err = "error reading btree root";
1529 c->root = bch_btree_node_get(c, k, j->btree_level, &op); 1560 c->root = bch_btree_node_get(c, k, j->btree_level, true);
1530 if (IS_ERR_OR_NULL(c->root)) 1561 if (IS_ERR_OR_NULL(c->root))
1531 goto err; 1562 goto err;
1532 1563
1533 list_del_init(&c->root->list); 1564 list_del_init(&c->root->list);
1534 rw_unlock(true, c->root); 1565 rw_unlock(true, c->root);
1535 1566
1536 err = uuid_read(c, j, &op.cl); 1567 err = uuid_read(c, j, &cl);
1537 if (err) 1568 if (err)
1538 goto err; 1569 goto err;
1539 1570
1540 err = "error in recovery"; 1571 err = "error in recovery";
1541 if (bch_btree_check(c, &op)) 1572 if (bch_btree_check(c))
1542 goto err; 1573 goto err;
1543 1574
1544 bch_journal_mark(c, &journal); 1575 bch_journal_mark(c, &journal);
@@ -1570,11 +1601,9 @@ static void run_cache_set(struct cache_set *c)
1570 if (j->version < BCACHE_JSET_VERSION_UUID) 1601 if (j->version < BCACHE_JSET_VERSION_UUID)
1571 __uuid_write(c); 1602 __uuid_write(c);
1572 1603
1573 bch_journal_replay(c, &journal, &op); 1604 bch_journal_replay(c, &journal);
1574 } else { 1605 } else {
1575 pr_notice("invalidating existing data"); 1606 pr_notice("invalidating existing data");
1576 /* Don't want invalidate_buckets() to queue a gc yet */
1577 closure_lock(&c->gc, NULL);
1578 1607
1579 for_each_cache(ca, c, i) { 1608 for_each_cache(ca, c, i) {
1580 unsigned j; 1609 unsigned j;
@@ -1600,15 +1629,15 @@ static void run_cache_set(struct cache_set *c)
1600 1629
1601 err = "cannot allocate new UUID bucket"; 1630 err = "cannot allocate new UUID bucket";
1602 if (__uuid_write(c)) 1631 if (__uuid_write(c))
1603 goto err_unlock_gc; 1632 goto err;
1604 1633
1605 err = "cannot allocate new btree root"; 1634 err = "cannot allocate new btree root";
1606 c->root = bch_btree_node_alloc(c, 0, &op.cl); 1635 c->root = bch_btree_node_alloc(c, 0, true);
1607 if (IS_ERR_OR_NULL(c->root)) 1636 if (IS_ERR_OR_NULL(c->root))
1608 goto err_unlock_gc; 1637 goto err;
1609 1638
1610 bkey_copy_key(&c->root->key, &MAX_KEY); 1639 bkey_copy_key(&c->root->key, &MAX_KEY);
1611 bch_btree_node_write(c->root, &op.cl); 1640 bch_btree_node_write(c->root, &cl);
1612 1641
1613 bch_btree_set_root(c->root); 1642 bch_btree_set_root(c->root);
1614 rw_unlock(true, c->root); 1643 rw_unlock(true, c->root);
@@ -1621,14 +1650,14 @@ static void run_cache_set(struct cache_set *c)
1621 SET_CACHE_SYNC(&c->sb, true); 1650 SET_CACHE_SYNC(&c->sb, true);
1622 1651
1623 bch_journal_next(&c->journal); 1652 bch_journal_next(&c->journal);
1624 bch_journal_meta(c, &op.cl); 1653 bch_journal_meta(c, &cl);
1625
1626 /* Unlock */
1627 closure_set_stopped(&c->gc.cl);
1628 closure_put(&c->gc.cl);
1629 } 1654 }
1630 1655
1631 closure_sync(&op.cl); 1656 err = "error starting gc thread";
1657 if (bch_gc_thread_start(c))
1658 goto err;
1659
1660 closure_sync(&cl);
1632 c->sb.last_mount = get_seconds(); 1661 c->sb.last_mount = get_seconds();
1633 bcache_write_super(c); 1662 bcache_write_super(c);
1634 1663
@@ -1638,13 +1667,10 @@ static void run_cache_set(struct cache_set *c)
1638 flash_devs_run(c); 1667 flash_devs_run(c);
1639 1668
1640 return; 1669 return;
1641err_unlock_gc:
1642 closure_set_stopped(&c->gc.cl);
1643 closure_put(&c->gc.cl);
1644err: 1670err:
1645 closure_sync(&op.cl); 1671 closure_sync(&cl);
1646 /* XXX: test this, it's broken */ 1672 /* XXX: test this, it's broken */
1647 bch_cache_set_error(c, err); 1673 bch_cache_set_error(c, "%s", err);
1648} 1674}
1649 1675
1650static bool can_attach_cache(struct cache *ca, struct cache_set *c) 1676static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -1725,8 +1751,6 @@ void bch_cache_release(struct kobject *kobj)
1725 if (ca->set) 1751 if (ca->set)
1726 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1752 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1727 1753
1728 bch_cache_allocator_exit(ca);
1729
1730 bio_split_pool_free(&ca->bio_split_hook); 1754 bio_split_pool_free(&ca->bio_split_hook);
1731 1755
1732 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); 1756 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
@@ -1758,8 +1782,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1758 __module_get(THIS_MODULE); 1782 __module_get(THIS_MODULE);
1759 kobject_init(&ca->kobj, &bch_cache_ktype); 1783 kobject_init(&ca->kobj, &bch_cache_ktype);
1760 1784
1761 INIT_LIST_HEAD(&ca->discards);
1762
1763 bio_init(&ca->journal.bio); 1785 bio_init(&ca->journal.bio);
1764 ca->journal.bio.bi_max_vecs = 8; 1786 ca->journal.bio.bi_max_vecs = 8;
1765 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1787 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
@@ -2006,7 +2028,6 @@ static struct notifier_block reboot = {
2006static void bcache_exit(void) 2028static void bcache_exit(void)
2007{ 2029{
2008 bch_debug_exit(); 2030 bch_debug_exit();
2009 bch_writeback_exit();
2010 bch_request_exit(); 2031 bch_request_exit();
2011 bch_btree_exit(); 2032 bch_btree_exit();
2012 if (bcache_kobj) 2033 if (bcache_kobj)
@@ -2039,7 +2060,6 @@ static int __init bcache_init(void)
2039 sysfs_create_files(bcache_kobj, files) || 2060 sysfs_create_files(bcache_kobj, files) ||
2040 bch_btree_init() || 2061 bch_btree_init() ||
2041 bch_request_init() || 2062 bch_request_init() ||
2042 bch_writeback_init() ||
2043 bch_debug_init(bcache_kobj)) 2063 bch_debug_init(bcache_kobj))
2044 goto err; 2064 goto err;
2045 2065
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 924dcfdae111..80d4c2bee18a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = {
21 NULL 21 NULL
22}; 22};
23 23
24static const char * const error_actions[] = {
25 "unregister",
26 "panic",
27 NULL
28};
29
24write_attribute(attach); 30write_attribute(attach);
25write_attribute(detach); 31write_attribute(detach);
26write_attribute(unregister); 32write_attribute(unregister);
@@ -66,7 +72,6 @@ rw_attribute(congested_read_threshold_us);
66rw_attribute(congested_write_threshold_us); 72rw_attribute(congested_write_threshold_us);
67 73
68rw_attribute(sequential_cutoff); 74rw_attribute(sequential_cutoff);
69rw_attribute(sequential_merge);
70rw_attribute(data_csum); 75rw_attribute(data_csum);
71rw_attribute(cache_mode); 76rw_attribute(cache_mode);
72rw_attribute(writeback_metadata); 77rw_attribute(writeback_metadata);
@@ -90,11 +95,14 @@ rw_attribute(discard);
90rw_attribute(running); 95rw_attribute(running);
91rw_attribute(label); 96rw_attribute(label);
92rw_attribute(readahead); 97rw_attribute(readahead);
98rw_attribute(errors);
93rw_attribute(io_error_limit); 99rw_attribute(io_error_limit);
94rw_attribute(io_error_halflife); 100rw_attribute(io_error_halflife);
95rw_attribute(verify); 101rw_attribute(verify);
102rw_attribute(bypass_torture_test);
96rw_attribute(key_merging_disabled); 103rw_attribute(key_merging_disabled);
97rw_attribute(gc_always_rewrite); 104rw_attribute(gc_always_rewrite);
105rw_attribute(expensive_debug_checks);
98rw_attribute(freelist_percent); 106rw_attribute(freelist_percent);
99rw_attribute(cache_replacement_policy); 107rw_attribute(cache_replacement_policy);
100rw_attribute(btree_shrinker_disabled); 108rw_attribute(btree_shrinker_disabled);
@@ -116,6 +124,7 @@ SHOW(__bch_cached_dev)
116 124
117 sysfs_printf(data_csum, "%i", dc->disk.data_csum); 125 sysfs_printf(data_csum, "%i", dc->disk.data_csum);
118 var_printf(verify, "%i"); 126 var_printf(verify, "%i");
127 var_printf(bypass_torture_test, "%i");
119 var_printf(writeback_metadata, "%i"); 128 var_printf(writeback_metadata, "%i");
120 var_printf(writeback_running, "%i"); 129 var_printf(writeback_running, "%i");
121 var_print(writeback_delay); 130 var_print(writeback_delay);
@@ -150,10 +159,9 @@ SHOW(__bch_cached_dev)
150 sysfs_hprint(dirty_data, 159 sysfs_hprint(dirty_data,
151 bcache_dev_sectors_dirty(&dc->disk) << 9); 160 bcache_dev_sectors_dirty(&dc->disk) << 9);
152 161
153 sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); 162 sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
154 var_printf(partial_stripes_expensive, "%u"); 163 var_printf(partial_stripes_expensive, "%u");
155 164
156 var_printf(sequential_merge, "%i");
157 var_hprint(sequential_cutoff); 165 var_hprint(sequential_cutoff);
158 var_hprint(readahead); 166 var_hprint(readahead);
159 167
@@ -185,6 +193,7 @@ STORE(__cached_dev)
185 193
186 sysfs_strtoul(data_csum, dc->disk.data_csum); 194 sysfs_strtoul(data_csum, dc->disk.data_csum);
187 d_strtoul(verify); 195 d_strtoul(verify);
196 d_strtoul(bypass_torture_test);
188 d_strtoul(writeback_metadata); 197 d_strtoul(writeback_metadata);
189 d_strtoul(writeback_running); 198 d_strtoul(writeback_running);
190 d_strtoul(writeback_delay); 199 d_strtoul(writeback_delay);
@@ -199,7 +208,6 @@ STORE(__cached_dev)
199 dc->writeback_rate_p_term_inverse, 1, INT_MAX); 208 dc->writeback_rate_p_term_inverse, 1, INT_MAX);
200 d_strtoul(writeback_rate_d_smooth); 209 d_strtoul(writeback_rate_d_smooth);
201 210
202 d_strtoul(sequential_merge);
203 d_strtoi_h(sequential_cutoff); 211 d_strtoi_h(sequential_cutoff);
204 d_strtoi_h(readahead); 212 d_strtoi_h(readahead);
205 213
@@ -311,7 +319,6 @@ static struct attribute *bch_cached_dev_files[] = {
311 &sysfs_stripe_size, 319 &sysfs_stripe_size,
312 &sysfs_partial_stripes_expensive, 320 &sysfs_partial_stripes_expensive,
313 &sysfs_sequential_cutoff, 321 &sysfs_sequential_cutoff,
314 &sysfs_sequential_merge,
315 &sysfs_clear_stats, 322 &sysfs_clear_stats,
316 &sysfs_running, 323 &sysfs_running,
317 &sysfs_state, 324 &sysfs_state,
@@ -319,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = {
319 &sysfs_readahead, 326 &sysfs_readahead,
320#ifdef CONFIG_BCACHE_DEBUG 327#ifdef CONFIG_BCACHE_DEBUG
321 &sysfs_verify, 328 &sysfs_verify,
329 &sysfs_bypass_torture_test,
322#endif 330#endif
323 NULL 331 NULL
324}; 332};
@@ -366,7 +374,7 @@ STORE(__bch_flash_dev)
366 } 374 }
367 375
368 if (attr == &sysfs_unregister) { 376 if (attr == &sysfs_unregister) {
369 atomic_set(&d->detaching, 1); 377 set_bit(BCACHE_DEV_DETACHING, &d->flags);
370 bcache_device_stop(d); 378 bcache_device_stop(d);
371 } 379 }
372 380
@@ -481,7 +489,6 @@ lock_root:
481 489
482 sysfs_print(btree_used_percent, btree_used(c)); 490 sysfs_print(btree_used_percent, btree_used(c));
483 sysfs_print(btree_nodes, c->gc_stats.nodes); 491 sysfs_print(btree_nodes, c->gc_stats.nodes);
484 sysfs_hprint(dirty_data, c->gc_stats.dirty);
485 sysfs_hprint(average_key_size, average_key_size(c)); 492 sysfs_hprint(average_key_size, average_key_size(c));
486 493
487 sysfs_print(cache_read_races, 494 sysfs_print(cache_read_races,
@@ -492,6 +499,10 @@ lock_root:
492 sysfs_print(writeback_keys_failed, 499 sysfs_print(writeback_keys_failed,
493 atomic_long_read(&c->writeback_keys_failed)); 500 atomic_long_read(&c->writeback_keys_failed));
494 501
502 if (attr == &sysfs_errors)
503 return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
504 c->on_error);
505
495 /* See count_io_errors for why 88 */ 506 /* See count_io_errors for why 88 */
496 sysfs_print(io_error_halflife, c->error_decay * 88); 507 sysfs_print(io_error_halflife, c->error_decay * 88);
497 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); 508 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
@@ -506,6 +517,8 @@ lock_root:
506 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); 517 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
507 sysfs_printf(verify, "%i", c->verify); 518 sysfs_printf(verify, "%i", c->verify);
508 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); 519 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
520 sysfs_printf(expensive_debug_checks,
521 "%i", c->expensive_debug_checks);
509 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); 522 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
510 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); 523 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
511 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); 524 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
@@ -555,7 +568,7 @@ STORE(__bch_cache_set)
555 } 568 }
556 569
557 if (attr == &sysfs_trigger_gc) 570 if (attr == &sysfs_trigger_gc)
558 bch_queue_gc(c); 571 wake_up_gc(c);
559 572
560 if (attr == &sysfs_prune_cache) { 573 if (attr == &sysfs_prune_cache) {
561 struct shrink_control sc; 574 struct shrink_control sc;
@@ -569,6 +582,15 @@ STORE(__bch_cache_set)
569 sysfs_strtoul(congested_write_threshold_us, 582 sysfs_strtoul(congested_write_threshold_us,
570 c->congested_write_threshold_us); 583 c->congested_write_threshold_us);
571 584
585 if (attr == &sysfs_errors) {
586 ssize_t v = bch_read_string_list(buf, error_actions);
587
588 if (v < 0)
589 return v;
590
591 c->on_error = v;
592 }
593
572 if (attr == &sysfs_io_error_limit) 594 if (attr == &sysfs_io_error_limit)
573 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; 595 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
574 596
@@ -579,6 +601,7 @@ STORE(__bch_cache_set)
579 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); 601 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
580 sysfs_strtoul(verify, c->verify); 602 sysfs_strtoul(verify, c->verify);
581 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); 603 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
604 sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
582 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); 605 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
583 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); 606 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
584 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); 607 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
@@ -618,8 +641,8 @@ static struct attribute *bch_cache_set_files[] = {
618 &sysfs_cache_available_percent, 641 &sysfs_cache_available_percent,
619 642
620 &sysfs_average_key_size, 643 &sysfs_average_key_size,
621 &sysfs_dirty_data,
622 644
645 &sysfs_errors,
623 &sysfs_io_error_limit, 646 &sysfs_io_error_limit,
624 &sysfs_io_error_halflife, 647 &sysfs_io_error_halflife,
625 &sysfs_congested, 648 &sysfs_congested,
@@ -653,6 +676,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
653#ifdef CONFIG_BCACHE_DEBUG 676#ifdef CONFIG_BCACHE_DEBUG
654 &sysfs_verify, 677 &sysfs_verify,
655 &sysfs_key_merging_disabled, 678 &sysfs_key_merging_disabled,
679 &sysfs_expensive_debug_checks,
656#endif 680#endif
657 &sysfs_gc_always_rewrite, 681 &sysfs_gc_always_rewrite,
658 &sysfs_btree_shrinker_disabled, 682 &sysfs_btree_shrinker_disabled,
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index f7b6c197f90f..adbc3df17a80 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -1,6 +1,5 @@
1#include "bcache.h" 1#include "bcache.h"
2#include "btree.h" 2#include "btree.h"
3#include "request.h"
4 3
5#include <linux/blktrace_api.h> 4#include <linux/blktrace_api.h>
6#include <linux/module.h> 5#include <linux/module.h>
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 420dad545c7d..462214eeacbe 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid)
168 168
169void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) 169void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
170{ 170{
171 uint64_t now = local_clock(); 171 uint64_t now, duration, last;
172 uint64_t duration = time_after64(now, start_time) 172
173 spin_lock(&stats->lock);
174
175 now = local_clock();
176 duration = time_after64(now, start_time)
173 ? now - start_time : 0; 177 ? now - start_time : 0;
174 uint64_t last = time_after64(now, stats->last) 178 last = time_after64(now, stats->last)
175 ? now - stats->last : 0; 179 ? now - stats->last : 0;
176 180
177 stats->max_duration = max(stats->max_duration, duration); 181 stats->max_duration = max(stats->max_duration, duration);
@@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
188 } 192 }
189 193
190 stats->last = now ?: 1; 194 stats->last = now ?: 1;
195
196 spin_unlock(&stats->lock);
191} 197}
192 198
193/** 199/**
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6896f4..362c4b3f8b4a 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,28 +15,18 @@
15 15
16struct closure; 16struct closure;
17 17
18#ifdef CONFIG_BCACHE_EDEBUG 18#ifdef CONFIG_BCACHE_DEBUG
19 19
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 22
23#else /* EDEBUG */ 23#else /* DEBUG */
24 24
25#define atomic_dec_bug(v) atomic_dec(v) 25#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 26#define atomic_inc_bug(v, i) atomic_inc(v)
27 27
28#endif 28#endif
29 29
30#define BITMASK(name, type, field, offset, size) \
31static inline uint64_t name(const type *k) \
32{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
33 \
34static inline void SET_##name(type *k, uint64_t v) \
35{ \
36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
37 k->field |= v << offset; \
38}
39
40#define DECLARE_HEAP(type, name) \ 30#define DECLARE_HEAP(type, name) \
41 struct { \ 31 struct { \
42 size_t size, used; \ 32 size_t size, used; \
@@ -388,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[
388ssize_t bch_read_string_list(const char *buf, const char * const list[]); 378ssize_t bch_read_string_list(const char *buf, const char * const list[]);
389 379
390struct time_stats { 380struct time_stats {
381 spinlock_t lock;
391 /* 382 /*
392 * all fields are in nanoseconds, averages are ewmas stored left shifted 383 * all fields are in nanoseconds, averages are ewmas stored left shifted
393 * by 8 384 * by 8
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ba3ee48320f2..99053b1251be 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -11,18 +11,11 @@
11#include "debug.h" 11#include "debug.h"
12#include "writeback.h" 12#include "writeback.h"
13 13
14#include <linux/delay.h>
15#include <linux/freezer.h>
16#include <linux/kthread.h>
14#include <trace/events/bcache.h> 17#include <trace/events/bcache.h>
15 18
16static struct workqueue_struct *dirty_wq;
17
18static void read_dirty(struct closure *);
19
20struct dirty_io {
21 struct closure cl;
22 struct cached_dev *dc;
23 struct bio bio;
24};
25
26/* Rate limiting */ 19/* Rate limiting */
27 20
28static void __update_writeback_rate(struct cached_dev *dc) 21static void __update_writeback_rate(struct cached_dev *dc)
@@ -72,9 +65,6 @@ out:
72 dc->writeback_rate_derivative = derivative; 65 dc->writeback_rate_derivative = derivative;
73 dc->writeback_rate_change = change; 66 dc->writeback_rate_change = change;
74 dc->writeback_rate_target = target; 67 dc->writeback_rate_target = target;
75
76 schedule_delayed_work(&dc->writeback_rate_update,
77 dc->writeback_rate_update_seconds * HZ);
78} 68}
79 69
80static void update_writeback_rate(struct work_struct *work) 70static void update_writeback_rate(struct work_struct *work)
@@ -90,13 +80,16 @@ static void update_writeback_rate(struct work_struct *work)
90 __update_writeback_rate(dc); 80 __update_writeback_rate(dc);
91 81
92 up_read(&dc->writeback_lock); 82 up_read(&dc->writeback_lock);
83
84 schedule_delayed_work(&dc->writeback_rate_update,
85 dc->writeback_rate_update_seconds * HZ);
93} 86}
94 87
95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 88static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96{ 89{
97 uint64_t ret; 90 uint64_t ret;
98 91
99 if (atomic_read(&dc->disk.detaching) || 92 if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
100 !dc->writeback_percent) 93 !dc->writeback_percent)
101 return 0; 94 return 0;
102 95
@@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
105 return min_t(uint64_t, ret, HZ); 98 return min_t(uint64_t, ret, HZ);
106} 99}
107 100
108/* Background writeback */ 101struct dirty_io {
109 102 struct closure cl;
110static bool dirty_pred(struct keybuf *buf, struct bkey *k) 103 struct cached_dev *dc;
111{ 104 struct bio bio;
112 return KEY_DIRTY(k); 105};
113}
114
115static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
116{
117 uint64_t stripe;
118 unsigned nr_sectors = KEY_SIZE(k);
119 struct cached_dev *dc = container_of(buf, struct cached_dev,
120 writeback_keys);
121 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
122
123 if (!KEY_DIRTY(k))
124 return false;
125
126 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
127 while (1) {
128 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
129 stripe_size)
130 return false;
131
132 if (nr_sectors <= stripe_size)
133 return true;
134
135 nr_sectors -= stripe_size;
136 stripe++;
137 }
138}
139 106
140static void dirty_init(struct keybuf_key *w) 107static void dirty_init(struct keybuf_key *w)
141{ 108{
@@ -153,131 +120,6 @@ static void dirty_init(struct keybuf_key *w)
153 bch_bio_map(bio, NULL); 120 bch_bio_map(bio, NULL);
154} 121}
155 122
156static void refill_dirty(struct closure *cl)
157{
158 struct cached_dev *dc = container_of(cl, struct cached_dev,
159 writeback.cl);
160 struct keybuf *buf = &dc->writeback_keys;
161 bool searched_from_start = false;
162 struct bkey end = MAX_KEY;
163 SET_KEY_INODE(&end, dc->disk.id);
164
165 if (!atomic_read(&dc->disk.detaching) &&
166 !dc->writeback_running)
167 closure_return(cl);
168
169 down_write(&dc->writeback_lock);
170
171 if (!atomic_read(&dc->has_dirty)) {
172 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
173 bch_write_bdev_super(dc, NULL);
174
175 up_write(&dc->writeback_lock);
176 closure_return(cl);
177 }
178
179 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
180 buf->last_scanned = KEY(dc->disk.id, 0, 0);
181 searched_from_start = true;
182 }
183
184 if (dc->partial_stripes_expensive) {
185 uint64_t i;
186
187 for (i = 0; i < dc->disk.nr_stripes; i++)
188 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
189 1 << dc->disk.stripe_size_bits)
190 goto full_stripes;
191
192 goto normal_refill;
193full_stripes:
194 bch_refill_keybuf(dc->disk.c, buf, &end,
195 dirty_full_stripe_pred);
196 } else {
197normal_refill:
198 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
199 }
200
201 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
202 /* Searched the entire btree - delay awhile */
203
204 if (RB_EMPTY_ROOT(&buf->keys)) {
205 atomic_set(&dc->has_dirty, 0);
206 cached_dev_put(dc);
207 }
208
209 if (!atomic_read(&dc->disk.detaching))
210 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
211 }
212
213 up_write(&dc->writeback_lock);
214
215 bch_ratelimit_reset(&dc->writeback_rate);
216
217 /* Punt to workqueue only so we don't recurse and blow the stack */
218 continue_at(cl, read_dirty, dirty_wq);
219}
220
221void bch_writeback_queue(struct cached_dev *dc)
222{
223 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
224 if (!atomic_read(&dc->disk.detaching))
225 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
226
227 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
228 }
229}
230
231void bch_writeback_add(struct cached_dev *dc)
232{
233 if (!atomic_read(&dc->has_dirty) &&
234 !atomic_xchg(&dc->has_dirty, 1)) {
235 atomic_inc(&dc->count);
236
237 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
238 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
239 /* XXX: should do this synchronously */
240 bch_write_bdev_super(dc, NULL);
241 }
242
243 bch_writeback_queue(dc);
244
245 if (dc->writeback_percent)
246 schedule_delayed_work(&dc->writeback_rate_update,
247 dc->writeback_rate_update_seconds * HZ);
248 }
249}
250
251void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
252 uint64_t offset, int nr_sectors)
253{
254 struct bcache_device *d = c->devices[inode];
255 unsigned stripe_size, stripe_offset;
256 uint64_t stripe;
257
258 if (!d)
259 return;
260
261 stripe_size = 1 << d->stripe_size_bits;
262 stripe = offset >> d->stripe_size_bits;
263 stripe_offset = offset & (stripe_size - 1);
264
265 while (nr_sectors) {
266 int s = min_t(unsigned, abs(nr_sectors),
267 stripe_size - stripe_offset);
268
269 if (nr_sectors < 0)
270 s = -s;
271
272 atomic_add(s, d->stripe_sectors_dirty + stripe);
273 nr_sectors -= s;
274 stripe_offset = 0;
275 stripe++;
276 }
277}
278
279/* Background writeback - IO loop */
280
281static void dirty_io_destructor(struct closure *cl) 123static void dirty_io_destructor(struct closure *cl)
282{ 124{
283 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 125 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
@@ -297,26 +139,25 @@ static void write_dirty_finish(struct closure *cl)
297 139
298 /* This is kind of a dumb way of signalling errors. */ 140 /* This is kind of a dumb way of signalling errors. */
299 if (KEY_DIRTY(&w->key)) { 141 if (KEY_DIRTY(&w->key)) {
142 int ret;
300 unsigned i; 143 unsigned i;
301 struct btree_op op; 144 struct keylist keys;
302 bch_btree_op_init_stack(&op);
303 145
304 op.type = BTREE_REPLACE; 146 bch_keylist_init(&keys);
305 bkey_copy(&op.replace, &w->key);
306 147
307 SET_KEY_DIRTY(&w->key, false); 148 bkey_copy(keys.top, &w->key);
308 bch_keylist_add(&op.keys, &w->key); 149 SET_KEY_DIRTY(keys.top, false);
150 bch_keylist_push(&keys);
309 151
310 for (i = 0; i < KEY_PTRS(&w->key); i++) 152 for (i = 0; i < KEY_PTRS(&w->key); i++)
311 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 153 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
312 154
313 bch_btree_insert(&op, dc->disk.c); 155 ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
314 closure_sync(&op.cl);
315 156
316 if (op.insert_collision) 157 if (ret)
317 trace_bcache_writeback_collision(&w->key); 158 trace_bcache_writeback_collision(&w->key);
318 159
319 atomic_long_inc(op.insert_collision 160 atomic_long_inc(ret
320 ? &dc->disk.c->writeback_keys_failed 161 ? &dc->disk.c->writeback_keys_failed
321 : &dc->disk.c->writeback_keys_done); 162 : &dc->disk.c->writeback_keys_done);
322 } 163 }
@@ -374,30 +215,33 @@ static void read_dirty_submit(struct closure *cl)
374 continue_at(cl, write_dirty, system_wq); 215 continue_at(cl, write_dirty, system_wq);
375} 216}
376 217
377static void read_dirty(struct closure *cl) 218static void read_dirty(struct cached_dev *dc)
378{ 219{
379 struct cached_dev *dc = container_of(cl, struct cached_dev, 220 unsigned delay = 0;
380 writeback.cl);
381 unsigned delay = writeback_delay(dc, 0);
382 struct keybuf_key *w; 221 struct keybuf_key *w;
383 struct dirty_io *io; 222 struct dirty_io *io;
223 struct closure cl;
224
225 closure_init_stack(&cl);
384 226
385 /* 227 /*
386 * XXX: if we error, background writeback just spins. Should use some 228 * XXX: if we error, background writeback just spins. Should use some
387 * mempools. 229 * mempools.
388 */ 230 */
389 231
390 while (1) { 232 while (!kthread_should_stop()) {
233 try_to_freeze();
234
391 w = bch_keybuf_next(&dc->writeback_keys); 235 w = bch_keybuf_next(&dc->writeback_keys);
392 if (!w) 236 if (!w)
393 break; 237 break;
394 238
395 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 239 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
396 240
397 if (delay > 0 && 241 if (KEY_START(&w->key) != dc->last_read ||
398 (KEY_START(&w->key) != dc->last_read || 242 jiffies_to_msecs(delay) > 50)
399 jiffies_to_msecs(delay) > 50)) 243 while (!kthread_should_stop() && delay)
400 delay = schedule_timeout_uninterruptible(delay); 244 delay = schedule_timeout_interruptible(delay);
401 245
402 dc->last_read = KEY_OFFSET(&w->key); 246 dc->last_read = KEY_OFFSET(&w->key);
403 247
@@ -423,7 +267,7 @@ static void read_dirty(struct closure *cl)
423 trace_bcache_writeback(&w->key); 267 trace_bcache_writeback(&w->key);
424 268
425 down(&dc->in_flight); 269 down(&dc->in_flight);
426 closure_call(&io->cl, read_dirty_submit, NULL, cl); 270 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
427 271
428 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 272 delay = writeback_delay(dc, KEY_SIZE(&w->key));
429 } 273 }
@@ -439,52 +283,205 @@ err:
439 * Wait for outstanding writeback IOs to finish (and keybuf slots to be 283 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
440 * freed) before refilling again 284 * freed) before refilling again
441 */ 285 */
442 continue_at(cl, refill_dirty, dirty_wq); 286 closure_sync(&cl);
443} 287}
444 288
445/* Init */ 289/* Scan for dirty data */
290
291void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
292 uint64_t offset, int nr_sectors)
293{
294 struct bcache_device *d = c->devices[inode];
295 unsigned stripe_offset, stripe, sectors_dirty;
296
297 if (!d)
298 return;
299
300 stripe = offset_to_stripe(d, offset);
301 stripe_offset = offset & (d->stripe_size - 1);
302
303 while (nr_sectors) {
304 int s = min_t(unsigned, abs(nr_sectors),
305 d->stripe_size - stripe_offset);
306
307 if (nr_sectors < 0)
308 s = -s;
309
310 if (stripe >= d->nr_stripes)
311 return;
312
313 sectors_dirty = atomic_add_return(s,
314 d->stripe_sectors_dirty + stripe);
315 if (sectors_dirty == d->stripe_size)
316 set_bit(stripe, d->full_dirty_stripes);
317 else
318 clear_bit(stripe, d->full_dirty_stripes);
319
320 nr_sectors -= s;
321 stripe_offset = 0;
322 stripe++;
323 }
324}
446 325
447static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, 326static bool dirty_pred(struct keybuf *buf, struct bkey *k)
448 struct cached_dev *dc)
449{ 327{
450 struct bkey *k; 328 return KEY_DIRTY(k);
451 struct btree_iter iter; 329}
452 330
453 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); 331static void refill_full_stripes(struct cached_dev *dc)
454 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) 332{
455 if (!b->level) { 333 struct keybuf *buf = &dc->writeback_keys;
456 if (KEY_INODE(k) > dc->disk.id) 334 unsigned start_stripe, stripe, next_stripe;
457 break; 335 bool wrapped = false;
458 336
459 if (KEY_DIRTY(k)) 337 stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
460 bcache_dev_sectors_dirty_add(b->c, dc->disk.id, 338
461 KEY_START(k), 339 if (stripe >= dc->disk.nr_stripes)
462 KEY_SIZE(k)); 340 stripe = 0;
463 } else { 341
464 btree(sectors_dirty_init, k, b, op, dc); 342 start_stripe = stripe;
465 if (KEY_INODE(k) > dc->disk.id) 343
466 break; 344 while (1) {
467 345 stripe = find_next_bit(dc->disk.full_dirty_stripes,
468 cond_resched(); 346 dc->disk.nr_stripes, stripe);
347
348 if (stripe == dc->disk.nr_stripes)
349 goto next;
350
351 next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
352 dc->disk.nr_stripes, stripe);
353
354 buf->last_scanned = KEY(dc->disk.id,
355 stripe * dc->disk.stripe_size, 0);
356
357 bch_refill_keybuf(dc->disk.c, buf,
358 &KEY(dc->disk.id,
359 next_stripe * dc->disk.stripe_size, 0),
360 dirty_pred);
361
362 if (array_freelist_empty(&buf->freelist))
363 return;
364
365 stripe = next_stripe;
366next:
367 if (wrapped && stripe > start_stripe)
368 return;
369
370 if (stripe == dc->disk.nr_stripes) {
371 stripe = 0;
372 wrapped = true;
469 } 373 }
374 }
375}
376
377static bool refill_dirty(struct cached_dev *dc)
378{
379 struct keybuf *buf = &dc->writeback_keys;
380 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
381 bool searched_from_start = false;
382
383 if (dc->partial_stripes_expensive) {
384 refill_full_stripes(dc);
385 if (array_freelist_empty(&buf->freelist))
386 return false;
387 }
388
389 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
390 buf->last_scanned = KEY(dc->disk.id, 0, 0);
391 searched_from_start = true;
392 }
393
394 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
395
396 return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
397}
398
399static int bch_writeback_thread(void *arg)
400{
401 struct cached_dev *dc = arg;
402 bool searched_full_index;
403
404 while (!kthread_should_stop()) {
405 down_write(&dc->writeback_lock);
406 if (!atomic_read(&dc->has_dirty) ||
407 (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
408 !dc->writeback_running)) {
409 up_write(&dc->writeback_lock);
410 set_current_state(TASK_INTERRUPTIBLE);
411
412 if (kthread_should_stop())
413 return 0;
414
415 try_to_freeze();
416 schedule();
417 continue;
418 }
419
420 searched_full_index = refill_dirty(dc);
421
422 if (searched_full_index &&
423 RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
424 atomic_set(&dc->has_dirty, 0);
425 cached_dev_put(dc);
426 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
427 bch_write_bdev_super(dc, NULL);
428 }
429
430 up_write(&dc->writeback_lock);
431
432 bch_ratelimit_reset(&dc->writeback_rate);
433 read_dirty(dc);
434
435 if (searched_full_index) {
436 unsigned delay = dc->writeback_delay * HZ;
437
438 while (delay &&
439 !kthread_should_stop() &&
440 !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
441 delay = schedule_timeout_interruptible(delay);
442 }
443 }
470 444
471 return 0; 445 return 0;
472} 446}
473 447
448/* Init */
449
450struct sectors_dirty_init {
451 struct btree_op op;
452 unsigned inode;
453};
454
455static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
456 struct bkey *k)
457{
458 struct sectors_dirty_init *op = container_of(_op,
459 struct sectors_dirty_init, op);
460 if (KEY_INODE(k) > op->inode)
461 return MAP_DONE;
462
463 if (KEY_DIRTY(k))
464 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
465 KEY_START(k), KEY_SIZE(k));
466
467 return MAP_CONTINUE;
468}
469
474void bch_sectors_dirty_init(struct cached_dev *dc) 470void bch_sectors_dirty_init(struct cached_dev *dc)
475{ 471{
476 struct btree_op op; 472 struct sectors_dirty_init op;
473
474 bch_btree_op_init(&op.op, -1);
475 op.inode = dc->disk.id;
477 476
478 bch_btree_op_init_stack(&op); 477 bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
479 btree_root(sectors_dirty_init, dc->disk.c, &op, dc); 478 sectors_dirty_init_fn, 0);
480} 479}
481 480
482void bch_cached_dev_writeback_init(struct cached_dev *dc) 481int bch_cached_dev_writeback_init(struct cached_dev *dc)
483{ 482{
484 sema_init(&dc->in_flight, 64); 483 sema_init(&dc->in_flight, 64);
485 closure_init_unlocked(&dc->writeback);
486 init_rwsem(&dc->writeback_lock); 484 init_rwsem(&dc->writeback_lock);
487
488 bch_keybuf_init(&dc->writeback_keys); 485 bch_keybuf_init(&dc->writeback_keys);
489 486
490 dc->writeback_metadata = true; 487 dc->writeback_metadata = true;
@@ -498,22 +495,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
498 dc->writeback_rate_p_term_inverse = 64; 495 dc->writeback_rate_p_term_inverse = 64;
499 dc->writeback_rate_d_smooth = 8; 496 dc->writeback_rate_d_smooth = 8;
500 497
498 dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
499 "bcache_writeback");
500 if (IS_ERR(dc->writeback_thread))
501 return PTR_ERR(dc->writeback_thread);
502
503 set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
504
501 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 505 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
502 schedule_delayed_work(&dc->writeback_rate_update, 506 schedule_delayed_work(&dc->writeback_rate_update,
503 dc->writeback_rate_update_seconds * HZ); 507 dc->writeback_rate_update_seconds * HZ);
504}
505
506void bch_writeback_exit(void)
507{
508 if (dirty_wq)
509 destroy_workqueue(dirty_wq);
510}
511
512int __init bch_writeback_init(void)
513{
514 dirty_wq = create_workqueue("bcache_writeback");
515 if (!dirty_wq)
516 return -ENOMEM;
517 508
518 return 0; 509 return 0;
519} 510}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c91f61bb95b6..c9ddcf4614b9 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,20 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
14 return ret; 14 return ret;
15} 15}
16 16
17static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, 17static inline unsigned offset_to_stripe(struct bcache_device *d,
18 uint64_t offset)
19{
20 do_div(offset, d->stripe_size);
21 return offset;
22}
23
24static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
18 uint64_t offset, 25 uint64_t offset,
19 unsigned nr_sectors) 26 unsigned nr_sectors)
20{ 27{
21 uint64_t stripe = offset >> d->stripe_size_bits; 28 unsigned stripe = offset_to_stripe(&dc->disk, offset);
22 29
23 while (1) { 30 while (1) {
24 if (atomic_read(d->stripe_sectors_dirty + stripe)) 31 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
25 return true; 32 return true;
26 33
27 if (nr_sectors <= 1 << d->stripe_size_bits) 34 if (nr_sectors <= dc->disk.stripe_size)
28 return false; 35 return false;
29 36
30 nr_sectors -= 1 << d->stripe_size_bits; 37 nr_sectors -= dc->disk.stripe_size;
31 stripe++; 38 stripe++;
32 } 39 }
33} 40}
@@ -38,12 +45,12 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
38 unsigned in_use = dc->disk.c->gc_stats.in_use; 45 unsigned in_use = dc->disk.c->gc_stats.in_use;
39 46
40 if (cache_mode != CACHE_MODE_WRITEBACK || 47 if (cache_mode != CACHE_MODE_WRITEBACK ||
41 atomic_read(&dc->disk.detaching) || 48 test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
42 in_use > CUTOFF_WRITEBACK_SYNC) 49 in_use > CUTOFF_WRITEBACK_SYNC)
43 return false; 50 return false;
44 51
45 if (dc->partial_stripes_expensive && 52 if (dc->partial_stripes_expensive &&
46 bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, 53 bcache_dev_stripe_dirty(dc, bio->bi_sector,
47 bio_sectors(bio))) 54 bio_sectors(bio)))
48 return true; 55 return true;
49 56
@@ -54,11 +61,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
54 in_use <= CUTOFF_WRITEBACK; 61 in_use <= CUTOFF_WRITEBACK;
55} 62}
56 63
64static inline void bch_writeback_queue(struct cached_dev *dc)
65{
66 wake_up_process(dc->writeback_thread);
67}
68
69static inline void bch_writeback_add(struct cached_dev *dc)
70{
71 if (!atomic_read(&dc->has_dirty) &&
72 !atomic_xchg(&dc->has_dirty, 1)) {
73 atomic_inc(&dc->count);
74
75 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
76 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
77 /* XXX: should do this synchronously */
78 bch_write_bdev_super(dc, NULL);
79 }
80
81 bch_writeback_queue(dc);
82 }
83}
84
57void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); 85void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
58void bch_writeback_queue(struct cached_dev *);
59void bch_writeback_add(struct cached_dev *);
60 86
61void bch_sectors_dirty_init(struct cached_dev *dc); 87void bch_sectors_dirty_init(struct cached_dev *dc);
62void bch_cached_dev_writeback_init(struct cached_dev *); 88int bch_cached_dev_writeback_init(struct cached_dev *);
63 89
64#endif 90#endif
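bch_writeback_add() above uses a cheap plain read followed by atomic_xchg() so that only the first writer to dirty a clean device performs the one-time work (taking a reference, marking the superblock BDEV_STATE_DIRTY, waking the writeback thread). A standalone sketch of that claim-once pattern with C11 atomics; this is a userspace model, not kernel code:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int has_dirty = 0;

static void mark_dirty(void)
{
	/* plain load keeps the common already-dirty path free of atomic RMWs;
	 * only the caller whose exchange flips 0 -> 1 does the transition */
	if (!atomic_load(&has_dirty) &&
	    !atomic_exchange(&has_dirty, 1))
		puts("first dirty write: kicking writeback");
}

int main(void)
{
	mark_dirty();	/* performs the one-time transition */
	mark_dirty();	/* no-op */
	return 0;
}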
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 5ebda976ea93..e2b9576d00e2 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -6,11 +6,9 @@
6 6
7#include <linux/tracepoint.h> 7#include <linux/tracepoint.h>
8 8
9struct search;
10
11DECLARE_EVENT_CLASS(bcache_request, 9DECLARE_EVENT_CLASS(bcache_request,
12 TP_PROTO(struct search *s, struct bio *bio), 10 TP_PROTO(struct bcache_device *d, struct bio *bio),
13 TP_ARGS(s, bio), 11 TP_ARGS(d, bio),
14 12
15 TP_STRUCT__entry( 13 TP_STRUCT__entry(
16 __field(dev_t, dev ) 14 __field(dev_t, dev )
@@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request,
24 22
25 TP_fast_assign( 23 TP_fast_assign(
26 __entry->dev = bio->bi_bdev->bd_dev; 24 __entry->dev = bio->bi_bdev->bd_dev;
27 __entry->orig_major = s->d->disk->major; 25 __entry->orig_major = d->disk->major;
28 __entry->orig_minor = s->d->disk->first_minor; 26 __entry->orig_minor = d->disk->first_minor;
29 __entry->sector = bio->bi_sector; 27 __entry->sector = bio->bi_sector;
30 __entry->orig_sector = bio->bi_sector - 16; 28 __entry->orig_sector = bio->bi_sector - 16;
31 __entry->nr_sector = bio->bi_size >> 9; 29 __entry->nr_sector = bio->bi_size >> 9;
@@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node,
79/* request.c */ 77/* request.c */
80 78
81DEFINE_EVENT(bcache_request, bcache_request_start, 79DEFINE_EVENT(bcache_request, bcache_request_start,
82 TP_PROTO(struct search *s, struct bio *bio), 80 TP_PROTO(struct bcache_device *d, struct bio *bio),
83 TP_ARGS(s, bio) 81 TP_ARGS(d, bio)
84); 82);
85 83
86DEFINE_EVENT(bcache_request, bcache_request_end, 84DEFINE_EVENT(bcache_request, bcache_request_end,
87 TP_PROTO(struct search *s, struct bio *bio), 85 TP_PROTO(struct bcache_device *d, struct bio *bio),
88 TP_ARGS(s, bio) 86 TP_ARGS(d, bio)
89); 87);
90 88
91DECLARE_EVENT_CLASS(bcache_bio, 89DECLARE_EVENT_CLASS(bcache_bio,
@@ -370,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root,
370 TP_ARGS(b) 368 TP_ARGS(b)
371); 369);
372 370
371TRACE_EVENT(bcache_keyscan,
372 TP_PROTO(unsigned nr_found,
373 unsigned start_inode, uint64_t start_offset,
374 unsigned end_inode, uint64_t end_offset),
375 TP_ARGS(nr_found,
376 start_inode, start_offset,
377 end_inode, end_offset),
378
379 TP_STRUCT__entry(
380 __field(__u32, nr_found )
381 __field(__u32, start_inode )
382 __field(__u64, start_offset )
383 __field(__u32, end_inode )
384 __field(__u64, end_offset )
385 ),
386
387 TP_fast_assign(
388 __entry->nr_found = nr_found;
389 __entry->start_inode = start_inode;
390 __entry->start_offset = start_offset;
391 __entry->end_inode = end_inode;
392 __entry->end_offset = end_offset;
393 ),
394
395 TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found,
396 __entry->start_inode, __entry->start_offset,
397 __entry->end_inode, __entry->end_offset)
398);
399
373/* Allocator */ 400/* Allocator */
374 401
375TRACE_EVENT(bcache_alloc_invalidate, 402TRACE_EVENT(bcache_alloc_invalidate,
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
new file mode 100644
index 000000000000..164a7e263988
--- /dev/null
+++ b/include/uapi/linux/bcache.h
@@ -0,0 +1,373 @@
1#ifndef _LINUX_BCACHE_H
2#define _LINUX_BCACHE_H
3
4/*
5 * Bcache on disk data structures
6 */
7
8#include <asm/types.h>
9
10#define BITMASK(name, type, field, offset, size) \
11static inline __u64 name(const type *k) \
12{ return (k->field >> offset) & ~(~0ULL << size); } \
13 \
14static inline void SET_##name(type *k, __u64 v) \
15{ \
16 k->field &= ~(~(~0ULL << size) << offset); \
17 k->field |= (v & ~(~0ULL << size)) << offset; \
18}
19
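BITMASK() generates a getter and a SET_-prefixed setter that pack a value into `size` bits of `field` starting at bit `offset`, clearing the old value first. A small sketch of the generated pair on a made-up struct; it assumes the header is exported to userspace as <linux/bcache.h> (the macro can equally be pasted locally), and `struct toy`/`TOY_MODE` are names invented for the example:

#include <assert.h>
#include <linux/bcache.h>

struct toy {
	__u64 flags;
};

BITMASK(TOY_MODE, struct toy, flags, 4, 3)	/* bits 4..6 of ->flags */

int main(void)
{
	struct toy t = { 0 };

	SET_TOY_MODE(&t, 5);
	assert(TOY_MODE(&t) == 5);
	assert(t.flags == 5ULL << 4);		/* value lands at the field's offset */

	SET_TOY_MODE(&t, 2);			/* the old value is masked out first */
	assert(TOY_MODE(&t) == 2);
	return 0;
}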
20/* Btree keys - all units are in sectors */
21
22struct bkey {
23 __u64 high;
24 __u64 low;
25 __u64 ptr[];
26};
27
28#define KEY_FIELD(name, field, offset, size) \
29 BITMASK(name, struct bkey, field, offset, size)
30
31#define PTR_FIELD(name, offset, size) \
32static inline __u64 name(const struct bkey *k, unsigned i) \
33{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \
34 \
35static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \
36{ \
37 k->ptr[i] &= ~(~(~0ULL << size) << offset); \
38 k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \
39}
40
41#define KEY_SIZE_BITS 16
42
43KEY_FIELD(KEY_PTRS, high, 60, 3)
44KEY_FIELD(HEADER_SIZE, high, 58, 2)
45KEY_FIELD(KEY_CSUM, high, 56, 2)
46KEY_FIELD(KEY_PINNED, high, 55, 1)
47KEY_FIELD(KEY_DIRTY, high, 36, 1)
48
49KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS)
50KEY_FIELD(KEY_INODE, high, 0, 20)
51
52/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
53
54static inline __u64 KEY_OFFSET(const struct bkey *k)
55{
56 return k->low;
57}
58
59static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
60{
61 k->low = v;
62}
63
64/*
65 * The high bit being set is a relic from when we used it to do binary
66 * searches - it told you where a key started. It's not used anymore,
67 * and can probably be safely dropped.
68 */
69#define KEY(inode, offset, size) \
70((struct bkey) { \
71 .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
72 .low = (offset) \
73})
74
75#define ZERO_KEY KEY(0, 0, 0)
76
77#define MAX_KEY_INODE (~(~0 << 20))
78#define MAX_KEY_OFFSET (~0ULL >> 1)
79#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
80
81#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
82#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
83
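Keys address the end of an extent, so KEY_START() is KEY_OFFSET() minus KEY_SIZE(). A short sketch of building a key with KEY() and reading it back through the generated accessors; the inode/offset/size values are arbitrary and the header is assumed to be visible to userspace as <linux/bcache.h>:

#include <assert.h>
#include <linux/bcache.h>

int main(void)
{
	struct bkey k = KEY(5 /* inode */, 1024 /* end offset */, 16 /* sectors */);

	assert(KEY_INODE(&k) == 5);
	assert(KEY_OFFSET(&k) == 1024);
	assert(KEY_SIZE(&k) == 16);
	assert(KEY_START(&k) == 1024 - 16);	/* extent covers [1008, 1024) */

	SET_KEY_DIRTY(&k, 1);			/* mark the extent dirty */
	assert(KEY_DIRTY(&k));
	return 0;
}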
84#define PTR_DEV_BITS 12
85
86PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS)
87PTR_FIELD(PTR_OFFSET, 8, 43)
88PTR_FIELD(PTR_GEN, 0, 8)
89
90#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1)
91
92#define PTR(gen, offset, dev) \
93 ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
94
95/* Bkey utility code */
96
97static inline unsigned long bkey_u64s(const struct bkey *k)
98{
99 return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
100}
101
102static inline unsigned long bkey_bytes(const struct bkey *k)
103{
104 return bkey_u64s(k) * sizeof(__u64);
105}
106
107#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src))
108
109static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
110{
111 SET_KEY_INODE(dest, KEY_INODE(src));
112 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
113}
114
115static inline struct bkey *bkey_next(const struct bkey *k)
116{
117 __u64 *d = (void *) k;
118 return (struct bkey *) (d + bkey_u64s(k));
119}
120
121static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
122{
123 __u64 *d = (void *) k;
124 return (struct bkey *) (d + nr_keys);
125}
126/* Enough for a key with 6 pointers */
127#define BKEY_PAD 8
128
129#define BKEY_PADDED(key) \
130 union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
131
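Keys are variable length: bkey_u64s() is the two header words plus KEY_PTRS() pointer words, bkey_next() steps over exactly that, and bkey_last() marks the end of a packed run of keys, which is why keys embedded in other structures use BKEY_PADDED() to reserve pointer space. A sketch that packs two keys into a buffer and walks them back; the buffer size and key values are illustrative and <linux/bcache.h> is assumed to be available:

#include <assert.h>
#include <stdio.h>
#include <string.h>		/* bkey_copy() expands to memcpy() */
#include <linux/bcache.h>

int main(void)
{
	__u64 buf[16];
	struct bkey *k = (struct bkey *) buf;
	struct bkey *end;
	unsigned n = 0;

	/* first key: extent at inode 1 ending at offset 128, 8 sectors, one pointer */
	bkey_copy(k, &KEY(1, 128, 8));
	SET_KEY_PTRS(k, 1);
	k->ptr[0] = PTR(2 /* gen */, 512 /* bucket offset */, 0 /* cache dev */);
	assert(PTR_OFFSET(k, 0) == 512 && PTR_GEN(k, 0) == 2);
	k = bkey_next(k);			/* 2 header words + 1 pointer = 3 __u64s */

	/* second key: no pointers */
	bkey_copy(k, &KEY(1, 256, 8));
	k = bkey_next(k);

	end = bkey_last((struct bkey *) buf, (__u64 *) k - buf);

	for (k = (struct bkey *) buf; k != end; k = bkey_next(k), n++)
		printf("key %u: %llu:%llu len %llu, %llu ptrs\n", n,
		       KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k), KEY_PTRS(k));

	assert(n == 2);
	return 0;
}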
132/* Superblock */
133
134/* Version 0: Cache device
135 * Version 1: Backing device
136 * Version 2: Seed pointer into btree node checksum
137 * Version 3: Cache device with new UUID format
138 * Version 4: Backing device with data offset
139 */
140#define BCACHE_SB_VERSION_CDEV 0
141#define BCACHE_SB_VERSION_BDEV 1
142#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
143#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
144#define BCACHE_SB_MAX_VERSION 4
145
146#define SB_SECTOR 8
147#define SB_SIZE 4096
148#define SB_LABEL_SIZE 32
149#define SB_JOURNAL_BUCKETS 256U
150/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
151#define MAX_CACHES_PER_SET 8
152
153#define BDEV_DATA_START_DEFAULT 16 /* sectors */
154
155struct cache_sb {
156 __u64 csum;
157 __u64 offset; /* sector where this sb was written */
158 __u64 version;
159
160 __u8 magic[16];
161
162 __u8 uuid[16];
163 union {
164 __u8 set_uuid[16];
165 __u64 set_magic;
166 };
167 __u8 label[SB_LABEL_SIZE];
168
169 __u64 flags;
170 __u64 seq;
171 __u64 pad[8];
172
173 union {
174 struct {
175 /* Cache devices */
176 __u64 nbuckets; /* device size */
177
178 __u16 block_size; /* sectors */
179 __u16 bucket_size; /* sectors */
180
181 __u16 nr_in_set;
182 __u16 nr_this_dev;
183 };
184 struct {
185 /* Backing devices */
186 __u64 data_offset;
187
188 /*
189 * block_size from the cache device section is still used by
190 * backing devices, so don't add anything here until we fix
191 * things to not need it for backing devices anymore
192 */
193 };
194 };
195
196 __u32 last_mount; /* time_t */
197
198 __u16 first_bucket;
199 union {
200 __u16 njournal_buckets;
201 __u16 keys;
202 };
203 __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
204};
205
206static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
207{
208 return sb->version == BCACHE_SB_VERSION_BDEV
209 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
210}
211
212BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
213BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
214BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
215#define CACHE_REPLACEMENT_LRU 0U
216#define CACHE_REPLACEMENT_FIFO 1U
217#define CACHE_REPLACEMENT_RANDOM 2U
218
219BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
220#define CACHE_MODE_WRITETHROUGH 0U
221#define CACHE_MODE_WRITEBACK 1U
222#define CACHE_MODE_WRITEAROUND 2U
223#define CACHE_MODE_NONE 3U
224BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
225#define BDEV_STATE_NONE 0U
226#define BDEV_STATE_CLEAN 1U
227#define BDEV_STATE_DIRTY 2U
228#define BDEV_STATE_STALE 3U
229
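A hedged userspace sketch that reads a formatted device's superblock and classifies it with SB_IS_BDEV() and the flag accessors above. It assumes the superblock sits at byte offset SB_SECTOR * 512, that the host is little-endian (multi-byte fields are stored little-endian on disk), and that the header is installed as <linux/bcache.h>; it does not verify the magic bytes, whose expected value lives outside this header:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/bcache.h>

int main(int argc, char **argv)
{
	struct cache_sb sb;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || pread(fd, &sb, sizeof(sb), SB_SECTOR << 9) != sizeof(sb)) {
		perror("reading superblock");
		return 1;
	}

	if (SB_IS_BDEV(&sb))
		printf("backing device: state %llu, cache mode %llu\n",
		       BDEV_STATE(&sb), BDEV_CACHE_MODE(&sb));
	else
		printf("cache device: %llu buckets of %u sectors, %u in set\n",
		       sb.nbuckets, sb.bucket_size, sb.nr_in_set);

	close(fd);
	return 0;
}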
230/*
231 * Magic numbers
232 *
233 * The various other data structures have their own magic numbers, which are
234 * xored with the first part of the cache set's UUID
235 */
236
237#define JSET_MAGIC 0x245235c1a3625032ULL
238#define PSET_MAGIC 0x6750e15f87337f91ULL
239#define BSET_MAGIC 0x90135c78b99e07f5ULL
240
241static inline __u64 jset_magic(struct cache_sb *sb)
242{
243 return sb->set_magic ^ JSET_MAGIC;
244}
245
246static inline __u64 pset_magic(struct cache_sb *sb)
247{
248 return sb->set_magic ^ PSET_MAGIC;
249}
250
251static inline __u64 bset_magic(struct cache_sb *sb)
252{
253 return sb->set_magic ^ BSET_MAGIC;
254}
255
256/*
257 * Journal
258 *
259 * On disk format for a journal entry:
260 * seq is monotonically increasing; every journal entry has its own unique
261 * sequence number.
262 *
263 * last_seq is the oldest journal entry that still has keys the btree hasn't
264 * flushed to disk yet.
265 *
266 * version is for on disk format changes.
267 */
268
269#define BCACHE_JSET_VERSION_UUIDv1 1
270#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
271#define BCACHE_JSET_VERSION 1
272
273struct jset {
274 __u64 csum;
275 __u64 magic;
276 __u64 seq;
277 __u32 version;
278 __u32 keys;
279
280 __u64 last_seq;
281
282 BKEY_PADDED(uuid_bucket);
283 BKEY_PADDED(btree_root);
284 __u16 btree_level;
285 __u16 pad[3];
286
287 __u64 prio_bucket[MAX_CACHES_PER_SET];
288
289 union {
290 struct bkey start[0];
291 __u64 d[0];
292 };
293};
294
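The keys field appears to count the 64-bit words of key data that follow the header (hence the start[0]/d[0] union), so the bkey helpers above can walk a journal entry, and jset_magic() ties the entry to its cache set. A compilable helper sketch rather than a full journal reader; the caller is assumed to have read the superblock and one jset from disk, and <linux/bcache.h> is assumed to be available:

#include <stdio.h>
#include <linux/bcache.h>

static void dump_jset(struct cache_sb *sb, struct jset *j)
{
	struct bkey *k;

	if (j->magic != jset_magic(sb)) {
		fprintf(stderr, "not a journal entry for this cache set\n");
		return;
	}

	printf("journal seq %llu, last_seq %llu\n", j->seq, j->last_seq);

	/* keys are packed back to back after the header */
	for (k = j->start; k < bkey_last(j->start, j->keys); k = bkey_next(k))
		printf("  key %llu:%llu len %llu\n",
		       KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
}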
295/* Bucket prios/gens */
296
297struct prio_set {
298 __u64 csum;
299 __u64 magic;
300 __u64 seq;
301 __u32 version;
302 __u32 pad;
303
304 __u64 next_bucket;
305
306 struct bucket_disk {
307 __u16 prio;
308 __u8 gen;
309 } __attribute((packed)) data[];
310};
311
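Prio/gen data is written as a fixed header (sizeof(struct prio_set)) followed by packed 3-byte bucket_disk entries. A sketch of how many entries fit in one bucket; the formula mirrors how the kernel sizes its prio buckets but is an assumption here, and the bucket size is an example value:

#include <stdio.h>
#include <linux/bcache.h>

int main(void)
{
	unsigned bucket_size = 1024;		/* sectors, example value */
	size_t per_bucket = (bucket_size * 512 - sizeof(struct prio_set)) /
			    sizeof(struct bucket_disk);

	printf("prio header is %zu bytes, entry is %zu bytes\n",
	       sizeof(struct prio_set), sizeof(struct bucket_disk));
	printf("prio entries per bucket: %zu\n", per_bucket);
	return 0;
}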
312/* UUIDS - per backing device/flash only volume metadata */
313
314struct uuid_entry {
315 union {
316 struct {
317 __u8 uuid[16];
318 __u8 label[32];
319 __u32 first_reg;
320 __u32 last_reg;
321 __u32 invalidated;
322
323 __u32 flags;
324 /* Size of flash only volumes */
325 __u64 sectors;
326 };
327
328 __u8 pad[128];
329 };
330};
331
332BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
333
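Each UUID entry is padded to 128 bytes via the pad[128] union member, which presumably keeps the on-disk stride of the UUID array fixed, and UUID_FLASH_ONLY() flags flash-only volumes. A tiny check of both, assuming <linux/bcache.h> is available:

#include <assert.h>
#include <linux/bcache.h>

int main(void)
{
	struct uuid_entry u = { 0 };

	_Static_assert(sizeof(struct uuid_entry) == 128,
		       "uuid entries are 128 bytes on disk");

	SET_UUID_FLASH_ONLY(&u, 1);	/* mark this slot as a flash-only volume */
	assert(UUID_FLASH_ONLY(&u) == 1);
	return 0;
}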
334/* Btree nodes */
335
336/* Version 1: Seed pointer into btree node checksum
337 */
338#define BCACHE_BSET_CSUM 1
339#define BCACHE_BSET_VERSION 1
340
341/*
342 * Btree nodes
343 *
344 * On disk a btree node is a list/log of these; within each set the keys are
345 * sorted
346 */
347struct bset {
348 __u64 csum;
349 __u64 magic;
350 __u64 seq;
351 __u32 version;
352 __u32 keys;
353
354 union {
355 struct bkey start[0];
356 __u64 d[0];
357 };
358};
359
360/* OBSOLETE */
361
362/* UUIDS - per backing device/flash only volume metadata */
363
364struct uuid_entry_v0 {
365 __u8 uuid[16];
366 __u8 label[32];
367 __u32 first_reg;
368 __u32 last_reg;
369 __u32 invalidated;
370 __u32 pad;
371};
372
373#endif /* _LINUX_BCACHE_H */