aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorArne Jansen <sensille@gmx.net>2011-05-23 08:33:49 -0400
committerArne Jansen <sensille@gmx.net>2011-10-02 02:48:44 -0400
commit7414a03fbf9e75fbbf2a3c16828cd862e572aa44 (patch)
treecc5fe98478b73281055a471fbce242b40a253490 /fs
parent90519d66abbccc251d14719ac76f191f70826e40 (diff)
btrfs: initial readahead code and prototypes
This is the implementation for the generic read ahead framework. To trigger a readahead, btrfs_reada_add must be called. It will start a read ahead for the given range [start, end) on tree root. The returned handle can either be used to wait on the readahead to finish (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). The read ahead works as follows: On btrfs_reada_add, the root of the tree is inserted into a radix_tree. reada_start_machine will then search for extents to prefetch and trigger some reads. When a read finishes for a node, all contained node/leaf pointers that lie in the given range will also be enqueued. The reads will be triggered in sequential order, thus giving a big win over a naive enumeration. It will also make use of multi-device layouts. Each disk will have its on read pointer and all disks will by utilized in parallel. Also will no two disks read both sides of a mirror simultaneously, as this would waste seeking capacity. Instead both disks will read different parts of the filesystem. Any number of readaheads can be started in parallel. The read order will be determined globally, i.e. 2 parallel readaheads will normally finish faster than the 2 started one after another. Changes v2: - protect root->node by transaction instead of node_lock - fix missed branches: The readahead had a too simple check to determine if a branch from a node should be checked or not. It now also records the upper bound of each node to see if the requested RA range lies within. - use KERN_CONT to debug output, to avoid line breaks - defer reada_start_machine to worker to avoid deadlock Changes v3: - protect root->node by rcu Changes v5: - changed EIO-semantics of reada_tree_block_flagged - remove spin_lock from reada_control and make elems an atomic_t - remove unused read_total from reada_control - kill reada_key_cmp, use btrfs_comp_cpu_keys instead - use kref-style release functions where possible - return struct reada_control * instead of void * from btrfs_reada_add Signed-off-by: Arne Jansen <sensille@gmx.net>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/ctree.h16
-rw-r--r--fs/btrfs/reada.c949
3 files changed, 967 insertions, 1 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..bdd6fb238ce1 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o
11 12
12btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f71fd24cc152..370af767440d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2702,4 +2702,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2702int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2702int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2703 struct btrfs_scrub_progress *progress); 2703 struct btrfs_scrub_progress *progress);
2704 2704
2705/* reada.c */
2706struct reada_control {
2707 struct btrfs_root *root; /* tree to prefetch */
2708 struct btrfs_key key_start;
2709 struct btrfs_key key_end; /* exclusive */
2710 atomic_t elems;
2711 struct kref refcnt;
2712 wait_queue_head_t wait;
2713};
2714struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2715 struct btrfs_key *start, struct btrfs_key *end);
2716int btrfs_reada_wait(void *handle);
2717void btrfs_reada_detach(void *handle);
2718int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2719 u64 start, int err);
2720
2705#endif 2721#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2b701d082227
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,949 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its on read pointer and all disks will by utilized in parallel.
49 * Also will no two disks read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending from the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb in NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_multi_bio *multi)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < multi->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = multi->stripes[i].dev;
303 }
304 zone->ndevs = multi->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_multi_bio *multi = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
365 if (ret || !multi || length < blocksize)
366 goto error;
367
368 if (multi->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = multi->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, multi);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = multi->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = multi->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 return re;
427
428error:
429 while (nzones) {
430 struct reada_zone *zone;
431
432 --nzones;
433 zone = re->zones[nzones];
434 kref_get(&zone->refcnt);
435 spin_lock(&zone->lock);
436 --zone->elems;
437 if (zone->elems == 0) {
438 /*
439 * no fs_info->reada_lock needed, as this can't be
440 * the last ref
441 */
442 kref_put(&zone->refcnt, reada_zone_release);
443 }
444 spin_unlock(&zone->lock);
445
446 spin_lock(&fs_info->reada_lock);
447 kref_put(&zone->refcnt, reada_zone_release);
448 spin_unlock(&fs_info->reada_lock);
449 }
450 kfree(re);
451 if (looped)
452 goto again;
453 return NULL;
454}
455
456static void reada_kref_dummy(struct kref *kr)
457{
458}
459
460static void reada_extent_put(struct btrfs_fs_info *fs_info,
461 struct reada_extent *re)
462{
463 int i;
464 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
465
466 spin_lock(&fs_info->reada_lock);
467 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
468 spin_unlock(&fs_info->reada_lock);
469 return;
470 }
471
472 radix_tree_delete(&fs_info->reada_tree, index);
473 for (i = 0; i < re->nzones; ++i) {
474 struct reada_zone *zone = re->zones[i];
475
476 radix_tree_delete(&zone->device->reada_extents, index);
477 }
478
479 spin_unlock(&fs_info->reada_lock);
480
481 for (i = 0; i < re->nzones; ++i) {
482 struct reada_zone *zone = re->zones[i];
483
484 kref_get(&zone->refcnt);
485 spin_lock(&zone->lock);
486 --zone->elems;
487 if (zone->elems == 0) {
488 /* no fs_info->reada_lock needed, as this can't be
489 * the last ref */
490 kref_put(&zone->refcnt, reada_zone_release);
491 }
492 spin_unlock(&zone->lock);
493
494 spin_lock(&fs_info->reada_lock);
495 kref_put(&zone->refcnt, reada_zone_release);
496 spin_unlock(&fs_info->reada_lock);
497 }
498 if (re->scheduled_for)
499 atomic_dec(&re->scheduled_for->reada_in_flight);
500
501 kfree(re);
502}
503
504static void reada_zone_release(struct kref *kref)
505{
506 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
507
508 radix_tree_delete(&zone->device->reada_zones,
509 zone->end >> PAGE_CACHE_SHIFT);
510
511 kfree(zone);
512}
513
514static void reada_control_release(struct kref *kref)
515{
516 struct reada_control *rc = container_of(kref, struct reada_control,
517 refcnt);
518
519 kfree(rc);
520}
521
522static int reada_add_block(struct reada_control *rc, u64 logical,
523 struct btrfs_key *top, int level, u64 generation)
524{
525 struct btrfs_root *root = rc->root;
526 struct reada_extent *re;
527 struct reada_extctl *rec;
528
529 re = reada_find_extent(root, logical, top, level); /* takes one ref */
530 if (!re)
531 return -1;
532
533 rec = kzalloc(sizeof(*rec), GFP_NOFS);
534 if (!rec) {
535 reada_extent_put(root->fs_info, re);
536 return -1;
537 }
538
539 rec->rc = rc;
540 rec->generation = generation;
541 atomic_inc(&rc->elems);
542
543 spin_lock(&re->lock);
544 list_add_tail(&rec->list, &re->extctl);
545 spin_unlock(&re->lock);
546
547 /* leave the ref on the extent */
548
549 return 0;
550}
551
552/*
553 * called with fs_info->reada_lock held
554 */
555static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
556{
557 int i;
558 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
559
560 for (i = 0; i < zone->ndevs; ++i) {
561 struct reada_zone *peer;
562 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
563 if (peer && peer->device != zone->device)
564 peer->locked = lock;
565 }
566}
567
568/*
569 * called with fs_info->reada_lock held
570 */
571static int reada_pick_zone(struct btrfs_device *dev)
572{
573 struct reada_zone *top_zone = NULL;
574 struct reada_zone *top_locked_zone = NULL;
575 u64 top_elems = 0;
576 u64 top_locked_elems = 0;
577 unsigned long index = 0;
578 int ret;
579
580 if (dev->reada_curr_zone) {
581 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
582 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
583 dev->reada_curr_zone = NULL;
584 }
585 /* pick the zone with the most elements */
586 while (1) {
587 struct reada_zone *zone;
588
589 ret = radix_tree_gang_lookup(&dev->reada_zones,
590 (void **)&zone, index, 1);
591 if (ret == 0)
592 break;
593 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
594 if (zone->locked) {
595 if (zone->elems > top_locked_elems) {
596 top_locked_elems = zone->elems;
597 top_locked_zone = zone;
598 }
599 } else {
600 if (zone->elems > top_elems) {
601 top_elems = zone->elems;
602 top_zone = zone;
603 }
604 }
605 }
606 if (top_zone)
607 dev->reada_curr_zone = top_zone;
608 else if (top_locked_zone)
609 dev->reada_curr_zone = top_locked_zone;
610 else
611 return 0;
612
613 dev->reada_next = dev->reada_curr_zone->start;
614 kref_get(&dev->reada_curr_zone->refcnt);
615 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
616
617 return 1;
618}
619
620static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
621 struct btrfs_device *dev)
622{
623 struct reada_extent *re = NULL;
624 int mirror_num = 0;
625 struct extent_buffer *eb = NULL;
626 u64 logical;
627 u32 blocksize;
628 int ret;
629 int i;
630 int need_kick = 0;
631
632 spin_lock(&fs_info->reada_lock);
633 if (dev->reada_curr_zone == NULL) {
634 ret = reada_pick_zone(dev);
635 if (!ret) {
636 spin_unlock(&fs_info->reada_lock);
637 return 0;
638 }
639 }
640 /*
641 * FIXME currently we issue the reads one extent at a time. If we have
642 * a contiguous block of extents, we could also coagulate them or use
643 * plugging to speed things up
644 */
645 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
646 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
647 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
648 ret = reada_pick_zone(dev);
649 if (!ret) {
650 spin_unlock(&fs_info->reada_lock);
651 return 0;
652 }
653 re = NULL;
654 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
655 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
656 }
657 if (ret == 0) {
658 spin_unlock(&fs_info->reada_lock);
659 return 0;
660 }
661 dev->reada_next = re->logical + re->blocksize;
662 kref_get(&re->refcnt);
663
664 spin_unlock(&fs_info->reada_lock);
665
666 /*
667 * find mirror num
668 */
669 for (i = 0; i < re->nzones; ++i) {
670 if (re->zones[i]->device == dev) {
671 mirror_num = i + 1;
672 break;
673 }
674 }
675 logical = re->logical;
676 blocksize = re->blocksize;
677
678 spin_lock(&re->lock);
679 if (re->scheduled_for == NULL) {
680 re->scheduled_for = dev;
681 need_kick = 1;
682 }
683 spin_unlock(&re->lock);
684
685 reada_extent_put(fs_info, re);
686
687 if (!need_kick)
688 return 0;
689
690 atomic_inc(&dev->reada_in_flight);
691 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
692 mirror_num, &eb);
693 if (ret)
694 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
695 else if (eb)
696 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
697
698 if (eb)
699 free_extent_buffer(eb);
700
701 return 1;
702
703}
704
705static void reada_start_machine_worker(struct btrfs_work *work)
706{
707 struct reada_machine_work *rmw;
708 struct btrfs_fs_info *fs_info;
709
710 rmw = container_of(work, struct reada_machine_work, work);
711 fs_info = rmw->fs_info;
712
713 kfree(rmw);
714
715 __reada_start_machine(fs_info);
716}
717
718static void __reada_start_machine(struct btrfs_fs_info *fs_info)
719{
720 struct btrfs_device *device;
721 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
722 u64 enqueued;
723 u64 total = 0;
724 int i;
725
726 do {
727 enqueued = 0;
728 list_for_each_entry(device, &fs_devices->devices, dev_list) {
729 if (atomic_read(&device->reada_in_flight) <
730 MAX_IN_FLIGHT)
731 enqueued += reada_start_machine_dev(fs_info,
732 device);
733 }
734 total += enqueued;
735 } while (enqueued && total < 10000);
736
737 if (enqueued == 0)
738 return;
739
740 /*
741 * If everything is already in the cache, this is effectively single
742 * threaded. To a) not hold the caller for too long and b) to utilize
743 * more cores, we broke the loop above after 10000 iterations and now
744 * enqueue to workers to finish it. This will distribute the load to
745 * the cores.
746 */
747 for (i = 0; i < 2; ++i)
748 reada_start_machine(fs_info);
749}
750
751static void reada_start_machine(struct btrfs_fs_info *fs_info)
752{
753 struct reada_machine_work *rmw;
754
755 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
756 if (!rmw) {
757 /* FIXME we cannot handle this properly right now */
758 BUG();
759 }
760 rmw->work.func = reada_start_machine_worker;
761 rmw->fs_info = fs_info;
762
763 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
764}
765
766#ifdef DEBUG
767static void dump_devs(struct btrfs_fs_info *fs_info, int all)
768{
769 struct btrfs_device *device;
770 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
771 unsigned long index;
772 int ret;
773 int i;
774 int j;
775 int cnt;
776
777 spin_lock(&fs_info->reada_lock);
778 list_for_each_entry(device, &fs_devices->devices, dev_list) {
779 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
780 atomic_read(&device->reada_in_flight));
781 index = 0;
782 while (1) {
783 struct reada_zone *zone;
784 ret = radix_tree_gang_lookup(&device->reada_zones,
785 (void **)&zone, index, 1);
786 if (ret == 0)
787 break;
788 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
789 "%d devs", zone->start, zone->end, zone->elems,
790 zone->locked);
791 for (j = 0; j < zone->ndevs; ++j) {
792 printk(KERN_CONT " %lld",
793 zone->devs[j]->devid);
794 }
795 if (device->reada_curr_zone == zone)
796 printk(KERN_CONT " curr off %llu",
797 device->reada_next - zone->start);
798 printk(KERN_CONT "\n");
799 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
800 }
801 cnt = 0;
802 index = 0;
803 while (all) {
804 struct reada_extent *re = NULL;
805
806 ret = radix_tree_gang_lookup(&device->reada_extents,
807 (void **)&re, index, 1);
808 if (ret == 0)
809 break;
810 printk(KERN_DEBUG
811 " re: logical %llu size %u empty %d for %lld",
812 re->logical, re->blocksize,
813 list_empty(&re->extctl), re->scheduled_for ?
814 re->scheduled_for->devid : -1);
815
816 for (i = 0; i < re->nzones; ++i) {
817 printk(KERN_CONT " zone %llu-%llu devs",
818 re->zones[i]->start,
819 re->zones[i]->end);
820 for (j = 0; j < re->zones[i]->ndevs; ++j) {
821 printk(KERN_CONT " %lld",
822 re->zones[i]->devs[j]->devid);
823 }
824 }
825 printk(KERN_CONT "\n");
826 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
827 if (++cnt > 15)
828 break;
829 }
830 }
831
832 index = 0;
833 cnt = 0;
834 while (all) {
835 struct reada_extent *re = NULL;
836
837 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
838 index, 1);
839 if (ret == 0)
840 break;
841 if (!re->scheduled_for) {
842 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
843 continue;
844 }
845 printk(KERN_DEBUG
846 "re: logical %llu size %u list empty %d for %lld",
847 re->logical, re->blocksize, list_empty(&re->extctl),
848 re->scheduled_for ? re->scheduled_for->devid : -1);
849 for (i = 0; i < re->nzones; ++i) {
850 printk(KERN_CONT " zone %llu-%llu devs",
851 re->zones[i]->start,
852 re->zones[i]->end);
853 for (i = 0; i < re->nzones; ++i) {
854 printk(KERN_CONT " zone %llu-%llu devs",
855 re->zones[i]->start,
856 re->zones[i]->end);
857 for (j = 0; j < re->zones[i]->ndevs; ++j) {
858 printk(KERN_CONT " %lld",
859 re->zones[i]->devs[j]->devid);
860 }
861 }
862 }
863 printk(KERN_CONT "\n");
864 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
865 }
866 spin_unlock(&fs_info->reada_lock);
867}
868#endif
869
870/*
871 * interface
872 */
873struct reada_control *btrfs_reada_add(struct btrfs_root *root,
874 struct btrfs_key *key_start, struct btrfs_key *key_end)
875{
876 struct reada_control *rc;
877 u64 start;
878 u64 generation;
879 int level;
880 struct extent_buffer *node;
881 static struct btrfs_key max_key = {
882 .objectid = (u64)-1,
883 .type = (u8)-1,
884 .offset = (u64)-1
885 };
886
887 rc = kzalloc(sizeof(*rc), GFP_NOFS);
888 if (!rc)
889 return ERR_PTR(-ENOMEM);
890
891 rc->root = root;
892 rc->key_start = *key_start;
893 rc->key_end = *key_end;
894 atomic_set(&rc->elems, 0);
895 init_waitqueue_head(&rc->wait);
896 kref_init(&rc->refcnt);
897 kref_get(&rc->refcnt); /* one ref for having elements */
898
899 node = btrfs_root_node(root);
900 start = node->start;
901 level = btrfs_header_level(node);
902 generation = btrfs_header_generation(node);
903 free_extent_buffer(node);
904
905 reada_add_block(rc, start, &max_key, level, generation);
906
907 reada_start_machine(root->fs_info);
908
909 return rc;
910}
911
912#ifdef DEBUG
913int btrfs_reada_wait(void *handle)
914{
915 struct reada_control *rc = handle;
916
917 while (atomic_read(&rc->elems)) {
918 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
919 5 * HZ);
920 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
921 }
922
923 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
924
925 kref_put(&rc->refcnt, reada_control_release);
926
927 return 0;
928}
929#else
930int btrfs_reada_wait(void *handle)
931{
932 struct reada_control *rc = handle;
933
934 while (atomic_read(&rc->elems)) {
935 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
936 }
937
938 kref_put(&rc->refcnt, reada_control_release);
939
940 return 0;
941}
942#endif
943
944void btrfs_reada_detach(void *handle)
945{
946 struct reada_control *rc = handle;
947
948 kref_put(&rc->refcnt, reada_control_release);
949}