author     Chris Mason <chris.mason@oracle.com>   2008-01-24 16:13:08 -0500
committer  Chris Mason <chris.mason@oracle.com>   2008-09-25 11:03:59 -0400
commit     d1310b2e0cd98eb1348553e69b73827b436dca7b
tree       f55e9c380df0f3a7e9fb792b2308b52b632b90aa /fs/btrfs/extent_io.c
parent     5f56406aabdf5444d040c5955effc665b1d0dbaf
Btrfs: Split the extent_map code into two parts

There is now extent_map for mapping offsets in the file to disk, and
extent_io for state tracking, IO submission and extent_buffers. The new
extent_map code shifts from [start, end] pairs to [start, len] and pushes
the locking out into the caller. This allows a few performance
optimizations and is easier to use.

A number of extent_map usage bugs were fixed, mostly with failing to
remove extent_map entries when changing the file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
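The split is easiest to see from the caller's side. Below is a minimal, hypothetical
sketch (not part of this commit) of how a caller would use the new extent_map half:
peek_mapping() is a made-up helper, and lookup_extent_mapping() plus the em_tree
spinlock are assumed from the extent_map.[ch] side of this split; free_extent_map()
and extent_map_end() are the helpers extent_io.c itself calls further down.

#include "extent_map.h"

/*
 * Illustrative sketch only.  Ranges are now described as [start, len)
 * rather than an inclusive [start, end], and the caller takes the map
 * tree lock itself instead of relying on the tree to lock internally.
 */
static struct extent_map *peek_mapping(struct extent_map_tree *em_tree,
				       u64 start, u64 len)
{
	struct extent_map *em;

	spin_lock(&em_tree->lock);	/* locking pushed out to the caller */
	em = lookup_extent_mapping(em_tree, start, len);
	spin_unlock(&em_tree->lock);

	if (em)
		printk("mapping %llu-%llu\n",
		       (unsigned long long)em->start,
		       (unsigned long long)extent_map_end(em));

	/* the caller keeps the reference and must free_extent_map() it */
	return em;
}

Compare this with the pre-split code, where the inclusive [start, end] range and the
tree's own locking were handled inside the extent_map calls themselves.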
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--  fs/btrfs/extent_io.c  3089
1 file changed, 3089 insertions, 0 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..15cc158a0498
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3089 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17
18/* temporary define until extent_map moves out of btrfs */
19struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
20 unsigned long extra_flags,
21 void (*ctor)(void *, struct kmem_cache *,
22 unsigned long));
23
24static struct kmem_cache *extent_state_cache;
25static struct kmem_cache *extent_buffer_cache;
26
27static LIST_HEAD(buffers);
28static LIST_HEAD(states);
29
30static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
31#define BUFFER_LRU_MAX 64
32
33struct tree_entry {
34 u64 start;
35 u64 end;
36 int in_tree;
37 struct rb_node rb_node;
38};
39
40struct extent_page_data {
41 struct bio *bio;
42 struct extent_io_tree *tree;
43 get_extent_t *get_extent;
44};
45
46int __init extent_io_init(void)
47{
48 extent_state_cache = btrfs_cache_create("extent_state",
49 sizeof(struct extent_state), 0,
50 NULL);
51 if (!extent_state_cache)
52 return -ENOMEM;
53
54 extent_buffer_cache = btrfs_cache_create("extent_buffers",
55 sizeof(struct extent_buffer), 0,
56 NULL);
57 if (!extent_buffer_cache)
58 goto free_state_cache;
59 return 0;
60
61free_state_cache:
62 kmem_cache_destroy(extent_state_cache);
63 return -ENOMEM;
64}
65
66void extent_io_exit(void)
67{
68 struct extent_state *state;
69
70 while (!list_empty(&states)) {
71 state = list_entry(states.next, struct extent_state, list);
72 printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
73 list_del(&state->list);
74 kmem_cache_free(extent_state_cache, state);
75
76 }
77
78 if (extent_state_cache)
79 kmem_cache_destroy(extent_state_cache);
80 if (extent_buffer_cache)
81 kmem_cache_destroy(extent_buffer_cache);
82}
83
84void extent_io_tree_init(struct extent_io_tree *tree,
85 struct address_space *mapping, gfp_t mask)
86{
87 tree->state.rb_node = NULL;
88 tree->ops = NULL;
89 tree->dirty_bytes = 0;
90 rwlock_init(&tree->lock);
91 spin_lock_init(&tree->lru_lock);
92 tree->mapping = mapping;
93 INIT_LIST_HEAD(&tree->buffer_lru);
94 tree->lru_size = 0;
95}
96EXPORT_SYMBOL(extent_io_tree_init);
97
98void extent_io_tree_empty_lru(struct extent_io_tree *tree)
99{
100 struct extent_buffer *eb;
101 while(!list_empty(&tree->buffer_lru)) {
102 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
103 lru);
104 list_del_init(&eb->lru);
105 free_extent_buffer(eb);
106 }
107}
108EXPORT_SYMBOL(extent_io_tree_empty_lru);
109
110struct extent_state *alloc_extent_state(gfp_t mask)
111{
112 struct extent_state *state;
113 unsigned long flags;
114
115 state = kmem_cache_alloc(extent_state_cache, mask);
116 if (!state || IS_ERR(state))
117 return state;
118 state->state = 0;
119 state->in_tree = 0;
120 state->private = 0;
121
122 spin_lock_irqsave(&state_lock, flags);
123 list_add(&state->list, &states);
124 spin_unlock_irqrestore(&state_lock, flags);
125
126 atomic_set(&state->refs, 1);
127 init_waitqueue_head(&state->wq);
128 return state;
129}
130EXPORT_SYMBOL(alloc_extent_state);
131
132void free_extent_state(struct extent_state *state)
133{
134 unsigned long flags;
135 if (!state)
136 return;
137 if (atomic_dec_and_test(&state->refs)) {
138 WARN_ON(state->in_tree);
139 spin_lock_irqsave(&state_lock, flags);
140 list_del(&state->list);
141 spin_unlock_irqrestore(&state_lock, flags);
142 kmem_cache_free(extent_state_cache, state);
143 }
144}
145EXPORT_SYMBOL(free_extent_state);
146
147static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
148 struct rb_node *node)
149{
150 struct rb_node ** p = &root->rb_node;
151 struct rb_node * parent = NULL;
152 struct tree_entry *entry;
153
154 while(*p) {
155 parent = *p;
156 entry = rb_entry(parent, struct tree_entry, rb_node);
157
158 if (offset < entry->start)
159 p = &(*p)->rb_left;
160 else if (offset > entry->end)
161 p = &(*p)->rb_right;
162 else
163 return parent;
164 }
165
166 entry = rb_entry(node, struct tree_entry, rb_node);
167 entry->in_tree = 1;
168 rb_link_node(node, parent, p);
169 rb_insert_color(node, root);
170 return NULL;
171}
172
173static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
174 struct rb_node **prev_ret,
175 struct rb_node **next_ret)
176{
177 struct rb_node * n = root->rb_node;
178 struct rb_node *prev = NULL;
179 struct rb_node *orig_prev = NULL;
180 struct tree_entry *entry;
181 struct tree_entry *prev_entry = NULL;
182
183 while(n) {
184 entry = rb_entry(n, struct tree_entry, rb_node);
185 prev = n;
186 prev_entry = entry;
187
188 if (offset < entry->start)
189 n = n->rb_left;
190 else if (offset > entry->end)
191 n = n->rb_right;
192 else
193 return n;
194 }
195
196 if (prev_ret) {
197 orig_prev = prev;
198 while(prev && offset > prev_entry->end) {
199 prev = rb_next(prev);
200 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
201 }
202 *prev_ret = prev;
203 prev = orig_prev;
204 }
205
206 if (next_ret) {
207 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
208 while(prev && offset < prev_entry->start) {
209 prev = rb_prev(prev);
210 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211 }
212 *next_ret = prev;
213 }
214 return NULL;
215}
216
217static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
218{
219 struct rb_node *prev;
220 struct rb_node *ret;
221 ret = __tree_search(root, offset, &prev, NULL);
222 if (!ret)
223 return prev;
224 return ret;
225}
226
227/*
228 * utility function to look for merge candidates inside a given range.
229 * Any extents with matching state are merged together into a single
230 * extent in the tree. Extents with EXTENT_IO in their state field
231 * are not merged because the end_io handlers need to be able to do
232 * operations on them without sleeping (or doing allocations/splits).
233 *
234 * This should be called with the tree lock held.
235 */
236static int merge_state(struct extent_io_tree *tree,
237 struct extent_state *state)
238{
239 struct extent_state *other;
240 struct rb_node *other_node;
241
242 if (state->state & EXTENT_IOBITS)
243 return 0;
244
245 other_node = rb_prev(&state->rb_node);
246 if (other_node) {
247 other = rb_entry(other_node, struct extent_state, rb_node);
248 if (other->end == state->start - 1 &&
249 other->state == state->state) {
250 state->start = other->start;
251 other->in_tree = 0;
252 rb_erase(&other->rb_node, &tree->state);
253 free_extent_state(other);
254 }
255 }
256 other_node = rb_next(&state->rb_node);
257 if (other_node) {
258 other = rb_entry(other_node, struct extent_state, rb_node);
259 if (other->start == state->end + 1 &&
260 other->state == state->state) {
261 other->start = state->start;
262 state->in_tree = 0;
263 rb_erase(&state->rb_node, &tree->state);
264 free_extent_state(state);
265 }
266 }
267 return 0;
268}
269
270/*
271 * insert an extent_state struct into the tree. 'bits' are set on the
272 * struct before it is inserted.
273 *
274 * This may return -EEXIST if the extent is already there, in which case the
275 * state struct is freed.
276 *
277 * The tree lock is not taken internally. This is a utility function and
278 * probably isn't what you want to call (see set/clear_extent_bit).
279 */
280static int insert_state(struct extent_io_tree *tree,
281 struct extent_state *state, u64 start, u64 end,
282 int bits)
283{
284 struct rb_node *node;
285
286 if (end < start) {
287 printk("end < start %Lu %Lu\n", end, start);
288 WARN_ON(1);
289 }
290 if (bits & EXTENT_DIRTY)
291 tree->dirty_bytes += end - start + 1;
292 state->state |= bits;
293 state->start = start;
294 state->end = end;
295 node = tree_insert(&tree->state, end, &state->rb_node);
296 if (node) {
297 struct extent_state *found;
298 found = rb_entry(node, struct extent_state, rb_node);
299 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
300 free_extent_state(state);
301 return -EEXIST;
302 }
303 merge_state(tree, state);
304 return 0;
305}
306
307/*
308 * split a given extent state struct in two, inserting the preallocated
309 * struct 'prealloc' as the newly created second half. 'split' indicates an
310 * offset inside 'orig' where it should be split.
311 *
312 * Before calling,
313 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
314 * are two extent state structs in the tree:
315 * prealloc: [orig->start, split - 1]
316 * orig: [ split, orig->end ]
317 *
318 * The tree locks are not taken by this function. They need to be held
319 * by the caller.
320 */
321static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
322 struct extent_state *prealloc, u64 split)
323{
324 struct rb_node *node;
325 prealloc->start = orig->start;
326 prealloc->end = split - 1;
327 prealloc->state = orig->state;
328 orig->start = split;
329
330 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
331 if (node) {
332 struct extent_state *found;
333 found = rb_entry(node, struct extent_state, rb_node);
334 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
335 free_extent_state(prealloc);
336 return -EEXIST;
337 }
338 return 0;
339}
340
341/*
342 * utility function to clear some bits in an extent state struct.
343 * it will optionally wake up any one waiting on this state (wake == 1), or
344 * forcibly remove the state from the tree (delete == 1).
345 *
346 * If no bits are set on the state struct after clearing things, the
347 * struct is freed and removed from the tree
348 */
349static int clear_state_bit(struct extent_io_tree *tree,
350 struct extent_state *state, int bits, int wake,
351 int delete)
352{
353 int ret = state->state & bits;
354
355 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
356 u64 range = state->end - state->start + 1;
357 WARN_ON(range > tree->dirty_bytes);
358 tree->dirty_bytes -= range;
359 }
360 state->state &= ~bits;
361 if (wake)
362 wake_up(&state->wq);
363 if (delete || state->state == 0) {
364 if (state->in_tree) {
365 rb_erase(&state->rb_node, &tree->state);
366 state->in_tree = 0;
367 free_extent_state(state);
368 } else {
369 WARN_ON(1);
370 }
371 } else {
372 merge_state(tree, state);
373 }
374 return ret;
375}
376
377/*
378 * clear some bits on a range in the tree. This may require splitting
379 * or inserting elements in the tree, so the gfp mask is used to
380 * indicate which allocations or sleeping are allowed.
381 *
382 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
383 * the given range from the tree regardless of state (ie for truncate).
384 *
385 * the range [start, end] is inclusive.
386 *
387 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
388 * bits were already set, or zero if none of the bits were already set.
389 */
390int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
391 int bits, int wake, int delete, gfp_t mask)
392{
393 struct extent_state *state;
394 struct extent_state *prealloc = NULL;
395 struct rb_node *node;
396 unsigned long flags;
397 int err;
398 int set = 0;
399
400again:
401 if (!prealloc && (mask & __GFP_WAIT)) {
402 prealloc = alloc_extent_state(mask);
403 if (!prealloc)
404 return -ENOMEM;
405 }
406
407 write_lock_irqsave(&tree->lock, flags);
408 /*
409 * this search will find the extents that end after
410 * our range starts
411 */
412 node = tree_search(&tree->state, start);
413 if (!node)
414 goto out;
415 state = rb_entry(node, struct extent_state, rb_node);
416 if (state->start > end)
417 goto out;
418 WARN_ON(state->end < start);
419
420 /*
421 * | ---- desired range ---- |
422 * | state | or
423 * | ------------- state -------------- |
424 *
425 * We need to split the extent we found, and may flip
426 * bits on second half.
427 *
428 * If the extent we found extends past our range, we
429 * just split and search again. It'll get split again
430 * the next time though.
431 *
432 * If the extent we found is inside our range, we clear
433 * the desired bit on it.
434 */
435
436 if (state->start < start) {
437 err = split_state(tree, state, prealloc, start);
438 BUG_ON(err == -EEXIST);
439 prealloc = NULL;
440 if (err)
441 goto out;
442 if (state->end <= end) {
443 start = state->end + 1;
444 set |= clear_state_bit(tree, state, bits,
445 wake, delete);
446 } else {
447 start = state->start;
448 }
449 goto search_again;
450 }
451 /*
452 * | ---- desired range ---- |
453 * | state |
454 * We need to split the extent, and clear the bit
455 * on the first half
456 */
457 if (state->start <= end && state->end > end) {
458 err = split_state(tree, state, prealloc, end + 1);
459 BUG_ON(err == -EEXIST);
460
461 if (wake)
462 wake_up(&state->wq);
463 set |= clear_state_bit(tree, prealloc, bits,
464 wake, delete);
465 prealloc = NULL;
466 goto out;
467 }
468
469 start = state->end + 1;
470 set |= clear_state_bit(tree, state, bits, wake, delete);
471 goto search_again;
472
473out:
474 write_unlock_irqrestore(&tree->lock, flags);
475 if (prealloc)
476 free_extent_state(prealloc);
477
478 return set;
479
480search_again:
481 if (start > end)
482 goto out;
483 write_unlock_irqrestore(&tree->lock, flags);
484 if (mask & __GFP_WAIT)
485 cond_resched();
486 goto again;
487}
488EXPORT_SYMBOL(clear_extent_bit);
489
490static int wait_on_state(struct extent_io_tree *tree,
491 struct extent_state *state)
492{
493 DEFINE_WAIT(wait);
494 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
495 read_unlock_irq(&tree->lock);
496 schedule();
497 read_lock_irq(&tree->lock);
498 finish_wait(&state->wq, &wait);
499 return 0;
500}
501
502/*
503 * waits for one or more bits to clear on a range in the state tree.
504 * The range [start, end] is inclusive.
505 * The tree lock is taken by this function
506 */
507int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
508{
509 struct extent_state *state;
510 struct rb_node *node;
511
512 read_lock_irq(&tree->lock);
513again:
514 while (1) {
515 /*
516 * this search will find all the extents that end after
517 * our range starts
518 */
519 node = tree_search(&tree->state, start);
520 if (!node)
521 break;
522
523 state = rb_entry(node, struct extent_state, rb_node);
524
525 if (state->start > end)
526 goto out;
527
528 if (state->state & bits) {
529 start = state->start;
530 atomic_inc(&state->refs);
531 wait_on_state(tree, state);
532 free_extent_state(state);
533 goto again;
534 }
535 start = state->end + 1;
536
537 if (start > end)
538 break;
539
540 if (need_resched()) {
541 read_unlock_irq(&tree->lock);
542 cond_resched();
543 read_lock_irq(&tree->lock);
544 }
545 }
546out:
547 read_unlock_irq(&tree->lock);
548 return 0;
549}
550EXPORT_SYMBOL(wait_extent_bit);
551
552static void set_state_bits(struct extent_io_tree *tree,
553 struct extent_state *state,
554 int bits)
555{
556 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
557 u64 range = state->end - state->start + 1;
558 tree->dirty_bytes += range;
559 }
560 state->state |= bits;
561}
562
563/*
564 * set some bits on a range in the tree. This may require allocations
565 * or sleeping, so the gfp mask is used to indicate what is allowed.
566 *
567 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
568 * range already has the desired bits set. The start of the existing
569 * range is returned in failed_start in this case.
570 *
571 * [start, end] is inclusive
572 * This takes the tree lock.
573 */
574int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
575 int exclusive, u64 *failed_start, gfp_t mask)
576{
577 struct extent_state *state;
578 struct extent_state *prealloc = NULL;
579 struct rb_node *node;
580 unsigned long flags;
581 int err = 0;
582 int set;
583 u64 last_start;
584 u64 last_end;
585again:
586 if (!prealloc && (mask & __GFP_WAIT)) {
587 prealloc = alloc_extent_state(mask);
588 if (!prealloc)
589 return -ENOMEM;
590 }
591
592 write_lock_irqsave(&tree->lock, flags);
593 /*
594 * this search will find all the extents that end after
595 * our range starts.
596 */
597 node = tree_search(&tree->state, start);
598 if (!node) {
599 err = insert_state(tree, prealloc, start, end, bits);
600 prealloc = NULL;
601 BUG_ON(err == -EEXIST);
602 goto out;
603 }
604
605 state = rb_entry(node, struct extent_state, rb_node);
606 last_start = state->start;
607 last_end = state->end;
608
609 /*
610 * | ---- desired range ---- |
611 * | state |
612 *
613 * Just lock what we found and keep going
614 */
615 if (state->start == start && state->end <= end) {
616 set = state->state & bits;
617 if (set && exclusive) {
618 *failed_start = state->start;
619 err = -EEXIST;
620 goto out;
621 }
622 set_state_bits(tree, state, bits);
623 start = state->end + 1;
624 merge_state(tree, state);
625 goto search_again;
626 }
627
628 /*
629 * | ---- desired range ---- |
630 * | state |
631 * or
632 * | ------------- state -------------- |
633 *
634 * We need to split the extent we found, and may flip bits on
635 * second half.
636 *
637 * If the extent we found extends past our
638 * range, we just split and search again. It'll get split
639 * again the next time though.
640 *
641 * If the extent we found is inside our range, we set the
642 * desired bit on it.
643 */
644 if (state->start < start) {
645 set = state->state & bits;
646 if (exclusive && set) {
647 *failed_start = start;
648 err = -EEXIST;
649 goto out;
650 }
651 err = split_state(tree, state, prealloc, start);
652 BUG_ON(err == -EEXIST);
653 prealloc = NULL;
654 if (err)
655 goto out;
656 if (state->end <= end) {
657 set_state_bits(tree, state, bits);
658 start = state->end + 1;
659 merge_state(tree, state);
660 } else {
661 start = state->start;
662 }
663 goto search_again;
664 }
665 /*
666 * | ---- desired range ---- |
667 * | state | or | state |
668 *
669 * There's a hole, we need to insert something in it and
670 * ignore the extent we found.
671 */
672 if (state->start > start) {
673 u64 this_end;
674 if (end < last_start)
675 this_end = end;
676 else
677 this_end = last_start -1;
678 err = insert_state(tree, prealloc, start, this_end,
679 bits);
680 prealloc = NULL;
681 BUG_ON(err == -EEXIST);
682 if (err)
683 goto out;
684 start = this_end + 1;
685 goto search_again;
686 }
687 /*
688 * | ---- desired range ---- |
689 * | state |
690 * We need to split the extent, and set the bit
691 * on the first half
692 */
693 if (state->start <= end && state->end > end) {
694 set = state->state & bits;
695 if (exclusive && set) {
696 *failed_start = start;
697 err = -EEXIST;
698 goto out;
699 }
700 err = split_state(tree, state, prealloc, end + 1);
701 BUG_ON(err == -EEXIST);
702
703 set_state_bits(tree, prealloc, bits);
704 merge_state(tree, prealloc);
705 prealloc = NULL;
706 goto out;
707 }
708
709 goto search_again;
710
711out:
712 write_unlock_irqrestore(&tree->lock, flags);
713 if (prealloc)
714 free_extent_state(prealloc);
715
716 return err;
717
718search_again:
719 if (start > end)
720 goto out;
721 write_unlock_irqrestore(&tree->lock, flags);
722 if (mask & __GFP_WAIT)
723 cond_resched();
724 goto again;
725}
726EXPORT_SYMBOL(set_extent_bit);
727
728/* wrappers around set/clear extent bit */
729int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
730 gfp_t mask)
731{
732 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
733 mask);
734}
735EXPORT_SYMBOL(set_extent_dirty);
736
737int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
738 int bits, gfp_t mask)
739{
740 return set_extent_bit(tree, start, end, bits, 0, NULL,
741 mask);
742}
743EXPORT_SYMBOL(set_extent_bits);
744
745int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
746 int bits, gfp_t mask)
747{
748 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
749}
750EXPORT_SYMBOL(clear_extent_bits);
751
752int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
753 gfp_t mask)
754{
755 return set_extent_bit(tree, start, end,
756 EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
757 mask);
758}
759EXPORT_SYMBOL(set_extent_delalloc);
760
761int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
762 gfp_t mask)
763{
764 return clear_extent_bit(tree, start, end,
765 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
766}
767EXPORT_SYMBOL(clear_extent_dirty);
768
769int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
770 gfp_t mask)
771{
772 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
773 mask);
774}
775EXPORT_SYMBOL(set_extent_new);
776
777int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
778 gfp_t mask)
779{
780 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
781}
782EXPORT_SYMBOL(clear_extent_new);
783
784int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
785 gfp_t mask)
786{
787 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
788 mask);
789}
790EXPORT_SYMBOL(set_extent_uptodate);
791
792int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
793 gfp_t mask)
794{
795 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
796}
797EXPORT_SYMBOL(clear_extent_uptodate);
798
799int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
800 gfp_t mask)
801{
802 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
803 0, NULL, mask);
804}
805EXPORT_SYMBOL(set_extent_writeback);
806
807int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
808 gfp_t mask)
809{
810 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
811}
812EXPORT_SYMBOL(clear_extent_writeback);
813
814int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
815{
816 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
817}
818EXPORT_SYMBOL(wait_on_extent_writeback);
819
820/*
821 * locks a range in ascending order, waiting for any locked regions
822 * it hits on the way. [start,end] are inclusive, and this will sleep.
823 */
824int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
825{
826 int err;
827 u64 failed_start;
828 while (1) {
829 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
830 &failed_start, mask);
831 if (err == -EEXIST && (mask & __GFP_WAIT)) {
832 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
833 start = failed_start;
834 } else {
835 break;
836 }
837 WARN_ON(start > end);
838 }
839 return err;
840}
841EXPORT_SYMBOL(lock_extent);
842
843int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
844 gfp_t mask)
845{
846 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
847}
848EXPORT_SYMBOL(unlock_extent);
849
850/*
851 * helper function to set pages and extents in the tree dirty
852 */
853int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
854{
855 unsigned long index = start >> PAGE_CACHE_SHIFT;
856 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
857 struct page *page;
858
859 while (index <= end_index) {
860 page = find_get_page(tree->mapping, index);
861 BUG_ON(!page);
862 __set_page_dirty_nobuffers(page);
863 page_cache_release(page);
864 index++;
865 }
866 set_extent_dirty(tree, start, end, GFP_NOFS);
867 return 0;
868}
869EXPORT_SYMBOL(set_range_dirty);
870
871/*
872 * helper function to set both pages and extents in the tree writeback
873 */
874int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
875{
876 unsigned long index = start >> PAGE_CACHE_SHIFT;
877 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
878 struct page *page;
879
880 while (index <= end_index) {
881 page = find_get_page(tree->mapping, index);
882 BUG_ON(!page);
883 set_page_writeback(page);
884 page_cache_release(page);
885 index++;
886 }
887 set_extent_writeback(tree, start, end, GFP_NOFS);
888 return 0;
889}
890EXPORT_SYMBOL(set_range_writeback);
891
892int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
893 u64 *start_ret, u64 *end_ret, int bits)
894{
895 struct rb_node *node;
896 struct extent_state *state;
897 int ret = 1;
898
899 read_lock_irq(&tree->lock);
900 /*
901 * this search will find all the extents that end after
902 * our range starts.
903 */
904 node = tree_search(&tree->state, start);
905 if (!node || IS_ERR(node)) {
906 goto out;
907 }
908
909 while(1) {
910 state = rb_entry(node, struct extent_state, rb_node);
911 if (state->end >= start && (state->state & bits)) {
912 *start_ret = state->start;
913 *end_ret = state->end;
914 ret = 0;
915 break;
916 }
917 node = rb_next(node);
918 if (!node)
919 break;
920 }
921out:
922 read_unlock_irq(&tree->lock);
923 return ret;
924}
925EXPORT_SYMBOL(find_first_extent_bit);
926
927u64 find_lock_delalloc_range(struct extent_io_tree *tree,
928 u64 *start, u64 *end, u64 max_bytes)
929{
930 struct rb_node *node;
931 struct extent_state *state;
932 u64 cur_start = *start;
933 u64 found = 0;
934 u64 total_bytes = 0;
935
936 write_lock_irq(&tree->lock);
937 /*
938 * this search will find all the extents that end after
939 * our range starts.
940 */
941search_again:
942 node = tree_search(&tree->state, cur_start);
943 if (!node || IS_ERR(node)) {
944 *end = (u64)-1;
945 goto out;
946 }
947
948 while(1) {
949 state = rb_entry(node, struct extent_state, rb_node);
950 if (found && state->start != cur_start) {
951 goto out;
952 }
953 if (!(state->state & EXTENT_DELALLOC)) {
954 if (!found)
955 *end = state->end;
956 goto out;
957 }
958 if (!found) {
959 struct extent_state *prev_state;
960 struct rb_node *prev_node = node;
961 while(1) {
962 prev_node = rb_prev(prev_node);
963 if (!prev_node)
964 break;
965 prev_state = rb_entry(prev_node,
966 struct extent_state,
967 rb_node);
968 if (!(prev_state->state & EXTENT_DELALLOC))
969 break;
970 state = prev_state;
971 node = prev_node;
972 }
973 }
974 if (state->state & EXTENT_LOCKED) {
975 DEFINE_WAIT(wait);
976 atomic_inc(&state->refs);
977 prepare_to_wait(&state->wq, &wait,
978 TASK_UNINTERRUPTIBLE);
979 write_unlock_irq(&tree->lock);
980 schedule();
981 write_lock_irq(&tree->lock);
982 finish_wait(&state->wq, &wait);
983 free_extent_state(state);
984 goto search_again;
985 }
986 state->state |= EXTENT_LOCKED;
987 if (!found)
988 *start = state->start;
989 found++;
990 *end = state->end;
991 cur_start = state->end + 1;
992 node = rb_next(node);
993 if (!node)
994 break;
995 total_bytes += state->end - state->start + 1;
996 if (total_bytes >= max_bytes)
997 break;
998 }
999out:
1000 write_unlock_irq(&tree->lock);
1001 return found;
1002}
1003
1004u64 count_range_bits(struct extent_io_tree *tree,
1005 u64 *start, u64 search_end, u64 max_bytes,
1006 unsigned long bits)
1007{
1008 struct rb_node *node;
1009 struct extent_state *state;
1010 u64 cur_start = *start;
1011 u64 total_bytes = 0;
1012 int found = 0;
1013
1014 if (search_end <= cur_start) {
1015 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1016 WARN_ON(1);
1017 return 0;
1018 }
1019
1020 write_lock_irq(&tree->lock);
1021 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1022 total_bytes = tree->dirty_bytes;
1023 goto out;
1024 }
1025 /*
1026 * this search will find all the extents that end after
1027 * our range starts.
1028 */
1029 node = tree_search(&tree->state, cur_start);
1030 if (!node || IS_ERR(node)) {
1031 goto out;
1032 }
1033
1034 while(1) {
1035 state = rb_entry(node, struct extent_state, rb_node);
1036 if (state->start > search_end)
1037 break;
1038 if (state->end >= cur_start && (state->state & bits)) {
1039 total_bytes += min(search_end, state->end) + 1 -
1040 max(cur_start, state->start);
1041 if (total_bytes >= max_bytes)
1042 break;
1043 if (!found) {
1044 *start = state->start;
1045 found = 1;
1046 }
1047 }
1048 node = rb_next(node);
1049 if (!node)
1050 break;
1051 }
1052out:
1053 write_unlock_irq(&tree->lock);
1054 return total_bytes;
1055}
1056/*
1057 * helper function to lock both pages and extents in the tree.
1058 * pages must be locked first.
1059 */
1060int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1061{
1062 unsigned long index = start >> PAGE_CACHE_SHIFT;
1063 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1064 struct page *page;
1065 int err;
1066
1067 while (index <= end_index) {
1068 page = grab_cache_page(tree->mapping, index);
1069 if (!page) {
1070 err = -ENOMEM;
1071 goto failed;
1072 }
1073 if (IS_ERR(page)) {
1074 err = PTR_ERR(page);
1075 goto failed;
1076 }
1077 index++;
1078 }
1079 lock_extent(tree, start, end, GFP_NOFS);
1080 return 0;
1081
1082failed:
1083 /*
1084 * we failed above in getting the page at 'index', so we undo here
1085 * up to but not including the page at 'index'
1086 */
1087 end_index = index;
1088 index = start >> PAGE_CACHE_SHIFT;
1089 while (index < end_index) {
1090 page = find_get_page(tree->mapping, index);
1091 unlock_page(page);
1092 page_cache_release(page);
1093 index++;
1094 }
1095 return err;
1096}
1097EXPORT_SYMBOL(lock_range);
1098
1099/*
1100 * helper function to unlock both pages and extents in the tree.
1101 */
1102int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1103{
1104 unsigned long index = start >> PAGE_CACHE_SHIFT;
1105 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1106 struct page *page;
1107
1108 while (index <= end_index) {
1109 page = find_get_page(tree->mapping, index);
1110 unlock_page(page);
1111 page_cache_release(page);
1112 index++;
1113 }
1114 unlock_extent(tree, start, end, GFP_NOFS);
1115 return 0;
1116}
1117EXPORT_SYMBOL(unlock_range);
1118
1119int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1120{
1121 struct rb_node *node;
1122 struct extent_state *state;
1123 int ret = 0;
1124
1125 write_lock_irq(&tree->lock);
1126 /*
1127 * this search will find all the extents that end after
1128 * our range starts.
1129 */
1130 node = tree_search(&tree->state, start);
1131 if (!node || IS_ERR(node)) {
1132 ret = -ENOENT;
1133 goto out;
1134 }
1135 state = rb_entry(node, struct extent_state, rb_node);
1136 if (state->start != start) {
1137 ret = -ENOENT;
1138 goto out;
1139 }
1140 state->private = private;
1141out:
1142 write_unlock_irq(&tree->lock);
1143 return ret;
1144}
1145
1146int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1147{
1148 struct rb_node *node;
1149 struct extent_state *state;
1150 int ret = 0;
1151
1152 read_lock_irq(&tree->lock);
1153 /*
1154 * this search will find all the extents that end after
1155 * our range starts.
1156 */
1157 node = tree_search(&tree->state, start);
1158 if (!node || IS_ERR(node)) {
1159 ret = -ENOENT;
1160 goto out;
1161 }
1162 state = rb_entry(node, struct extent_state, rb_node);
1163 if (state->start != start) {
1164 ret = -ENOENT;
1165 goto out;
1166 }
1167 *private = state->private;
1168out:
1169 read_unlock_irq(&tree->lock);
1170 return ret;
1171}
1172
1173/*
1174 * searches a range in the state tree for a given mask.
1175 * If 'filled' == 1, this returns 1 only if every extent in the tree
1176 * has the bits set. Otherwise, 1 is returned if any bit in the
1177 * range is found set.
1178 */
1179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1180 int bits, int filled)
1181{
1182 struct extent_state *state = NULL;
1183 struct rb_node *node;
1184 int bitset = 0;
1185 unsigned long flags;
1186
1187 read_lock_irqsave(&tree->lock, flags);
1188 node = tree_search(&tree->state, start);
1189 while (node && start <= end) {
1190 state = rb_entry(node, struct extent_state, rb_node);
1191
1192 if (filled && state->start > start) {
1193 bitset = 0;
1194 break;
1195 }
1196
1197 if (state->start > end)
1198 break;
1199
1200 if (state->state & bits) {
1201 bitset = 1;
1202 if (!filled)
1203 break;
1204 } else if (filled) {
1205 bitset = 0;
1206 break;
1207 }
1208 start = state->end + 1;
1209 if (start > end)
1210 break;
1211 node = rb_next(node);
1212 if (!node) {
1213 if (filled)
1214 bitset = 0;
1215 break;
1216 }
1217 }
1218 read_unlock_irqrestore(&tree->lock, flags);
1219 return bitset;
1220}
1221EXPORT_SYMBOL(test_range_bit);
1222
1223/*
1224 * helper function to set a given page up to date if all the
1225 * extents in the tree for that page are up to date
1226 */
1227static int check_page_uptodate(struct extent_io_tree *tree,
1228 struct page *page)
1229{
1230 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1231 u64 end = start + PAGE_CACHE_SIZE - 1;
1232 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1233 SetPageUptodate(page);
1234 return 0;
1235}
1236
1237/*
1238 * helper function to unlock a page if all the extents in the tree
1239 * for that page are unlocked
1240 */
1241static int check_page_locked(struct extent_io_tree *tree,
1242 struct page *page)
1243{
1244 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1245 u64 end = start + PAGE_CACHE_SIZE - 1;
1246 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1247 unlock_page(page);
1248 return 0;
1249}
1250
1251/*
1252 * helper function to end page writeback if all the extents
1253 * in the tree for that page are done with writeback
1254 */
1255static int check_page_writeback(struct extent_io_tree *tree,
1256 struct page *page)
1257{
1258 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1259 u64 end = start + PAGE_CACHE_SIZE - 1;
1260 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1261 end_page_writeback(page);
1262 return 0;
1263}
1264
1265/* lots and lots of room for performance fixes in the end_bio funcs */
1266
1267/*
1268 * after a writepage IO is done, we need to:
1269 * clear the uptodate bits on error
1270 * clear the writeback bits in the extent tree for this IO
1271 * end_page_writeback if the page has no more pending IO
1272 *
1273 * Scheduling is not allowed, so the extent state tree is expected
1274 * to have one and only one object corresponding to this IO.
1275 */
1276#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1277static void end_bio_extent_writepage(struct bio *bio, int err)
1278#else
1279static int end_bio_extent_writepage(struct bio *bio,
1280 unsigned int bytes_done, int err)
1281#endif
1282{
1283 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1284 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1285 struct extent_io_tree *tree = bio->bi_private;
1286 u64 start;
1287 u64 end;
1288 int whole_page;
1289
1290#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1291 if (bio->bi_size)
1292 return 1;
1293#endif
1294
1295 do {
1296 struct page *page = bvec->bv_page;
1297 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1298 bvec->bv_offset;
1299 end = start + bvec->bv_len - 1;
1300
1301 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1302 whole_page = 1;
1303 else
1304 whole_page = 0;
1305
1306 if (--bvec >= bio->bi_io_vec)
1307 prefetchw(&bvec->bv_page->flags);
1308
1309 if (!uptodate) {
1310 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1311 ClearPageUptodate(page);
1312 SetPageError(page);
1313 }
1314 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1315
1316 if (whole_page)
1317 end_page_writeback(page);
1318 else
1319 check_page_writeback(tree, page);
1320 if (tree->ops && tree->ops->writepage_end_io_hook)
1321 tree->ops->writepage_end_io_hook(page, start, end);
1322 } while (bvec >= bio->bi_io_vec);
1323
1324 bio_put(bio);
1325#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1326 return 0;
1327#endif
1328}
1329
1330/*
1331 * after a readpage IO is done, we need to:
1332 * clear the uptodate bits on error
1333 * set the uptodate bits if things worked
1334 * set the page up to date if all extents in the tree are uptodate
1335 * clear the lock bit in the extent tree
1336 * unlock the page if there are no other extents locked for it
1337 *
1338 * Scheduling is not allowed, so the extent state tree is expected
1339 * to have one and only one object corresponding to this IO.
1340 */
1341#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1342static void end_bio_extent_readpage(struct bio *bio, int err)
1343#else
1344static int end_bio_extent_readpage(struct bio *bio,
1345 unsigned int bytes_done, int err)
1346#endif
1347{
1348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1349 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1350 struct extent_io_tree *tree = bio->bi_private;
1351 u64 start;
1352 u64 end;
1353 int whole_page;
1354 int ret;
1355
1356#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1357 if (bio->bi_size)
1358 return 1;
1359#endif
1360
1361 do {
1362 struct page *page = bvec->bv_page;
1363 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1364 bvec->bv_offset;
1365 end = start + bvec->bv_len - 1;
1366
1367 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1368 whole_page = 1;
1369 else
1370 whole_page = 0;
1371
1372 if (--bvec >= bio->bi_io_vec)
1373 prefetchw(&bvec->bv_page->flags);
1374
1375 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1376 ret = tree->ops->readpage_end_io_hook(page, start, end);
1377 if (ret)
1378 uptodate = 0;
1379 }
1380 if (uptodate) {
1381 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1382 if (whole_page)
1383 SetPageUptodate(page);
1384 else
1385 check_page_uptodate(tree, page);
1386 } else {
1387 ClearPageUptodate(page);
1388 SetPageError(page);
1389 }
1390
1391 unlock_extent(tree, start, end, GFP_ATOMIC);
1392
1393 if (whole_page)
1394 unlock_page(page);
1395 else
1396 check_page_locked(tree, page);
1397 } while (bvec >= bio->bi_io_vec);
1398
1399 bio_put(bio);
1400#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1401 return 0;
1402#endif
1403}
1404
1405/*
1406 * IO done from prepare_write is pretty simple, we just unlock
1407 * the structs in the extent tree when done, and set the uptodate bits
1408 * as appropriate.
1409 */
1410#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1411static void end_bio_extent_preparewrite(struct bio *bio, int err)
1412#else
1413static int end_bio_extent_preparewrite(struct bio *bio,
1414 unsigned int bytes_done, int err)
1415#endif
1416{
1417 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1418 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1419 struct extent_io_tree *tree = bio->bi_private;
1420 u64 start;
1421 u64 end;
1422
1423#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1424 if (bio->bi_size)
1425 return 1;
1426#endif
1427
1428 do {
1429 struct page *page = bvec->bv_page;
1430 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1431 bvec->bv_offset;
1432 end = start + bvec->bv_len - 1;
1433
1434 if (--bvec >= bio->bi_io_vec)
1435 prefetchw(&bvec->bv_page->flags);
1436
1437 if (uptodate) {
1438 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1439 } else {
1440 ClearPageUptodate(page);
1441 SetPageError(page);
1442 }
1443
1444 unlock_extent(tree, start, end, GFP_ATOMIC);
1445
1446 } while (bvec >= bio->bi_io_vec);
1447
1448 bio_put(bio);
1449#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1450 return 0;
1451#endif
1452}
1453
1454static struct bio *
1455extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1456 gfp_t gfp_flags)
1457{
1458 struct bio *bio;
1459
1460 bio = bio_alloc(gfp_flags, nr_vecs);
1461
1462 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1463 while (!bio && (nr_vecs /= 2))
1464 bio = bio_alloc(gfp_flags, nr_vecs);
1465 }
1466
1467 if (bio) {
1468 bio->bi_bdev = bdev;
1469 bio->bi_sector = first_sector;
1470 }
1471 return bio;
1472}
1473
1474static int submit_one_bio(int rw, struct bio *bio)
1475{
1476 u64 maxsector;
1477 int ret = 0;
1478
1479 bio_get(bio);
1480
1481 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1482 if (maxsector < bio->bi_sector) {
1483 printk("sector too large max %Lu got %llu\n", maxsector,
1484 (unsigned long long)bio->bi_sector);
1485 WARN_ON(1);
1486 }
1487
1488 submit_bio(rw, bio);
1489 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1490 ret = -EOPNOTSUPP;
1491 bio_put(bio);
1492 return ret;
1493}
1494
1495static int submit_extent_page(int rw, struct extent_io_tree *tree,
1496 struct page *page, sector_t sector,
1497 size_t size, unsigned long offset,
1498 struct block_device *bdev,
1499 struct bio **bio_ret,
1500 unsigned long max_pages,
1501 bio_end_io_t end_io_func)
1502{
1503 int ret = 0;
1504 struct bio *bio;
1505 int nr;
1506
1507 if (bio_ret && *bio_ret) {
1508 bio = *bio_ret;
1509 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1510 bio_add_page(bio, page, size, offset) < size) {
1511 ret = submit_one_bio(rw, bio);
1512 bio = NULL;
1513 } else {
1514 return 0;
1515 }
1516 }
1517 nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
1518 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1519 if (!bio) {
1520 printk("failed to allocate bio nr %d\n", nr);
1521 }
1522 bio_add_page(bio, page, size, offset);
1523 bio->bi_end_io = end_io_func;
1524 bio->bi_private = tree;
1525 if (bio_ret) {
1526 *bio_ret = bio;
1527 } else {
1528 ret = submit_one_bio(rw, bio);
1529 }
1530
1531 return ret;
1532}
1533
1534void set_page_extent_mapped(struct page *page)
1535{
1536 if (!PagePrivate(page)) {
1537 SetPagePrivate(page);
1538 WARN_ON(!page->mapping->a_ops->invalidatepage);
1539 set_page_private(page, EXTENT_PAGE_PRIVATE);
1540 page_cache_get(page);
1541 }
1542}
1543
1544void set_page_extent_head(struct page *page, unsigned long len)
1545{
1546 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1547}
1548
1549/*
1550 * basic readpage implementation. Locked extent state structs are inserted
1551 * into the tree that are removed when the IO is done (by the end_io
1552 * handlers)
1553 */
1554static int __extent_read_full_page(struct extent_io_tree *tree,
1555 struct page *page,
1556 get_extent_t *get_extent,
1557 struct bio **bio)
1558{
1559 struct inode *inode = page->mapping->host;
1560 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1561 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1562 u64 end;
1563 u64 cur = start;
1564 u64 extent_offset;
1565 u64 last_byte = i_size_read(inode);
1566 u64 block_start;
1567 u64 cur_end;
1568 sector_t sector;
1569 struct extent_map *em;
1570 struct block_device *bdev;
1571 int ret;
1572 int nr = 0;
1573 size_t page_offset = 0;
1574 size_t iosize;
1575 size_t blocksize = inode->i_sb->s_blocksize;
1576
1577 set_page_extent_mapped(page);
1578
1579 end = page_end;
1580 lock_extent(tree, start, end, GFP_NOFS);
1581
1582 while (cur <= end) {
1583 if (cur >= last_byte) {
1584 char *userpage;
1585 iosize = PAGE_CACHE_SIZE - page_offset;
1586 userpage = kmap_atomic(page, KM_USER0);
1587 memset(userpage + page_offset, 0, iosize);
1588 flush_dcache_page(page);
1589 kunmap_atomic(userpage, KM_USER0);
1590 set_extent_uptodate(tree, cur, cur + iosize - 1,
1591 GFP_NOFS);
1592 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1593 break;
1594 }
1595 em = get_extent(inode, page, page_offset, cur,
1596 end - cur + 1, 0);
1597 if (IS_ERR(em) || !em) {
1598 SetPageError(page);
1599 unlock_extent(tree, cur, end, GFP_NOFS);
1600 break;
1601 }
1602
1603 extent_offset = cur - em->start;
1604 BUG_ON(extent_map_end(em) <= cur);
1605 BUG_ON(end < cur);
1606
1607 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1608 cur_end = min(extent_map_end(em) - 1, end);
1609 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1610 sector = (em->block_start + extent_offset) >> 9;
1611 bdev = em->bdev;
1612 block_start = em->block_start;
1613 free_extent_map(em);
1614 em = NULL;
1615
1616 /* we've found a hole, just zero and go on */
1617 if (block_start == EXTENT_MAP_HOLE) {
1618 char *userpage;
1619 userpage = kmap_atomic(page, KM_USER0);
1620 memset(userpage + page_offset, 0, iosize);
1621 flush_dcache_page(page);
1622 kunmap_atomic(userpage, KM_USER0);
1623
1624 set_extent_uptodate(tree, cur, cur + iosize - 1,
1625 GFP_NOFS);
1626 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1627 cur = cur + iosize;
1628 page_offset += iosize;
1629 continue;
1630 }
1631 /* the get_extent function already copied into the page */
1632 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1633 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1634 cur = cur + iosize;
1635 page_offset += iosize;
1636 continue;
1637 }
1638
1639 ret = 0;
1640 if (tree->ops && tree->ops->readpage_io_hook) {
1641 ret = tree->ops->readpage_io_hook(page, cur,
1642 cur + iosize - 1);
1643 }
1644 if (!ret) {
1645 unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1646 nr -= page->index;
1647 ret = submit_extent_page(READ, tree, page,
1648 sector, iosize, page_offset,
1649 bdev, bio, nr,
1650 end_bio_extent_readpage);
1651 }
1652 if (ret)
1653 SetPageError(page);
1654 cur = cur + iosize;
1655 page_offset += iosize;
1656 nr++;
1657 }
1658 if (!nr) {
1659 if (!PageError(page))
1660 SetPageUptodate(page);
1661 unlock_page(page);
1662 }
1663 return 0;
1664}
1665
1666int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1667 get_extent_t *get_extent)
1668{
1669 struct bio *bio = NULL;
1670 int ret;
1671
1672 ret = __extent_read_full_page(tree, page, get_extent, &bio);
1673 if (bio)
1674 submit_one_bio(READ, bio);
1675 return ret;
1676}
1677EXPORT_SYMBOL(extent_read_full_page);
1678
1679/*
1680 * the writepage semantics are similar to regular writepage. extent
1681 * records are inserted to lock ranges in the tree, and as dirty areas
1682 * are found, they are marked writeback. Then the lock bits are removed
1683 * and the end_io handler clears the writeback ranges
1684 */
1685static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1686 void *data)
1687{
1688 struct inode *inode = page->mapping->host;
1689 struct extent_page_data *epd = data;
1690 struct extent_io_tree *tree = epd->tree;
1691 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1692 u64 delalloc_start;
1693 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1694 u64 end;
1695 u64 cur = start;
1696 u64 extent_offset;
1697 u64 last_byte = i_size_read(inode);
1698 u64 block_start;
1699 u64 iosize;
1700 sector_t sector;
1701 struct extent_map *em;
1702 struct block_device *bdev;
1703 int ret;
1704 int nr = 0;
1705 size_t page_offset = 0;
1706 size_t blocksize;
1707 loff_t i_size = i_size_read(inode);
1708 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1709 u64 nr_delalloc;
1710 u64 delalloc_end;
1711
1712 WARN_ON(!PageLocked(page));
1713 if (page->index > end_index) {
1714 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1715 unlock_page(page);
1716 return 0;
1717 }
1718
1719 if (page->index == end_index) {
1720 char *userpage;
1721
1722 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1723
1724 userpage = kmap_atomic(page, KM_USER0);
1725 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1726 flush_dcache_page(page);
1727 kunmap_atomic(userpage, KM_USER0);
1728 }
1729
1730 set_page_extent_mapped(page);
1731
1732 delalloc_start = start;
1733 delalloc_end = 0;
1734 while(delalloc_end < page_end) {
1735 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1736 &delalloc_end,
1737 128 * 1024 * 1024);
1738 if (nr_delalloc == 0) {
1739 delalloc_start = delalloc_end + 1;
1740 continue;
1741 }
1742 tree->ops->fill_delalloc(inode, delalloc_start,
1743 delalloc_end);
1744 clear_extent_bit(tree, delalloc_start,
1745 delalloc_end,
1746 EXTENT_LOCKED | EXTENT_DELALLOC,
1747 1, 0, GFP_NOFS);
1748 delalloc_start = delalloc_end + 1;
1749 }
1750 lock_extent(tree, start, page_end, GFP_NOFS);
1751
1752 end = page_end;
1753 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1754 printk("found delalloc bits after lock_extent\n");
1755 }
1756
1757 if (last_byte <= start) {
1758 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1759 goto done;
1760 }
1761
1762 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1763 blocksize = inode->i_sb->s_blocksize;
1764
1765 while (cur <= end) {
1766 if (cur >= last_byte) {
1767 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1768 break;
1769 }
1770 em = epd->get_extent(inode, page, page_offset, cur,
1771 end - cur + 1, 1);
1772 if (IS_ERR(em) || !em) {
1773 SetPageError(page);
1774 break;
1775 }
1776
1777 extent_offset = cur - em->start;
1778 BUG_ON(extent_map_end(em) <= cur);
1779 BUG_ON(end < cur);
1780 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1781 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1782 sector = (em->block_start + extent_offset) >> 9;
1783 bdev = em->bdev;
1784 block_start = em->block_start;
1785 free_extent_map(em);
1786 em = NULL;
1787
1788 if (block_start == EXTENT_MAP_HOLE ||
1789 block_start == EXTENT_MAP_INLINE) {
1790 clear_extent_dirty(tree, cur,
1791 cur + iosize - 1, GFP_NOFS);
1792 cur = cur + iosize;
1793 page_offset += iosize;
1794 continue;
1795 }
1796
1797 /* leave this out until we have a page_mkwrite call */
1798 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1799 EXTENT_DIRTY, 0)) {
1800 cur = cur + iosize;
1801 page_offset += iosize;
1802 continue;
1803 }
1804 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1805 if (tree->ops && tree->ops->writepage_io_hook) {
1806 ret = tree->ops->writepage_io_hook(page, cur,
1807 cur + iosize - 1);
1808 } else {
1809 ret = 0;
1810 }
1811 if (ret)
1812 SetPageError(page);
1813 else {
1814 unsigned long max_nr = end_index + 1;
1815 set_range_writeback(tree, cur, cur + iosize - 1);
1816 if (!PageWriteback(page)) {
1817 printk("warning page %lu not writeback, "
1818 "cur %llu end %llu\n", page->index,
1819 (unsigned long long)cur,
1820 (unsigned long long)end);
1821 }
1822
1823 ret = submit_extent_page(WRITE, tree, page, sector,
1824 iosize, page_offset, bdev,
1825 &epd->bio, max_nr,
1826 end_bio_extent_writepage);
1827 if (ret)
1828 SetPageError(page);
1829 }
1830 cur = cur + iosize;
1831 page_offset += iosize;
1832 nr++;
1833 }
1834done:
1835 if (nr == 0) {
1836 /* make sure the mapping tag for page dirty gets cleared */
1837 set_page_writeback(page);
1838 end_page_writeback(page);
1839 }
1840 unlock_extent(tree, start, page_end, GFP_NOFS);
1841 unlock_page(page);
1842 return 0;
1843}
1844
1845#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1846
1847/* Taken directly from 2.6.23 for 2.6.18 back port */
1848typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
1849 void *data);
1850
1851/**
1852 * write_cache_pages - walk the list of dirty pages of the given address space
1853 * and write all of them.
1854 * @mapping: address space structure to write
1855 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1856 * @writepage: function called for each page
1857 * @data: data passed to writepage function
1858 *
1859 * If a page is already under I/O, write_cache_pages() skips it, even
1860 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
1861 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
1862 * and msync() need to guarantee that all the data which was dirty at the time
1863 * the call was made get new I/O started against them. If wbc->sync_mode is
1864 * WB_SYNC_ALL then we were called for data integrity and we must wait for
1865 * existing IO to complete.
1866 */
1867static int write_cache_pages(struct address_space *mapping,
1868 struct writeback_control *wbc, writepage_t writepage,
1869 void *data)
1870{
1871 struct backing_dev_info *bdi = mapping->backing_dev_info;
1872 int ret = 0;
1873 int done = 0;
1874 struct pagevec pvec;
1875 int nr_pages;
1876 pgoff_t index;
1877 pgoff_t end; /* Inclusive */
1878 int scanned = 0;
1879 int range_whole = 0;
1880
1881 if (wbc->nonblocking && bdi_write_congested(bdi)) {
1882 wbc->encountered_congestion = 1;
1883 return 0;
1884 }
1885
1886 pagevec_init(&pvec, 0);
1887 if (wbc->range_cyclic) {
1888 index = mapping->writeback_index; /* Start from prev offset */
1889 end = -1;
1890 } else {
1891 index = wbc->range_start >> PAGE_CACHE_SHIFT;
1892 end = wbc->range_end >> PAGE_CACHE_SHIFT;
1893 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1894 range_whole = 1;
1895 scanned = 1;
1896 }
1897retry:
1898 while (!done && (index <= end) &&
1899 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1900 PAGECACHE_TAG_DIRTY,
1901 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
1902 unsigned i;
1903
1904 scanned = 1;
1905 for (i = 0; i < nr_pages; i++) {
1906 struct page *page = pvec.pages[i];
1907
1908 /*
1909 * At this point we hold neither mapping->tree_lock nor
1910 * lock on the page itself: the page may be truncated or
1911 * invalidated (changing page->mapping to NULL), or even
1912 * swizzled back from swapper_space to tmpfs file
1913 * mapping
1914 */
1915 lock_page(page);
1916
1917 if (unlikely(page->mapping != mapping)) {
1918 unlock_page(page);
1919 continue;
1920 }
1921
1922 if (!wbc->range_cyclic && page->index > end) {
1923 done = 1;
1924 unlock_page(page);
1925 continue;
1926 }
1927
1928 if (wbc->sync_mode != WB_SYNC_NONE)
1929 wait_on_page_writeback(page);
1930
1931 if (PageWriteback(page) ||
1932 !clear_page_dirty_for_io(page)) {
1933 unlock_page(page);
1934 continue;
1935 }
1936
1937 ret = (*writepage)(page, wbc, data);
1938
1939 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
1940 unlock_page(page);
1941 ret = 0;
1942 }
1943 if (ret || (--(wbc->nr_to_write) <= 0))
1944 done = 1;
1945 if (wbc->nonblocking && bdi_write_congested(bdi)) {
1946 wbc->encountered_congestion = 1;
1947 done = 1;
1948 }
1949 }
1950 pagevec_release(&pvec);
1951 cond_resched();
1952 }
1953 if (!scanned && !done) {
1954 /*
1955 * We hit the last page and there is more work to be done: wrap
1956 * back to the start of the file
1957 */
1958 scanned = 1;
1959 index = 0;
1960 goto retry;
1961 }
1962 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1963 mapping->writeback_index = index;
1964 return ret;
1965}
1966#endif
1967
1968int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
1969 get_extent_t *get_extent,
1970 struct writeback_control *wbc)
1971{
1972 int ret;
1973 struct address_space *mapping = page->mapping;
1974 struct extent_page_data epd = {
1975 .bio = NULL,
1976 .tree = tree,
1977 .get_extent = get_extent,
1978 };
1979 struct writeback_control wbc_writepages = {
1980 .bdi = wbc->bdi,
1981 .sync_mode = WB_SYNC_NONE,
1982 .older_than_this = NULL,
1983 .nr_to_write = 64,
1984 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
1985 .range_end = (loff_t)-1,
1986 };
1987
1988
1989 ret = __extent_writepage(page, wbc, &epd);
1990
1991 write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
1992 if (epd.bio) {
1993 submit_one_bio(WRITE, epd.bio);
1994 }
1995 return ret;
1996}
1997EXPORT_SYMBOL(extent_write_full_page);
1998
1999
2000int extent_writepages(struct extent_io_tree *tree,
2001 struct address_space *mapping,
2002 get_extent_t *get_extent,
2003 struct writeback_control *wbc)
2004{
2005 int ret = 0;
2006 struct extent_page_data epd = {
2007 .bio = NULL,
2008 .tree = tree,
2009 .get_extent = get_extent,
2010 };
2011
2012 ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2013 if (epd.bio) {
2014 submit_one_bio(WRITE, epd.bio);
2015 }
2016 return ret;
2017}
2018EXPORT_SYMBOL(extent_writepages);
2019
2020int extent_readpages(struct extent_io_tree *tree,
2021 struct address_space *mapping,
2022 struct list_head *pages, unsigned nr_pages,
2023 get_extent_t get_extent)
2024{
2025 struct bio *bio = NULL;
2026 unsigned page_idx;
2027 struct pagevec pvec;
2028
2029 pagevec_init(&pvec, 0);
2030 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2031 struct page *page = list_entry(pages->prev, struct page, lru);
2032
2033 prefetchw(&page->flags);
2034 list_del(&page->lru);
2035 /*
2036 * what we want to do here is call add_to_page_cache_lru,
2037 * but that isn't exported, so we reproduce it here
2038 */
2039 if (!add_to_page_cache(page, mapping,
2040 page->index, GFP_KERNEL)) {
2041
2042 /* open coding of lru_cache_add, also not exported */
2043 page_cache_get(page);
2044 if (!pagevec_add(&pvec, page))
2045 __pagevec_lru_add(&pvec);
2046 __extent_read_full_page(tree, page, get_extent, &bio);
2047 }
2048 page_cache_release(page);
2049 }
2050 if (pagevec_count(&pvec))
2051 __pagevec_lru_add(&pvec);
2052 BUG_ON(!list_empty(pages));
2053 if (bio)
2054 submit_one_bio(READ, bio);
2055 return 0;
2056}
2057EXPORT_SYMBOL(extent_readpages);
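
/*
 * A minimal sketch of a matching ->readpages hook.  As above,
 * example_tree_for() and example_get_extent are hypothetical placeholders
 * for the caller's per-inode tree and mapping callback.
 */
#if 0
static int example_readpages(struct file *file, struct address_space *mapping,
			     struct list_head *pages, unsigned nr_pages)
{
	struct extent_io_tree *tree = example_tree_for(mapping->host);

	return extent_readpages(tree, mapping, pages, nr_pages,
				example_get_extent);
}
#endif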
2058
2059/*
2060 * basic invalidatepage code: it waits on any locked or writeback
2061 * ranges corresponding to the page, and then deletes any extent state
2062 * records from the tree
2063 */
2064int extent_invalidatepage(struct extent_io_tree *tree,
2065 struct page *page, unsigned long offset)
2066{
2067 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2068 u64 end = start + PAGE_CACHE_SIZE - 1;
2069 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2070
2071	start += (offset + blocksize - 1) & ~(blocksize - 1);
2072 if (start > end)
2073 return 0;
2074
2075 lock_extent(tree, start, end, GFP_NOFS);
2076 wait_on_extent_writeback(tree, start, end);
2077 clear_extent_bit(tree, start, end,
2078 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2079 1, 1, GFP_NOFS);
2080 return 0;
2081}
2082EXPORT_SYMBOL(extent_invalidatepage);
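
/*
 * Worked example of the rounding above, assuming a 4K block size: an
 * invalidate starting at offset 1 rounds up to the next block boundary, so
 * start advances by 4096 and the state covering the page's first block is
 * kept, while offset 0 rounds to 0 and the state for the whole page range
 * is cleared.
 */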
2083
2084/*
2085 * simple commit_write call: the page is marked dirty and i_size is
2086 * updated when the write extends past the old end of file
2087 */
2088int extent_commit_write(struct extent_io_tree *tree,
2089 struct inode *inode, struct page *page,
2090 unsigned from, unsigned to)
2091{
2092 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2093
2094 set_page_extent_mapped(page);
2095 set_page_dirty(page);
2096
2097 if (pos > inode->i_size) {
2098 i_size_write(inode, pos);
2099 mark_inode_dirty(inode);
2100 }
2101 return 0;
2102}
2103EXPORT_SYMBOL(extent_commit_write);
2104
2105int extent_prepare_write(struct extent_io_tree *tree,
2106 struct inode *inode, struct page *page,
2107 unsigned from, unsigned to, get_extent_t *get_extent)
2108{
2109 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2110 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2111 u64 block_start;
2112 u64 orig_block_start;
2113 u64 block_end;
2114 u64 cur_end;
2115 struct extent_map *em;
2116 unsigned blocksize = 1 << inode->i_blkbits;
2117 size_t page_offset = 0;
2118 size_t block_off_start;
2119 size_t block_off_end;
2120 int err = 0;
2121 int iocount = 0;
2122 int ret = 0;
2123 int isnew;
2124
2125 set_page_extent_mapped(page);
2126
2127 block_start = (page_start + from) & ~((u64)blocksize - 1);
2128 block_end = (page_start + to - 1) | (blocksize - 1);
2129 orig_block_start = block_start;
2130
2131 lock_extent(tree, page_start, page_end, GFP_NOFS);
2132 while(block_start <= block_end) {
2133 em = get_extent(inode, page, page_offset, block_start,
2134 block_end - block_start + 1, 1);
2135		if (IS_ERR(em) || !em) {
			err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
2136			goto err;
2137		}
2138 cur_end = min(block_end, extent_map_end(em) - 1);
2139 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2140 block_off_end = block_off_start + blocksize;
2141 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2142
2143 if (!PageUptodate(page) && isnew &&
2144 (block_off_end > to || block_off_start < from)) {
2145 void *kaddr;
2146
2147 kaddr = kmap_atomic(page, KM_USER0);
2148 if (block_off_end > to)
2149 memset(kaddr + to, 0, block_off_end - to);
2150 if (block_off_start < from)
2151 memset(kaddr + block_off_start, 0,
2152 from - block_off_start);
2153 flush_dcache_page(page);
2154 kunmap_atomic(kaddr, KM_USER0);
2155 }
2156 if ((em->block_start != EXTENT_MAP_HOLE &&
2157 em->block_start != EXTENT_MAP_INLINE) &&
2158 !isnew && !PageUptodate(page) &&
2159 (block_off_end > to || block_off_start < from) &&
2160 !test_range_bit(tree, block_start, cur_end,
2161 EXTENT_UPTODATE, 1)) {
2162 u64 sector;
2163 u64 extent_offset = block_start - em->start;
2164 size_t iosize;
2165 sector = (em->block_start + extent_offset) >> 9;
2166 iosize = (cur_end - block_start + blocksize) &
2167 ~((u64)blocksize - 1);
2168 /*
2169 * we've already got the extent locked, but we
2170 * need to split the state such that our end_bio
2171 * handler can clear the lock.
2172 */
2173 set_extent_bit(tree, block_start,
2174 block_start + iosize - 1,
2175 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2176 ret = submit_extent_page(READ, tree, page,
2177 sector, iosize, page_offset, em->bdev,
2178 NULL, 1,
2179 end_bio_extent_preparewrite);
2180 iocount++;
2181 block_start = block_start + iosize;
2182 } else {
2183 set_extent_uptodate(tree, block_start, cur_end,
2184 GFP_NOFS);
2185 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2186 block_start = cur_end + 1;
2187 }
2188 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2189 free_extent_map(em);
2190 }
2191 if (iocount) {
2192 wait_extent_bit(tree, orig_block_start,
2193 block_end, EXTENT_LOCKED);
2194 }
2195 check_page_uptodate(tree, page);
2196err:
2197 /* FIXME, zero out newly allocated blocks on error */
2198 return err;
2199}
2200EXPORT_SYMBOL(extent_prepare_write);
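
/*
 * A minimal sketch of the ->prepare_write/->commit_write pairing these two
 * helpers are built for.  example_tree_for() and example_get_extent are
 * hypothetical stand-ins for the caller's per-inode tree and mapping
 * callback.
 */
#if 0
static int example_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	return extent_prepare_write(example_tree_for(inode), inode, page,
				    from, to, example_get_extent);
}

static int example_commit_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	return extent_commit_write(example_tree_for(inode), inode, page,
				   from, to);
}
#endif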
2201
2202/*
2203 * a helper for releasepage. As long as there are no locked extents
2204 * in the range corresponding to the page, both state records and extent
2205 * map records are removed
2206 */
2207int try_release_extent_mapping(struct extent_map_tree *map,
2208 struct extent_io_tree *tree, struct page *page)
2209{
2210 struct extent_map *em;
2211 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2212 u64 end = start + PAGE_CACHE_SIZE - 1;
2213 u64 orig_start = start;
2214 int ret = 1;
2215
2216 while (start <= end) {
2217 spin_lock(&map->lock);
2218 em = lookup_extent_mapping(map, start, end);
2219 if (!em || IS_ERR(em)) {
2220 spin_unlock(&map->lock);
2221 break;
2222 }
2223 if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
2224 EXTENT_LOCKED, 0)) {
2225 remove_extent_mapping(map, em);
2226 /* once for the rb tree */
2227 free_extent_map(em);
2228 }
2229 start = extent_map_end(em);
2230 spin_unlock(&map->lock);
2231
2232 /* once for us */
2233 free_extent_map(em);
2234 }
2235 if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
2236 ret = 0;
2237 else
2238 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2239 1, 1, GFP_NOFS);
2240 return ret;
2241}
2242EXPORT_SYMBOL(try_release_extent_mapping);
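
/*
 * A minimal sketch of a ->releasepage hook built on
 * try_release_extent_mapping().  example_map_for() and example_tree_for()
 * are hypothetical stand-ins for however the caller finds its per-inode
 * extent_map_tree and extent_io_tree.
 */
#if 0
static int example_releasepage(struct page *page, gfp_t gfp)
{
	struct inode *inode = page->mapping->host;

	return try_release_extent_mapping(example_map_for(inode),
					  example_tree_for(inode), page);
}
#endif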
2243
2244sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2245 get_extent_t *get_extent)
2246{
2247 struct inode *inode = mapping->host;
2248	u64 start = (u64)iblock << inode->i_blkbits;
2249 sector_t sector = 0;
2250 struct extent_map *em;
2251
2252 em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2253 if (!em || IS_ERR(em))
2254 return 0;
2255
2256 if (em->block_start == EXTENT_MAP_INLINE ||
2257 em->block_start == EXTENT_MAP_HOLE)
2258 goto out;
2259
2260 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2261	printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start);
2262out:
2263 free_extent_map(em);
2264 return sector;
2265}
2266
2267static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
2268{
2269 if (list_empty(&eb->lru)) {
2270 extent_buffer_get(eb);
2271 list_add(&eb->lru, &tree->buffer_lru);
2272 tree->lru_size++;
2273 if (tree->lru_size >= BUFFER_LRU_MAX) {
2274 struct extent_buffer *rm;
2275 rm = list_entry(tree->buffer_lru.prev,
2276 struct extent_buffer, lru);
2277 tree->lru_size--;
2278 list_del_init(&rm->lru);
2279 free_extent_buffer(rm);
2280 }
2281 } else
2282 list_move(&eb->lru, &tree->buffer_lru);
2283 return 0;
2284}
2285static struct extent_buffer *find_lru(struct extent_io_tree *tree,
2286 u64 start, unsigned long len)
2287{
2288 struct list_head *lru = &tree->buffer_lru;
2289 struct list_head *cur = lru->next;
2290 struct extent_buffer *eb;
2291
2292 if (list_empty(lru))
2293 return NULL;
2294
2295 do {
2296 eb = list_entry(cur, struct extent_buffer, lru);
2297 if (eb->start == start && eb->len == len) {
2298 extent_buffer_get(eb);
2299 return eb;
2300 }
2301 cur = cur->next;
2302 } while (cur != lru);
2303 return NULL;
2304}
2305
2306static inline unsigned long num_extent_pages(u64 start, u64 len)
2307{
2308 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2309 (start >> PAGE_CACHE_SHIFT);
2310}
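
/*
 * Worked example, assuming 4K pages: a 4K buffer starting at 0x1f800 is not
 * page aligned, so it straddles two pages:
 *	((0x1f800 + 0x1000 + 0xfff) >> 12) - (0x1f800 >> 12) = 0x21 - 0x1f = 2
 * The same 4K buffer starting on a page boundary needs only one page.
 */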
2311
2312static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2313 unsigned long i)
2314{
2315 struct page *p;
2316 struct address_space *mapping;
2317
2318 if (i == 0)
2319 return eb->first_page;
2320 i += eb->start >> PAGE_CACHE_SHIFT;
2321 mapping = eb->first_page->mapping;
2322 read_lock_irq(&mapping->tree_lock);
2323 p = radix_tree_lookup(&mapping->page_tree, i);
2324 read_unlock_irq(&mapping->tree_lock);
2325 return p;
2326}
2327
2328static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2329 u64 start,
2330 unsigned long len,
2331 gfp_t mask)
2332{
2333 struct extent_buffer *eb = NULL;
2334
2335 spin_lock(&tree->lru_lock);
2336 eb = find_lru(tree, start, len);
2337 spin_unlock(&tree->lru_lock);
2338 if (eb) {
2339 return eb;
2340 }
2341
2342	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2343	INIT_LIST_HEAD(&eb->lru);
2344 eb->start = start;
2345 eb->len = len;
2346 atomic_set(&eb->refs, 1);
2347
2348 return eb;
2349}
2350
2351static void __free_extent_buffer(struct extent_buffer *eb)
2352{
2353 kmem_cache_free(extent_buffer_cache, eb);
2354}
2355
2356struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2357 u64 start, unsigned long len,
2358 struct page *page0,
2359 gfp_t mask)
2360{
2361 unsigned long num_pages = num_extent_pages(start, len);
2362 unsigned long i;
2363 unsigned long index = start >> PAGE_CACHE_SHIFT;
2364 struct extent_buffer *eb;
2365 struct page *p;
2366 struct address_space *mapping = tree->mapping;
2367 int uptodate = 1;
2368
2369 eb = __alloc_extent_buffer(tree, start, len, mask);
2370 if (!eb || IS_ERR(eb))
2371 return NULL;
2372
2373 if (eb->flags & EXTENT_BUFFER_FILLED)
2374 goto lru_add;
2375
2376 if (page0) {
2377 eb->first_page = page0;
2378 i = 1;
2379 index++;
2380 page_cache_get(page0);
2381 mark_page_accessed(page0);
2382 set_page_extent_mapped(page0);
2383 WARN_ON(!PageUptodate(page0));
2384 set_page_extent_head(page0, len);
2385 } else {
2386 i = 0;
2387 }
2388 for (; i < num_pages; i++, index++) {
2389 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2390 if (!p) {
2391 WARN_ON(1);
2392 goto fail;
2393 }
2394 set_page_extent_mapped(p);
2395 mark_page_accessed(p);
2396 if (i == 0) {
2397 eb->first_page = p;
2398 set_page_extent_head(p, len);
2399 } else {
2400 set_page_private(p, EXTENT_PAGE_PRIVATE);
2401 }
2402 if (!PageUptodate(p))
2403 uptodate = 0;
2404 unlock_page(p);
2405 }
2406 if (uptodate)
2407 eb->flags |= EXTENT_UPTODATE;
2408 eb->flags |= EXTENT_BUFFER_FILLED;
2409
2410lru_add:
2411 spin_lock(&tree->lru_lock);
2412 add_lru(tree, eb);
2413 spin_unlock(&tree->lru_lock);
2414 return eb;
2415
2416fail:
2417 spin_lock(&tree->lru_lock);
2418 list_del_init(&eb->lru);
2419 spin_unlock(&tree->lru_lock);
2420 if (!atomic_dec_and_test(&eb->refs))
2421 return NULL;
2422 for (index = 1; index < i; index++) {
2423 page_cache_release(extent_buffer_page(eb, index));
2424 }
2425 if (i > 0)
2426 page_cache_release(extent_buffer_page(eb, 0));
2427 __free_extent_buffer(eb);
2428 return NULL;
2429}
2430EXPORT_SYMBOL(alloc_extent_buffer);
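
/*
 * A minimal sketch of the extent_buffer life cycle: look the buffer up (or
 * create it), make sure its pages are read in, use it, then drop the
 * reference taken by alloc_extent_buffer().  tree, start and len are
 * hypothetical values supplied by the caller.
 */
#if 0
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
	if (eb && read_extent_buffer_pages(tree, eb, 0, 1) == 0) {
		/* eb's pages are now up to date and safe to read */
	}
	free_extent_buffer(eb);		/* NULL is tolerated */
#endif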
2431
2432struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2433 u64 start, unsigned long len,
2434 gfp_t mask)
2435{
2436 unsigned long num_pages = num_extent_pages(start, len);
2437 unsigned long i;
2438 unsigned long index = start >> PAGE_CACHE_SHIFT;
2439 struct extent_buffer *eb;
2440 struct page *p;
2441 struct address_space *mapping = tree->mapping;
2442 int uptodate = 1;
2443
2444 eb = __alloc_extent_buffer(tree, start, len, mask);
2445 if (!eb || IS_ERR(eb))
2446 return NULL;
2447
2448 if (eb->flags & EXTENT_BUFFER_FILLED)
2449 goto lru_add;
2450
2451 for (i = 0; i < num_pages; i++, index++) {
2452 p = find_lock_page(mapping, index);
2453 if (!p) {
2454 goto fail;
2455 }
2456 set_page_extent_mapped(p);
2457 mark_page_accessed(p);
2458
2459 if (i == 0) {
2460 eb->first_page = p;
2461 set_page_extent_head(p, len);
2462 } else {
2463 set_page_private(p, EXTENT_PAGE_PRIVATE);
2464 }
2465
2466 if (!PageUptodate(p))
2467 uptodate = 0;
2468 unlock_page(p);
2469 }
2470 if (uptodate)
2471 eb->flags |= EXTENT_UPTODATE;
2472 eb->flags |= EXTENT_BUFFER_FILLED;
2473
2474lru_add:
2475 spin_lock(&tree->lru_lock);
2476 add_lru(tree, eb);
2477 spin_unlock(&tree->lru_lock);
2478 return eb;
2479fail:
2480 spin_lock(&tree->lru_lock);
2481 list_del_init(&eb->lru);
2482 spin_unlock(&tree->lru_lock);
2483 if (!atomic_dec_and_test(&eb->refs))
2484 return NULL;
2485 for (index = 1; index < i; index++) {
2486 page_cache_release(extent_buffer_page(eb, index));
2487 }
2488 if (i > 0)
2489 page_cache_release(extent_buffer_page(eb, 0));
2490 __free_extent_buffer(eb);
2491 return NULL;
2492}
2493EXPORT_SYMBOL(find_extent_buffer);
2494
2495void free_extent_buffer(struct extent_buffer *eb)
2496{
2497 unsigned long i;
2498 unsigned long num_pages;
2499
2500 if (!eb)
2501 return;
2502
2503 if (!atomic_dec_and_test(&eb->refs))
2504 return;
2505
2506 WARN_ON(!list_empty(&eb->lru));
2507 num_pages = num_extent_pages(eb->start, eb->len);
2508
2509 for (i = 1; i < num_pages; i++) {
2510 page_cache_release(extent_buffer_page(eb, i));
2511 }
2512 page_cache_release(extent_buffer_page(eb, 0));
2513 __free_extent_buffer(eb);
2514}
2515EXPORT_SYMBOL(free_extent_buffer);
2516
2517int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2518 struct extent_buffer *eb)
2519{
2520 int set;
2521 unsigned long i;
2522 unsigned long num_pages;
2523 struct page *page;
2524
2525 u64 start = eb->start;
2526 u64 end = start + eb->len - 1;
2527
2528 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2529 num_pages = num_extent_pages(eb->start, eb->len);
2530
2531 for (i = 0; i < num_pages; i++) {
2532 page = extent_buffer_page(eb, i);
2533 lock_page(page);
2534 if (i == 0)
2535 set_page_extent_head(page, eb->len);
2536 else
2537 set_page_private(page, EXTENT_PAGE_PRIVATE);
2538
2539 /*
2540 * if we're on the last page or the first page and the
2541 * block isn't aligned on a page boundary, do extra checks
2542		 * to make sure we don't clean a page that is partially dirty
2543 */
2544 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2545 ((i == num_pages - 1) &&
2546 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2547 start = (u64)page->index << PAGE_CACHE_SHIFT;
2548 end = start + PAGE_CACHE_SIZE - 1;
2549 if (test_range_bit(tree, start, end,
2550 EXTENT_DIRTY, 0)) {
2551 unlock_page(page);
2552 continue;
2553 }
2554 }
2555 clear_page_dirty_for_io(page);
2556 write_lock_irq(&page->mapping->tree_lock);
2557 if (!PageDirty(page)) {
2558 radix_tree_tag_clear(&page->mapping->page_tree,
2559 page_index(page),
2560 PAGECACHE_TAG_DIRTY);
2561 }
2562 write_unlock_irq(&page->mapping->tree_lock);
2563 unlock_page(page);
2564 }
2565 return 0;
2566}
2567EXPORT_SYMBOL(clear_extent_buffer_dirty);
2568
2569int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2570 struct extent_buffer *eb)
2571{
2572 return wait_on_extent_writeback(tree, eb->start,
2573 eb->start + eb->len - 1);
2574}
2575EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2576
2577int set_extent_buffer_dirty(struct extent_io_tree *tree,
2578 struct extent_buffer *eb)
2579{
2580 unsigned long i;
2581 unsigned long num_pages;
2582
2583 num_pages = num_extent_pages(eb->start, eb->len);
2584 for (i = 0; i < num_pages; i++) {
2585 struct page *page = extent_buffer_page(eb, i);
2586 /* writepage may need to do something special for the
2587		 * first page, so we have to make sure page->private is
2588 * properly set. releasepage may drop page->private
2589 * on us if the page isn't already dirty.
2590 */
2591 if (i == 0) {
2592 lock_page(page);
2593 set_page_extent_head(page, eb->len);
2594 } else if (PagePrivate(page) &&
2595 page->private != EXTENT_PAGE_PRIVATE) {
2596 lock_page(page);
2597 set_page_extent_mapped(page);
2598 unlock_page(page);
2599 }
2600 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2601 if (i == 0)
2602 unlock_page(page);
2603 }
2604 return set_extent_dirty(tree, eb->start,
2605 eb->start + eb->len - 1, GFP_NOFS);
2606}
2607EXPORT_SYMBOL(set_extent_buffer_dirty);
2608
2609int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2610 struct extent_buffer *eb)
2611{
2612 unsigned long i;
2613 struct page *page;
2614 unsigned long num_pages;
2615
2616 num_pages = num_extent_pages(eb->start, eb->len);
2617
2618 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2619 GFP_NOFS);
2620 for (i = 0; i < num_pages; i++) {
2621 page = extent_buffer_page(eb, i);
2622 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2623 ((i == num_pages - 1) &&
2624 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2625 check_page_uptodate(tree, page);
2626 continue;
2627 }
2628 SetPageUptodate(page);
2629 }
2630 return 0;
2631}
2632EXPORT_SYMBOL(set_extent_buffer_uptodate);
2633
2634int extent_buffer_uptodate(struct extent_io_tree *tree,
2635 struct extent_buffer *eb)
2636{
2637 if (eb->flags & EXTENT_UPTODATE)
2638 return 1;
2639 return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2640 EXTENT_UPTODATE, 1);
2641}
2642EXPORT_SYMBOL(extent_buffer_uptodate);
2643
2644int read_extent_buffer_pages(struct extent_io_tree *tree,
2645 struct extent_buffer *eb,
2646 u64 start,
2647 int wait)
2648{
2649 unsigned long i;
2650 unsigned long start_i;
2651 struct page *page;
2652 int err;
2653 int ret = 0;
2654 unsigned long num_pages;
2655
2656 if (eb->flags & EXTENT_UPTODATE)
2657 return 0;
2658
2659 if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2660 EXTENT_UPTODATE, 1)) {
2661 return 0;
2662 }
2663
2664 if (start) {
2665 WARN_ON(start < eb->start);
2666 start_i = (start >> PAGE_CACHE_SHIFT) -
2667 (eb->start >> PAGE_CACHE_SHIFT);
2668 } else {
2669 start_i = 0;
2670 }
2671
2672 num_pages = num_extent_pages(eb->start, eb->len);
2673 for (i = start_i; i < num_pages; i++) {
2674 page = extent_buffer_page(eb, i);
2675 if (PageUptodate(page)) {
2676 continue;
2677 }
2678 if (!wait) {
2679 if (TestSetPageLocked(page)) {
2680 continue;
2681 }
2682 } else {
2683 lock_page(page);
2684 }
2685 if (!PageUptodate(page)) {
2686 err = page->mapping->a_ops->readpage(NULL, page);
2687 if (err) {
2688 ret = err;
2689 }
2690 } else {
2691 unlock_page(page);
2692 }
2693 }
2694
2695 if (ret || !wait) {
2696 return ret;
2697 }
2698
2699 for (i = start_i; i < num_pages; i++) {
2700 page = extent_buffer_page(eb, i);
2701 wait_on_page_locked(page);
2702 if (!PageUptodate(page)) {
2703 ret = -EIO;
2704 }
2705 }
2706 if (!ret)
2707 eb->flags |= EXTENT_UPTODATE;
2708 return ret;
2709}
2710EXPORT_SYMBOL(read_extent_buffer_pages);
2711
2712void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2713 unsigned long start,
2714 unsigned long len)
2715{
2716 size_t cur;
2717 size_t offset;
2718 struct page *page;
2719 char *kaddr;
2720 char *dst = (char *)dstv;
2721 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2722 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2723 unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2724
2725 WARN_ON(start > eb->len);
2726 WARN_ON(start + len > eb->start + eb->len);
2727
2728 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2729
2730 while(len > 0) {
2731 page = extent_buffer_page(eb, i);
2732 if (!PageUptodate(page)) {
2733 printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
2734 WARN_ON(1);
2735 }
2736 WARN_ON(!PageUptodate(page));
2737
2738 cur = min(len, (PAGE_CACHE_SIZE - offset));
2739 kaddr = kmap_atomic(page, KM_USER1);
2740 memcpy(dst, kaddr + offset, cur);
2741 kunmap_atomic(kaddr, KM_USER1);
2742
2743 dst += cur;
2744 len -= cur;
2745 offset = 0;
2746 i++;
2747 }
2748}
2749EXPORT_SYMBOL(read_extent_buffer);
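
/*
 * A minimal usage sketch: copy an 8 byte field out of an extent buffer into
 * a local variable.  eb and field_offset are hypothetical values supplied
 * by the caller.
 */
#if 0
	u64 field;

	read_extent_buffer(eb, &field, field_offset, sizeof(field));
#endif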
2750
2751int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2752 unsigned long min_len, char **token, char **map,
2753 unsigned long *map_start,
2754 unsigned long *map_len, int km)
2755{
2756 size_t offset = start & (PAGE_CACHE_SIZE - 1);
2757 char *kaddr;
2758 struct page *p;
2759 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2760 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2761 unsigned long end_i = (start_offset + start + min_len - 1) >>
2762 PAGE_CACHE_SHIFT;
2763
2764 if (i != end_i)
2765 return -EINVAL;
2766
2767 if (i == 0) {
2768 offset = start_offset;
2769 *map_start = 0;
2770 } else {
2771 offset = 0;
2772 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2773 }
2774 if (start + min_len > eb->len) {
2775		printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
2776 WARN_ON(1);
2777 }
2778
2779 p = extent_buffer_page(eb, i);
2780 WARN_ON(!PageUptodate(p));
2781 kaddr = kmap_atomic(p, km);
2782 *token = kaddr;
2783 *map = kaddr + offset;
2784 *map_len = PAGE_CACHE_SIZE - offset;
2785 return 0;
2786}
2787EXPORT_SYMBOL(map_private_extent_buffer);
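
/*
 * Worked example of the mapping math above, assuming 4K pages: for a buffer
 * at eb->start = 0x1f800 (start_offset = 0x800), a request for start =
 * 0x900, min_len = 8 gives i = end_i = 1, so the range fits in one page;
 * *map_start becomes 0x1000 - 0x800 = 0x800 and *map points at the start of
 * page 1, which backs buffer offsets 0x800 and up.  A request that crosses
 * a page boundary fails with -EINVAL instead.
 */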
2788
2789int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2790 unsigned long min_len,
2791 char **token, char **map,
2792 unsigned long *map_start,
2793 unsigned long *map_len, int km)
2794{
2795 int err;
2796 int save = 0;
2797 if (eb->map_token) {
2798 unmap_extent_buffer(eb, eb->map_token, km);
2799 eb->map_token = NULL;
2800 save = 1;
2801 }
2802 err = map_private_extent_buffer(eb, start, min_len, token, map,
2803 map_start, map_len, km);
2804 if (!err && save) {
2805 eb->map_token = *token;
2806 eb->kaddr = *map;
2807 eb->map_start = *map_start;
2808 eb->map_len = *map_len;
2809 }
2810 return err;
2811}
2812EXPORT_SYMBOL(map_extent_buffer);
2813
2814void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2815{
2816 kunmap_atomic(token, km);
2817}
2818EXPORT_SYMBOL(unmap_extent_buffer);
2819
2820int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2821 unsigned long start,
2822 unsigned long len)
2823{
2824 size_t cur;
2825 size_t offset;
2826 struct page *page;
2827 char *kaddr;
2828 char *ptr = (char *)ptrv;
2829 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2830 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2831 int ret = 0;
2832
2833 WARN_ON(start > eb->len);
2834 WARN_ON(start + len > eb->start + eb->len);
2835
2836 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2837
2838 while(len > 0) {
2839 page = extent_buffer_page(eb, i);
2840 WARN_ON(!PageUptodate(page));
2841
2842 cur = min(len, (PAGE_CACHE_SIZE - offset));
2843
2844 kaddr = kmap_atomic(page, KM_USER0);
2845 ret = memcmp(ptr, kaddr + offset, cur);
2846 kunmap_atomic(kaddr, KM_USER0);
2847 if (ret)
2848 break;
2849
2850 ptr += cur;
2851 len -= cur;
2852 offset = 0;
2853 i++;
2854 }
2855 return ret;
2856}
2857EXPORT_SYMBOL(memcmp_extent_buffer);
2858
2859void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
2860 unsigned long start, unsigned long len)
2861{
2862 size_t cur;
2863 size_t offset;
2864 struct page *page;
2865 char *kaddr;
2866 char *src = (char *)srcv;
2867 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2868 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2869
2870 WARN_ON(start > eb->len);
2871 WARN_ON(start + len > eb->start + eb->len);
2872
2873 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2874
2875 while(len > 0) {
2876 page = extent_buffer_page(eb, i);
2877 WARN_ON(!PageUptodate(page));
2878
2879 cur = min(len, PAGE_CACHE_SIZE - offset);
2880 kaddr = kmap_atomic(page, KM_USER1);
2881 memcpy(kaddr + offset, src, cur);
2882 kunmap_atomic(kaddr, KM_USER1);
2883
2884 src += cur;
2885 len -= cur;
2886 offset = 0;
2887 i++;
2888 }
2889}
2890EXPORT_SYMBOL(write_extent_buffer);
2891
2892void memset_extent_buffer(struct extent_buffer *eb, char c,
2893 unsigned long start, unsigned long len)
2894{
2895 size_t cur;
2896 size_t offset;
2897 struct page *page;
2898 char *kaddr;
2899 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2900 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2901
2902 WARN_ON(start > eb->len);
2903 WARN_ON(start + len > eb->start + eb->len);
2904
2905 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2906
2907 while(len > 0) {
2908 page = extent_buffer_page(eb, i);
2909 WARN_ON(!PageUptodate(page));
2910
2911 cur = min(len, PAGE_CACHE_SIZE - offset);
2912 kaddr = kmap_atomic(page, KM_USER0);
2913 memset(kaddr + offset, c, cur);
2914 kunmap_atomic(kaddr, KM_USER0);
2915
2916 len -= cur;
2917 offset = 0;
2918 i++;
2919 }
2920}
2921EXPORT_SYMBOL(memset_extent_buffer);
2922
2923void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
2924 unsigned long dst_offset, unsigned long src_offset,
2925 unsigned long len)
2926{
2927 u64 dst_len = dst->len;
2928 size_t cur;
2929 size_t offset;
2930 struct page *page;
2931 char *kaddr;
2932 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2933 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2934
2935 WARN_ON(src->len != dst_len);
2936
2937 offset = (start_offset + dst_offset) &
2938 ((unsigned long)PAGE_CACHE_SIZE - 1);
2939
2940 while(len > 0) {
2941 page = extent_buffer_page(dst, i);
2942 WARN_ON(!PageUptodate(page));
2943
2944 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
2945
2946 kaddr = kmap_atomic(page, KM_USER0);
2947 read_extent_buffer(src, kaddr + offset, src_offset, cur);
2948 kunmap_atomic(kaddr, KM_USER0);
2949
2950 src_offset += cur;
2951 len -= cur;
2952 offset = 0;
2953 i++;
2954 }
2955}
2956EXPORT_SYMBOL(copy_extent_buffer);
2957
2958static void move_pages(struct page *dst_page, struct page *src_page,
2959 unsigned long dst_off, unsigned long src_off,
2960 unsigned long len)
2961{
2962 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2963 if (dst_page == src_page) {
2964 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
2965 } else {
2966 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
2967 char *p = dst_kaddr + dst_off + len;
2968 char *s = src_kaddr + src_off + len;
2969
2970 while (len--)
2971 *--p = *--s;
2972
2973 kunmap_atomic(src_kaddr, KM_USER1);
2974 }
2975 kunmap_atomic(dst_kaddr, KM_USER0);
2976}
2977
2978static void copy_pages(struct page *dst_page, struct page *src_page,
2979 unsigned long dst_off, unsigned long src_off,
2980 unsigned long len)
2981{
2982 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2983 char *src_kaddr;
2984
2985 if (dst_page != src_page)
2986 src_kaddr = kmap_atomic(src_page, KM_USER1);
2987 else
2988 src_kaddr = dst_kaddr;
2989
2990 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
2991 kunmap_atomic(dst_kaddr, KM_USER0);
2992 if (dst_page != src_page)
2993 kunmap_atomic(src_kaddr, KM_USER1);
2994}
2995
2996void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2997 unsigned long src_offset, unsigned long len)
2998{
2999 size_t cur;
3000 size_t dst_off_in_page;
3001 size_t src_off_in_page;
3002 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3003 unsigned long dst_i;
3004 unsigned long src_i;
3005
3006 if (src_offset + len > dst->len) {
3007		printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
3008 src_offset, len, dst->len);
3009 BUG_ON(1);
3010 }
3011 if (dst_offset + len > dst->len) {
3012		printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
3013 dst_offset, len, dst->len);
3014 BUG_ON(1);
3015 }
3016
3017 while(len > 0) {
3018 dst_off_in_page = (start_offset + dst_offset) &
3019 ((unsigned long)PAGE_CACHE_SIZE - 1);
3020 src_off_in_page = (start_offset + src_offset) &
3021 ((unsigned long)PAGE_CACHE_SIZE - 1);
3022
3023 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3024 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3025
3026 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3027 src_off_in_page));
3028 cur = min_t(unsigned long, cur,
3029 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3030
3031 copy_pages(extent_buffer_page(dst, dst_i),
3032 extent_buffer_page(dst, src_i),
3033 dst_off_in_page, src_off_in_page, cur);
3034
3035 src_offset += cur;
3036 dst_offset += cur;
3037 len -= cur;
3038 }
3039}
3040EXPORT_SYMBOL(memcpy_extent_buffer);
3041
3042void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3043 unsigned long src_offset, unsigned long len)
3044{
3045 size_t cur;
3046 size_t dst_off_in_page;
3047 size_t src_off_in_page;
3048 unsigned long dst_end = dst_offset + len - 1;
3049 unsigned long src_end = src_offset + len - 1;
3050 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3051 unsigned long dst_i;
3052 unsigned long src_i;
3053
3054 if (src_offset + len > dst->len) {
3055 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3056 src_offset, len, dst->len);
3057 BUG_ON(1);
3058 }
3059 if (dst_offset + len > dst->len) {
3060 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3061 dst_offset, len, dst->len);
3062 BUG_ON(1);
3063 }
3064 if (dst_offset < src_offset) {
3065 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3066 return;
3067 }
3068 while(len > 0) {
3069 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3070 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3071
3072 dst_off_in_page = (start_offset + dst_end) &
3073 ((unsigned long)PAGE_CACHE_SIZE - 1);
3074 src_off_in_page = (start_offset + src_end) &
3075 ((unsigned long)PAGE_CACHE_SIZE - 1);
3076
3077 cur = min_t(unsigned long, len, src_off_in_page + 1);
3078 cur = min(cur, dst_off_in_page + 1);
3079 move_pages(extent_buffer_page(dst, dst_i),
3080 extent_buffer_page(dst, src_i),
3081 dst_off_in_page - cur + 1,
3082 src_off_in_page - cur + 1, cur);
3083
3084 dst_end -= cur;
3085 src_end -= cur;
3086 len -= cur;
3087 }
3088}
3089EXPORT_SYMBOL(memmove_extent_buffer);
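
/*
 * The forward/backward split above mirrors memmove(): when the destination
 * starts below the source the ranges can be copied front to back via
 * memcpy_extent_buffer(), otherwise the loop walks from the tail of the
 * range toward the head so overlapping bytes are read before they are
 * overwritten.
 */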