author    Li Zefan <lizf@cn.fujitsu.com>  2012-01-10 20:54:49 -0500
committer Li Zefan <lizf@cn.fujitsu.com>  2012-01-10 20:54:49 -0500
commit    d25223a0d22f7ec4203ec285dc6e51f696591ba3 (patch)
tree      f54428e64f692edfa5bf75f8eb301329e32a895f /fs
parent    396e6e49c58bb23d1814d3c240c736c9f01523c5 (diff)
parent    08c422c27f855d27b0b3d9fa30ebd938d4ae6f1f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs into for-linus
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Makefile           |    3
-rw-r--r--  fs/btrfs/acl.c              |   17
-rw-r--r--  fs/btrfs/async-thread.c     |  120
-rw-r--r--  fs/btrfs/async-thread.h     |    4
-rw-r--r--  fs/btrfs/backref.c          |  776
-rw-r--r--  fs/btrfs/backref.h          |   62
-rw-r--r--  fs/btrfs/btrfs_inode.h      |   21
-rw-r--r--  fs/btrfs/compression.c      |    3
-rw-r--r--  fs/btrfs/ctree.c            |   27
-rw-r--r--  fs/btrfs/ctree.h            |  209
-rw-r--r--  fs/btrfs/delayed-inode.c    |  108
-rw-r--r--  fs/btrfs/disk-io.c          |  662
-rw-r--r--  fs/btrfs/disk-io.h          |    4
-rw-r--r--  fs/btrfs/extent-tree.c      | 1126
-rw-r--r--  fs/btrfs/extent_io.c        |  640
-rw-r--r--  fs/btrfs/extent_io.h        |   23
-rw-r--r--  fs/btrfs/file-item.c        |   17
-rw-r--r--  fs/btrfs/file.c             |   31
-rw-r--r--  fs/btrfs/free-space-cache.c |  994
-rw-r--r--  fs/btrfs/inode-map.c        |   34
-rw-r--r--  fs/btrfs/inode.c            |  732
-rw-r--r--  fs/btrfs/ioctl.c            |  244
-rw-r--r--  fs/btrfs/ioctl.h            |   29
-rw-r--r--  fs/btrfs/print-tree.c       |    8
-rw-r--r--  fs/btrfs/reada.c            |  951
-rw-r--r--  fs/btrfs/relocation.c       |   28
-rw-r--r--  fs/btrfs/scrub.c            |  668
-rw-r--r--  fs/btrfs/super.c            |  373
-rw-r--r--  fs/btrfs/transaction.c      |  156
-rw-r--r--  fs/btrfs/tree-log.c         |   19
-rw-r--r--  fs/btrfs/volumes.c          |  222
-rw-r--r--  fs/btrfs/volumes.h          |   24
-rw-r--r--  fs/btrfs/xattr.c            |   11
33 files changed, 6337 insertions(+), 2009 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
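The net effect of the acl.c hunk above: the xattr value buffer is now freed exactly once on every path, and the ACL cache is populated from a single place that covers both a parsed ACL and the legitimate NULL result. A condensed sketch of the resulting flow (locals as in btrfs_get_acl):

	if (size > 0) {
		acl = posix_acl_from_xattr(value, size);
	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
		acl = NULL;
	} else {
		acl = ERR_PTR(-EIO);
	}
	kfree(value);			/* freed once, on all outcomes */

	if (!IS_ERR(acl))		/* true for a parsed ACL and for NULL */
		set_cached_acl(inode, type, acl);
	return acl;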
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..0b394580d860 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
64 int idle; 64 int idle;
65}; 65};
66 66
67static int __btrfs_start_workers(struct btrfs_workers *workers);
68
67/* 69/*
68 * btrfs_start_workers uses kthread_run, which can block waiting for memory 70 * btrfs_start_workers uses kthread_run, which can block waiting for memory
69 * for a very long time. It will actually throttle on page writeback, 71 * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
88{ 90{
89 struct worker_start *start; 91 struct worker_start *start;
90 start = container_of(work, struct worker_start, work); 92 start = container_of(work, struct worker_start, work);
91 btrfs_start_workers(start->queue, 1); 93 __btrfs_start_workers(start->queue);
92 kfree(start); 94 kfree(start);
93} 95}
94 96
95static int start_new_worker(struct btrfs_workers *queue)
96{
97 struct worker_start *start;
98 int ret;
99
100 start = kzalloc(sizeof(*start), GFP_NOFS);
101 if (!start)
102 return -ENOMEM;
103
104 start->work.func = start_new_worker_func;
105 start->queue = queue;
106 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
107 if (ret)
108 kfree(start);
109 return ret;
110}
111
112/* 97/*
113 * helper function to move a thread onto the idle list after it 98 * helper function to move a thread onto the idle list after it
114 * has finished some requests. 99 * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
153static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 138static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
154{ 139{
155 struct btrfs_workers *workers = worker->workers; 140 struct btrfs_workers *workers = worker->workers;
141 struct worker_start *start;
156 unsigned long flags; 142 unsigned long flags;
157 143
158 rmb(); 144 rmb();
159 if (!workers->atomic_start_pending) 145 if (!workers->atomic_start_pending)
160 return; 146 return;
161 147
148 start = kzalloc(sizeof(*start), GFP_NOFS);
149 if (!start)
150 return;
151
152 start->work.func = start_new_worker_func;
153 start->queue = workers;
154
162 spin_lock_irqsave(&workers->lock, flags); 155 spin_lock_irqsave(&workers->lock, flags);
163 if (!workers->atomic_start_pending) 156 if (!workers->atomic_start_pending)
164 goto out; 157 goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
170 163
171 workers->num_workers_starting += 1; 164 workers->num_workers_starting += 1;
172 spin_unlock_irqrestore(&workers->lock, flags); 165 spin_unlock_irqrestore(&workers->lock, flags);
173 start_new_worker(workers); 166 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
174 return; 167 return;
175 168
176out: 169out:
170 kfree(start);
177 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
178} 172}
179 173
@@ -331,7 +325,7 @@ again:
331 run_ordered_completions(worker->workers, work); 325 run_ordered_completions(worker->workers, work);
332 326
333 check_pending_worker_creates(worker); 327 check_pending_worker_creates(worker);
334 328 cond_resched();
335 } 329 }
336 330
337 spin_lock_irq(&worker->lock); 331 spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
462 * starts new worker threads. This does not enforce the max worker 456 * starts new worker threads. This does not enforce the max worker
463 * count in case you need to temporarily go past it. 457 * count in case you need to temporarily go past it.
464 */ 458 */
465static int __btrfs_start_workers(struct btrfs_workers *workers, 459static int __btrfs_start_workers(struct btrfs_workers *workers)
466 int num_workers)
467{ 460{
468 struct btrfs_worker_thread *worker; 461 struct btrfs_worker_thread *worker;
469 int ret = 0; 462 int ret = 0;
470 int i;
471 463
472 for (i = 0; i < num_workers; i++) { 464 worker = kzalloc(sizeof(*worker), GFP_NOFS);
473 worker = kzalloc(sizeof(*worker), GFP_NOFS); 465 if (!worker) {
474 if (!worker) { 466 ret = -ENOMEM;
475 ret = -ENOMEM; 467 goto fail;
476 goto fail; 468 }
477 }
478 469
479 INIT_LIST_HEAD(&worker->pending); 470 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending); 471 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list); 472 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock); 473 spin_lock_init(&worker->lock);
483 474
484 atomic_set(&worker->num_pending, 0); 475 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1); 476 atomic_set(&worker->refs, 1);
486 worker->workers = workers; 477 worker->workers = workers;
487 worker->task = kthread_run(worker_loop, worker, 478 worker->task = kthread_run(worker_loop, worker,
488 "btrfs-%s-%d", workers->name, 479 "btrfs-%s-%d", workers->name,
489 workers->num_workers + i); 480 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) { 481 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task); 482 ret = PTR_ERR(worker->task);
492 kfree(worker); 483 kfree(worker);
493 goto fail; 484 goto fail;
494 }
495 spin_lock_irq(&workers->lock);
496 list_add_tail(&worker->worker_list, &workers->idle_list);
497 worker->idle = 1;
498 workers->num_workers++;
499 workers->num_workers_starting--;
500 WARN_ON(workers->num_workers_starting < 0);
501 spin_unlock_irq(&workers->lock);
502 } 485 }
486 spin_lock_irq(&workers->lock);
487 list_add_tail(&worker->worker_list, &workers->idle_list);
488 worker->idle = 1;
489 workers->num_workers++;
490 workers->num_workers_starting--;
491 WARN_ON(workers->num_workers_starting < 0);
492 spin_unlock_irq(&workers->lock);
493
503 return 0; 494 return 0;
504fail: 495fail:
505 btrfs_stop_workers(workers); 496 spin_lock_irq(&workers->lock);
497 workers->num_workers_starting--;
498 spin_unlock_irq(&workers->lock);
506 return ret; 499 return ret;
507} 500}
508 501
509int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 502int btrfs_start_workers(struct btrfs_workers *workers)
510{ 503{
511 spin_lock_irq(&workers->lock); 504 spin_lock_irq(&workers->lock);
512 workers->num_workers_starting += num_workers; 505 workers->num_workers_starting++;
513 spin_unlock_irq(&workers->lock); 506 spin_unlock_irq(&workers->lock);
514 return __btrfs_start_workers(workers, num_workers); 507 return __btrfs_start_workers(workers);
515} 508}
516 509
517/* 510/*
@@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
568 struct btrfs_worker_thread *worker; 561 struct btrfs_worker_thread *worker;
569 unsigned long flags; 562 unsigned long flags;
570 struct list_head *fallback; 563 struct list_head *fallback;
564 int ret;
571 565
572again:
573 spin_lock_irqsave(&workers->lock, flags); 566 spin_lock_irqsave(&workers->lock, flags);
567again:
574 worker = next_worker(workers); 568 worker = next_worker(workers);
575 569
576 if (!worker) { 570 if (!worker) {
@@ -584,7 +578,10 @@ again:
584 workers->num_workers_starting++; 578 workers->num_workers_starting++;
585 spin_unlock_irqrestore(&workers->lock, flags); 579 spin_unlock_irqrestore(&workers->lock, flags);
586 /* we're below the limit, start another worker */ 580 /* we're below the limit, start another worker */
587 __btrfs_start_workers(workers, 1); 581 ret = __btrfs_start_workers(workers);
582 spin_lock_irqsave(&workers->lock, flags);
583 if (ret)
584 goto fallback;
588 goto again; 585 goto again;
589 } 586 }
590 } 587 }
@@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
665/* 662/*
666 * places a struct btrfs_work into the pending queue of one of the kthreads 663 * places a struct btrfs_work into the pending queue of one of the kthreads
667 */ 664 */
668int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) 665void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
669{ 666{
670 struct btrfs_worker_thread *worker; 667 struct btrfs_worker_thread *worker;
671 unsigned long flags; 668 unsigned long flags;
@@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
673 670
674 /* don't requeue something already on a list */ 671 /* don't requeue something already on a list */
675 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 672 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
676 goto out; 673 return;
677 674
678 worker = find_worker(workers); 675 worker = find_worker(workers);
679 if (workers->ordered) { 676 if (workers->ordered) {
@@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
712 if (wake) 709 if (wake)
713 wake_up_process(worker->task); 710 wake_up_process(worker->task);
714 spin_unlock_irqrestore(&worker->lock, flags); 711 spin_unlock_irqrestore(&worker->lock, flags);
715
716out:
717 return 0;
718} 712}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
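With btrfs_start_workers() no longer taking a count (it starts exactly one thread and returns an error on failure) and btrfs_queue_worker() returning void, the caller contract changes. A minimal sketch of the assumed usage; the workers variable, work item, stub handler and error label are illustrative, not taken from this series:

	static void example_work_func(struct btrfs_work *work)
	{
		/* illustrative no-op handler */
	}

	struct btrfs_workers workers;
	struct btrfs_work work;
	int ret;

	btrfs_init_workers(&workers, "example", 1, NULL);

	ret = btrfs_start_workers(&workers);	/* spawns exactly one kthread */
	if (ret)
		goto fail;			/* hypothetical error path */

	work.func = example_work_func;
	btrfs_queue_worker(&workers, &work);	/* nothing to check anymore */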
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..22c64fff1bd5
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
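A sketch of how the new path-resolution helpers chain together, based only on the signatures and fields defined above; the 4096-byte buffer, the printk and the surrounding error handling are illustrative:

	struct btrfs_path *path = btrfs_alloc_path();
	struct inode_fs_paths *ipath;
	int i, ret;

	if (!path)
		return -ENOMEM;

	ipath = init_ipath(4096, fs_root, path);	/* container + path bytes */
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	for (i = 0; !ret && i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_INFO "path %d: %s\n", i,
		       (char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	btrfs_free_path(path);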
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
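The iterator typedefs make the intended call pattern explicit: the caller supplies a callback that is invoked once per inode found for an extent, and a non-zero return stops the walk. A hedged example with a made-up callback and context struct:

	struct ref_count_ctx {
		u64 refs;
	};

	static int count_one_ref(u64 inum, u64 offset, u64 root, void *ctx)
	{
		struct ref_count_ctx *rc = ctx;

		rc->refs++;
		return 0;		/* non-zero would abort the iteration */
	}

	/* ... */
	struct ref_count_ctx rc = { 0 };
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  count_one_ref, &rc);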
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 	       ((disk_size + root->sectorsize - 1) / root->sectorsize) *
 	       csum_size;
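For scale, compressed_bio_size() above reserves one checksum per on-disk sector of the compressed extent. Assuming 4 KiB sectors and 4-byte crc32c checksums (typical values, not stated in this hunk), a 128 KiB compressed extent needs 128 KiB / 4 KiB = 32 checksums, so the allocation is sizeof(struct compressed_bio) + 32 * 4 = sizeof(struct compressed_bio) + 128 bytes.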
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
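Assuming the usual widths (8 bytes per __le64, 1 byte per u8), the packed btrfs_root_backup above works out to (15 + 4) * 8 + 6 + 10 = 168 bytes, so the four-entry super_roots array added to btrfs_super_block just below occupies 4 * 168 = 672 bytes of the super block.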
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
811enum btrfs_caching_type { 848enum btrfs_caching_type {
812 BTRFS_CACHE_NO = 0, 849 BTRFS_CACHE_NO = 0,
813 BTRFS_CACHE_STARTED = 1, 850 BTRFS_CACHE_STARTED = 1,
814 BTRFS_CACHE_FINISHED = 2, 851 BTRFS_CACHE_FAST = 2,
852 BTRFS_CACHE_FINISHED = 3,
815}; 853};
816 854
817enum btrfs_disk_cache_state { 855enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 878 spinlock_t lock;
841 u64 pinned; 879 u64 pinned;
842 u64 reserved; 880 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 881 u64 bytes_super;
845 u64 flags; 882 u64 flags;
846 u64 sectorsize; 883 u64 sectorsize;
884 u64 cache_generation;
847 unsigned int ro:1; 885 unsigned int ro:1;
848 unsigned int dirty:1; 886 unsigned int dirty:1;
849 unsigned int iref:1; 887 unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 937 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 938 struct rb_root block_group_cache_tree;
901 939
940 /* keep track of unallocated space */
941 spinlock_t free_chunk_lock;
942 u64 free_chunk_space;
943
902 struct extent_io_tree freed_extents[2]; 944 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 945 struct extent_io_tree *pinned_extents;
904 946
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 958 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 959 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 960 struct btrfs_block_rsv chunk_block_rsv;
961 /* block reservation for delayed operations */
962 struct btrfs_block_rsv delayed_block_rsv;
919 963
920 struct btrfs_block_rsv empty_block_rsv; 964 struct btrfs_block_rsv empty_block_rsv;
921 965
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 966 u64 generation;
928 u64 last_trans_committed; 967 u64 last_trans_committed;
929 968
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 981 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 982 wait_queue_head_t async_submit_wait;
944 983
945 struct btrfs_super_block super_copy; 984 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 985 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 986 struct block_device *__bdev;
948 struct super_block *sb; 987 struct super_block *sb;
949 struct inode *btree_inode; 988 struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1075 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1076 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1077 struct btrfs_workers caching_workers;
1078 struct btrfs_workers readahead_workers;
1039 1079
1040 /* 1080 /*
1041 * fixup workers take dirty pages that didn't properly go through 1081 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1159 u64 fs_state;
1120 1160
1121 struct btrfs_delayed_root *delayed_root; 1161 struct btrfs_delayed_root *delayed_root;
1162
1163 /* readahead tree */
1164 spinlock_t reada_lock;
1165 struct radix_tree_root reada_tree;
1166
1167 /* next backup root to be overwritten */
1168 int backup_root_index;
1122}; 1169};
1123 1170
1124/* 1171/*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
1225 * for stat. It may be used for more later 1272 * for stat. It may be used for more later
1226 */ 1273 */
1227 dev_t anon_dev; 1274 dev_t anon_dev;
1275
1276 int force_cow;
1228}; 1277};
1229 1278
1230struct btrfs_ioctl_defrag_range_args { 1279struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1412#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1416
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2028 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2029}
1980 2030
2031/* struct btrfs_root_backup */
2032BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2033 tree_root, 64);
2034BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2035 tree_root_gen, 64);
2036BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2037 tree_root_level, 8);
2038
2039BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2040 chunk_root, 64);
2041BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2042 chunk_root_gen, 64);
2043BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2044 chunk_root_level, 8);
2045
2046BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2047 extent_root, 64);
2048BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2049 extent_root_gen, 64);
2050BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2051 extent_root_level, 8);
2052
2053BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2054 fs_root, 64);
2055BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2056 fs_root_gen, 64);
2057BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2058 fs_root_level, 8);
2059
2060BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2061 dev_root, 64);
2062BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2063 dev_root_gen, 64);
2064BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2065 dev_root_level, 8);
2066
2067BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2068 csum_root, 64);
2069BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2070 csum_root_gen, 64);
2071BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2072 csum_root_level, 8);
2073BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2074 total_bytes, 64);
2075BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2076 bytes_used, 64);
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64);
2079
1981/* struct btrfs_super_block */ 2080/* struct btrfs_super_block */
1982 2081
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2228 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2229}
2131 2230
2231static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2232{
2233 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2234}
2235
2132/* extent-tree.c */ 2236/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2237static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2238 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2241 3 * num_items;
2138} 2242}
2139 2243
2244/*
2245 * Doing a truncate won't result in new nodes or leaves, just what we need for
2246 * COW.
2247 */
2248static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2249 unsigned num_items)
2250{
2251 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2252 num_items;
2253}
2254
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2255void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2256int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2257 struct btrfs_root *root, unsigned long count);
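As a worked example of the two reservation helpers (assuming 4 KiB leaves and nodes and a BTRFS_MAX_LEVEL of 8, neither of which is shown in this hunk): one item costs (4 KiB + 4 KiB * 7) = 32 KiB under btrfs_calc_trunc_metadata_size(), since a truncate only needs the COW path from root to leaf, versus 3 * 32 KiB = 96 KiB under btrfs_calc_trans_metadata_size(), which keeps the extra factor for operations that may also add new nodes or leaves.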
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2261 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2262int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2263 u64 bytenr, u64 num, int reserved);
2264int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2265 struct btrfs_root *root,
2266 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2267int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2268 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2269 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2314 u64 root_objectid, u64 owner, u64 offset);
2197 2315
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2318 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2319int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2320 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2321int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2377 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2378void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2379 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2380 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2381int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2382 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2383int btrfs_set_block_group_rw(struct btrfs_root *root,
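
The block reservation entry points lose their trans handle argument and split into flushing and non-flushing variants. A rough sketch of the new calling convention (root, rsv and the surrounding function are assumptions, not code from this patch; a caller already inside a transaction would pick the _noflush variant instead):

	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	int ret;

	/* may flush delalloc or commit the transaction to find the space */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes);
	if (ret)
		return ret;

	/* ... consume the reservation ... */

	btrfs_block_rsv_release(root, rsv, num_bytes);
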
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2498 smp_mb();
2380 return fs_info->closing; 2499 return fs_info->closing;
2381} 2500}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{
2503 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root);
2506 kfree(fs_info->chunk_root);
2507 kfree(fs_info->dev_root);
2508 kfree(fs_info->csum_root);
2509 kfree(fs_info->super_copy);
2510 kfree(fs_info->super_for_commit);
2511 kfree(fs_info);
2512}
2382 2513
2383/* root-item.c */ 2514/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2515int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2561,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2561int btrfs_readpage(struct file *file, struct page *page); 2692int btrfs_readpage(struct file *file, struct page *page);
2562void btrfs_evict_inode(struct inode *inode); 2693void btrfs_evict_inode(struct inode *inode);
2563int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2694int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2564void btrfs_dirty_inode(struct inode *inode, int flags); 2695int btrfs_dirty_inode(struct inode *inode);
2696int btrfs_update_time(struct file *file);
2565struct inode *btrfs_alloc_inode(struct super_block *sb); 2697struct inode *btrfs_alloc_inode(struct super_block *sb);
2566void btrfs_destroy_inode(struct inode *inode); 2698void btrfs_destroy_inode(struct inode *inode);
2567int btrfs_drop_inode(struct inode *inode); 2699int btrfs_drop_inode(struct inode *inode);
@@ -2579,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2711int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2712int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2713int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2714void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2715 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2716int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2824int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2825 struct btrfs_scrub_progress *progress);
2699 2826
2827/* reada.c */
2828struct reada_control {
2829 struct btrfs_root *root; /* tree to prefetch */
2830 struct btrfs_key key_start;
2831 struct btrfs_key key_end; /* exclusive */
2832 atomic_t elems;
2833 struct kref refcnt;
2834 wait_queue_head_t wait;
2835};
2836struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2837 struct btrfs_key *start, struct btrfs_key *end);
2838int btrfs_reada_wait(void *handle);
2839void btrfs_reada_detach(void *handle);
2840int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2841 u64 start, int err);
2842
2700#endif 2843#endif
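
The readahead interface declared at the end of the header is an add/wait pair around a reada_control. A hypothetical consumer, sketched under the assumption that btrfs_reada_add() returns an ERR_PTR on allocation failure (scrub is the in-tree user this series targets):

	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = { .objectid = (u64)-1, .type = (u8)-1,
				 .offset = (u64)-1 };
	struct reada_control *rc;

	/* kick off background prefetch of the whole tree */
	rc = btrfs_reada_add(root, &start, &end);
	if (IS_ERR(rc))
		return PTR_ERR(rc);

	/* ... do the real work while the readahead workers run ... */

	btrfs_reada_wait(rc);	/* block until the prefetch has drained */
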
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..c7ddf8a01c54 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
 641	 * that space is already accounted for.
642 */
643 if (!src_rsv || (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv)) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
699 * block_rsv. This is to simplify people who don't normally have things
700 * migrated from their block rsv. If they go to release their
701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
 708	 * how block rsvs work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
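
With the reservation now able to fail, the caller contract changes as well: btrfs_delayed_update_inode() passes the error up instead of BUG_ON()ing (see the hunk below), and btrfs_dirty_inode() returns int in the ctree.h hunk above. A caller-side sketch of the fallback this enables, written as an assumption about inode.c rather than a quote of it:

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC) {
		/* the joined transaction reserved nothing; retry with one item */
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		ret = btrfs_update_inode(trans, root, inode);
	}
	btrfs_end_transaction(trans, root);
	return ret;
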
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..3f9d5551e582 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
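
reada_tree_block_flagged() only hands the buffer back when the read completed and verified; otherwise the EXTENT_BUFFER_READAHEAD bit it sets is what later triggers btree_readahead_hook() from the end_io paths above. A hypothetical caller (bytenr, blocksize and mirror_num are placeholders):

	struct extent_buffer *eb = NULL;
	int ret;

	ret = reada_tree_block_flagged(root, bytenr, blocksize, mirror_num, &eb);
	if (ret || !eb)
		return;		/* failed, or still in flight: the hook fires later */

	/* eb is uptodate here; walk it, queue children, then drop the ref */
	free_extent_buffer(eb);
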
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the newest entry is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match or we should
1843	 * require a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
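
To make the ring arithmetic concrete, assume the backup array holds four slots (BTRFS_NUM_BACKUP_ROOTS == 4 elsewhere in this series; the value is not visible in this hunk). If slot 2 matches the super block's generation, find_newest_super_backup() returns 2, find_oldest_super_backup() sets backup_root_index to (2 + 1) % 4 = 3 so the next commit overwrites the oldest copy, and a recovery walk through next_root_backup() tries slot 2 first, then (2 + 4 - 1) % 4 = 1, then 0, then 3 before reporting that every backup has been exhausted.
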
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
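
This hunk follows directly from super_copy and super_for_commit becoming pointers (see free_fs_info() in the ctree.h diff above): with a pointer member, sizeof on the member itself only measures the pointer, so the copy length has to come from the pointed-to type. Illustration only; the "wrong" line shows the pitfall, it is not in this patch:

	/* sizeof(fs_info->super_copy) is now just pointer-sized */
	memcpy(fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));	/* wrong: copies only a pointer's worth */

	/* so the patch sizes the copy from the pointed-to struct instead */
	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));	/* whole super block structure */
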
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
2088 * run through our array of backup supers and setup
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,19 +2192,29 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 /*
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 * btrfs_start_workers can really only fail because of ENOMEM so just
1886 btrfs_start_workers(&fs_info->submit_workers, 1); 2199 * return -ENOMEM if any of these fail.
1887 btrfs_start_workers(&fs_info->delalloc_workers, 1); 2200 */
1888 btrfs_start_workers(&fs_info->fixup_workers, 1); 2201 ret = btrfs_start_workers(&fs_info->workers);
1889 btrfs_start_workers(&fs_info->endio_workers, 1); 2202 ret |= btrfs_start_workers(&fs_info->generic_worker);
1890 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 2203 ret |= btrfs_start_workers(&fs_info->submit_workers);
1891 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 2204 ret |= btrfs_start_workers(&fs_info->delalloc_workers);
1892 btrfs_start_workers(&fs_info->endio_write_workers, 1); 2205 ret |= btrfs_start_workers(&fs_info->fixup_workers);
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 ret |= btrfs_start_workers(&fs_info->endio_workers);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2209 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2210 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2211 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2212 ret |= btrfs_start_workers(&fs_info->caching_workers);
2213 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2214 if (ret) {
2215 ret = -ENOMEM;
2216 goto fail_sb_buffer;
2217 }
1896 2218
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2219 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2220 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2261,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2261 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2262 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2263 sb->s_id);
1942 goto fail_chunk_root; 2264 goto fail_tree_roots;
1943 } 2265 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2266 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2267 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2276,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2276 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2277 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2278 sb->s_id);
1957 goto fail_chunk_root; 2279 goto fail_tree_roots;
1958 } 2280 }
1959 2281
1960 btrfs_close_extra_devices(fs_devices); 2282 btrfs_close_extra_devices(fs_devices);
1961 2283
2284retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2285 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2286 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2287 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2289,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2289 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2290 btrfs_super_root(disk_super),
1968 blocksize, generation); 2291 blocksize, generation);
1969 if (!tree_root->node) 2292 if (!tree_root->node ||
1970 goto fail_chunk_root; 2293 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2294 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2295 sb->s_id);
1974 goto fail_tree_root; 2296
2297 goto recovery_tree_root;
1975 } 2298 }
2299
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2300 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2301 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2302
1979 ret = find_and_setup_root(tree_root, fs_info, 2303 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2304 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2305 if (ret)
1982 goto fail_tree_root; 2306 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2307 extent_root->track_dirty = 1;
1984 2308
1985 ret = find_and_setup_root(tree_root, fs_info, 2309 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2310 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2311 if (ret)
1988 goto fail_extent_root; 2312 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2313 dev_root->track_dirty = 1;
1990 2314
1991 ret = find_and_setup_root(tree_root, fs_info, 2315 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2316 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2317 if (ret)
1994 goto fail_dev_root; 2318 goto recovery_tree_root;
1995 2319
1996 csum_root->track_dirty = 1; 2320 csum_root->track_dirty = 1;
1997 2321
@@ -2124,22 +2448,13 @@ fail_cleaner:
2124 2448
2125fail_block_groups: 2449fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2450 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2451
2128 free_extent_buffer(csum_root->commit_root); 2452fail_tree_roots:
2129fail_dev_root: 2453 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2454
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2455fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2456 btrfs_stop_workers(&fs_info->generic_worker);
2457 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2458 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2459 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2460 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2467,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2467 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2468 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2469fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2470fail_iput:
2471 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2472
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2473 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2474 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2475fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2476 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2477fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2478 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2479fail:
2167 kfree(extent_root); 2480 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2481 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2482 return ERR_PTR(err);
2483
2484recovery_tree_root:
2485 if (!btrfs_test_opt(tree_root, RECOVERY))
2486 goto fail_tree_roots;
2487
2488 free_root_pointers(fs_info, 0);
2489
2490 /* don't use the log in recovery mode, it won't be valid */
2491 btrfs_set_super_log_root(disk_super, 0);
2492
2493 /* we can't trust the free space cache either */
2494 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2495
2496 ret = next_root_backup(fs_info, fs_info->super_copy,
2497 &num_backups_tried, &backup_index);
2498 if (ret == -1)
2499 goto fail_block_groups;
2500 goto retry_root_backup;
2174} 2501}
2175 2502
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2503static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
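
The recovery_tree_root path above is gated on the RECOVERY mount option introduced alongside this series (mounting with -o recovery, as far as the option name can be inferred from btrfs_test_opt(tree_root, RECOVERY) here). Each pass clears the log root pointer, since a log written against a newer root cannot be replayed safely, forces CLEAR_CACHE because the free-space cache may be equally stale, and then loops back to retry_root_backup with the next-oldest backup root until next_root_backup() reports that all copies have been tried.
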
@@ -2254,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2581 int errors = 0;
2255 u32 crc; 2582 u32 crc;
2256 u64 bytenr; 2583 u64 bytenr;
2257 int last_barrier = 0;
2258 2584
2259 if (max_mirrors == 0) 2585 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2586 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2587
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2588 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2589 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2590 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2630 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2631 }
2317 2632
2318 if (i == last_barrier && do_barriers) 2633 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2634 * we fua the first super. The others we allow
2320 else 2635 * to go down lazy.
2321 ret = submit_bh(WRITE_SYNC, bh); 2636 */
2322 2637 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2638 if (ret)
2324 errors++; 2639 errors++;
2325 } 2640 }
2326 return errors < i ? 0 : -1; 2641 return errors < i ? 0 : -1;
2327} 2642}
2328 2643
2644/*
2645 * endio for write_dev_flush; this will wake anyone waiting
2646 * for the barrier when it is done
2647 */
2648static void btrfs_end_empty_barrier(struct bio *bio, int err)
2649{
2650 if (err) {
2651 if (err == -EOPNOTSUPP)
2652 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2653 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2654 }
2655 if (bio->bi_private)
2656 complete(bio->bi_private);
2657 bio_put(bio);
2658}
2659
2660/*
2661 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2662 * sent down. With wait == 1, it waits for the previous flush.
2663 *
2664 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2665 * capable
2666 */
2667static int write_dev_flush(struct btrfs_device *device, int wait)
2668{
2669 struct bio *bio;
2670 int ret = 0;
2671
2672 if (device->nobarriers)
2673 return 0;
2674
2675 if (wait) {
2676 bio = device->flush_bio;
2677 if (!bio)
2678 return 0;
2679
2680 wait_for_completion(&device->flush_wait);
2681
2682 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2683 printk("btrfs: disabling barriers on dev %s\n",
2684 device->name);
2685 device->nobarriers = 1;
2686 }
2687 if (!bio_flagged(bio, BIO_UPTODATE)) {
2688 ret = -EIO;
2689 }
2690
2691 /* drop the reference from the wait == 0 run */
2692 bio_put(bio);
2693 device->flush_bio = NULL;
2694
2695 return ret;
2696 }
2697
2698 /*
2699 * one reference for us, and we leave it for the
2700 * caller
2701 */
2702	device->flush_bio = NULL;
2703 bio = bio_alloc(GFP_NOFS, 0);
2704 if (!bio)
2705 return -ENOMEM;
2706
2707 bio->bi_end_io = btrfs_end_empty_barrier;
2708 bio->bi_bdev = device->bdev;
2709 init_completion(&device->flush_wait);
2710 bio->bi_private = &device->flush_wait;
2711 device->flush_bio = bio;
2712
2713 bio_get(bio);
2714 submit_bio(WRITE_FLUSH, bio);
2715
2716 return 0;
2717}
2718
2719/*
2720 * send an empty flush down to each device in parallel,
2721 * then wait for them
2722 */
2723static int barrier_all_devices(struct btrfs_fs_info *info)
2724{
2725 struct list_head *head;
2726 struct btrfs_device *dev;
2727 int errors = 0;
2728 int ret;
2729
2730 /* send down all the barriers */
2731 head = &info->fs_devices->devices;
2732 list_for_each_entry_rcu(dev, head, dev_list) {
2733 if (!dev->bdev) {
2734 errors++;
2735 continue;
2736 }
2737 if (!dev->in_fs_metadata || !dev->writeable)
2738 continue;
2739
2740 ret = write_dev_flush(dev, 0);
2741 if (ret)
2742 errors++;
2743 }
2744
2745 /* wait for all the barriers */
2746 list_for_each_entry_rcu(dev, head, dev_list) {
2747 if (!dev->bdev) {
2748 errors++;
2749 continue;
2750 }
2751 if (!dev->in_fs_metadata || !dev->writeable)
2752 continue;
2753
2754 ret = write_dev_flush(dev, 1);
2755 if (ret)
2756 errors++;
2757 }
2758 if (errors)
2759 return -EIO;
2760 return 0;
2761}
2762
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2763int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2764{
2331 struct list_head *head; 2765 struct list_head *head;
@@ -2338,14 +2772,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2772 int total_errors = 0;
2339 u64 flags; 2773 u64 flags;
2340 2774
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2775 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2776 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2777 backup_super_roots(root->fs_info);
2343 2778
2344 sb = &root->fs_info->super_for_commit; 2779 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2780 dev_item = &sb->dev_item;
2346 2781
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2782 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2783 head = &root->fs_info->fs_devices->devices;
2784
2785 if (do_barriers)
2786 barrier_all_devices(root->fs_info);
2787
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2788 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2789 if (!dev->bdev) {
2351 total_errors++; 2790 total_errors++;
@@ -2545,8 +2984,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2984 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2985 btrfs_run_defrag_inodes(root->fs_info);
2547 2986
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2987 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2988 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2989 *
@@ -2572,6 +3009,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3009 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3010 }
2574 3011
3012 btrfs_put_block_group_cache(fs_info);
3013
2575 kthread_stop(root->fs_info->transaction_kthread); 3014 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3015 kthread_stop(root->fs_info->cleaner_kthread);
2577 3016
@@ -2603,7 +3042,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3042 del_fs_roots(fs_info);
2604 3043
2605 iput(fs_info->btree_inode); 3044 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3045
2608 btrfs_stop_workers(&fs_info->generic_worker); 3046 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3047 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3055 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3056 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3057 btrfs_stop_workers(&fs_info->caching_workers);
3058 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3059
2621 btrfs_close_devices(fs_info->fs_devices); 3060 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3061 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3063,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3063 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3064 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3065
2627 kfree(fs_info->extent_root); 3066 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3067
2634 return 0; 3068 return 0;
2635} 3069}
@@ -2735,7 +3169,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3169 return ret;
2736} 3170}
2737 3171
2738int btree_lock_page_hook(struct page *page) 3172static int btree_lock_page_hook(struct page *page, void *data,
3173 void (*flush_fn)(void *))
2739{ 3174{
2740 struct inode *inode = page->mapping->host; 3175 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3176 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3187,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3187 if (!eb)
2753 goto out; 3188 goto out;
2754 3189
2755 btrfs_tree_lock(eb); 3190 if (!btrfs_try_tree_write_lock(eb)) {
3191 flush_fn(data);
3192 btrfs_tree_lock(eb);
3193 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3194 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3195
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3196 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3205,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3205 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3206 free_extent_buffer(eb);
2769out: 3207out:
2770 lock_page(page); 3208 if (!trylock_page(page)) {
3209 flush_fn(data);
3210 lock_page(page);
3211 }
2771 return 0; 3212 return 0;
2772} 3213}
2773 3214
@@ -3123,6 +3564,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3564static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3565 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3566 .readpage_end_io_hook = btree_readpage_end_io_hook,
3567 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3568 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3569 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3570 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..8603ee4e3dfd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
 496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wakeup any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
 2762 /* We've already set up this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret && dcs == BTRFS_DC_SETUP)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
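
The hunks above tune when should_alloc_chunk() asks for a new chunk. A rough worked model of the thresholds, assuming (as the surrounding comments suggest) that div_factor_fine(x, n) yields n percent of x and div_factor(x, 8) yields 80 percent, for a hypothetical 100 GiB filesystem:

#include <stdio.h>
#include <stdint.h>

/* Assumed semantics, inferred from the comments in should_alloc_chunk(). */
static uint64_t div_factor_fine(uint64_t num, int factor) { return num * factor / 100; }
static uint64_t div_factor(uint64_t num, int factor) { return num * factor / 10; }
#define MAX_U64(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    uint64_t total = 100ULL << 30;          /* hypothetical 100 GiB filesystem */

    /* CHUNK_ALLOC_LIMITED: keep at least 64 MiB or ~1% of the FS free. */
    uint64_t limited = MAX_U64(64ULL << 20, div_factor_fine(total, 1));

    /* Normal case: 256 MiB or 5% of the FS ... */
    uint64_t thresh = MAX_U64(256ULL << 20, div_factor_fine(total, 5));

    /* ... and don't bother unless the space_info is roughly 80% allocated. */
    uint64_t eighty_pct = div_factor(total, 8);

    printf("limited threshold: %llu MiB\n", (unsigned long long)(limited >> 20));
    printf("normal threshold:  %llu MiB\n", (unsigned long long)(thresh >> 20));
    printf("80%% of total:      %llu MiB\n", (unsigned long long)(eighty_pct >> 20));
    return 0;
}
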
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,7 +3409,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
@@ -3343,9 +3419,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3344 3420
3345 spin_lock(&space_info->lock); 3421 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3422 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3423 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3424 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3425 spin_unlock(&space_info->lock);
3350 3426
3351 loops++; 3427 loops++;
@@ -3356,11 +3432,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3432 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3433 return -EAGAIN;
3358 3434
3359 time_left = schedule_timeout_interruptible(1); 3435 if (wait_ordered && !trans) {
3436 btrfs_wait_ordered_extents(root, 0, 0);
3437 } else {
3438 time_left = schedule_timeout_interruptible(1);
3360 3439
3361 /* We were interrupted, exit */ 3440 /* We were interrupted, exit */
3362 if (time_left) 3441 if (time_left)
3363 break; 3442 break;
3443 }
3364 3444
3365 /* we've kicked the IO a few times, if anything has been freed, 3445 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3446 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3455,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3455 }
3376 3456
3377 } 3457 }
3378 if (reclaimed >= to_reclaim && !trans) 3458
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3459 return reclaimed >= to_reclaim;
3381} 3460}
3382 3461
3383/* 3462/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3463 * may_commit_transaction - possibly commit the transaction if it's OK to do so
3385 * idea is if this is the first call (retries == 0) then we will add to our 3464 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3465 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3466 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3467 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3468 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3469 * get us somewhere and then commit the transaction if it does. Otherwise it
3470 * will return -ENOSPC.
3393 */ 3471 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3472static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3473 struct btrfs_space_info *space_info,
3474 u64 bytes, int force)
3475{
3476 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3477 struct btrfs_trans_handle *trans;
3478
3479 trans = (struct btrfs_trans_handle *)current->journal_info;
3480 if (trans)
3481 return -EAGAIN;
3482
3483 if (force)
3484 goto commit;
3485
3486 /* See if there is enough pinned space to make this reservation */
3487 spin_lock(&space_info->lock);
3488 if (space_info->bytes_pinned >= bytes) {
3489 spin_unlock(&space_info->lock);
3490 goto commit;
3491 }
3492 spin_unlock(&space_info->lock);
3493
3494 /*
3495 * See if there is some space in the delayed insertion reservation for
3496 * this reservation.
3497 */
3498 if (space_info != delayed_rsv->space_info)
3499 return -ENOSPC;
3500
3501 spin_lock(&delayed_rsv->lock);
3502 if (delayed_rsv->size < bytes) {
3503 spin_unlock(&delayed_rsv->lock);
3504 return -ENOSPC;
3505 }
3506 spin_unlock(&delayed_rsv->lock);
3507
3508commit:
3509 trans = btrfs_join_transaction(root);
3510 if (IS_ERR(trans))
3511 return -ENOSPC;
3512
3513 return btrfs_commit_transaction(trans, root);
3514}
3515
3516/**
3517 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3518 * @root - the root we're allocating for
3519 * @block_rsv - the block_rsv we're allocating for
3520 * @orig_bytes - the number of bytes we want
 3521 * @flush - whether or not we can flush to make our reservation
3522 *
 3523 * This will reserve orig_bytes number of bytes from the space info associated
3524 * with the block_rsv. If there is not enough space it will make an attempt to
3525 * flush out space to make room. It will do this by flushing delalloc if
3526 * possible or committing the transaction. If flush is 0 then no attempts to
3527 * regain reservations will be made and this will fail if there is not enough
3528 * space already.
3529 */
3530static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3531 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3532 u64 orig_bytes, int flush)
3398{ 3533{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3534 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3535 u64 used;
3401 u64 num_bytes = orig_bytes; 3536 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3537 int retries = 0;
3403 int ret = 0; 3538 int ret = 0;
3404 bool committed = false; 3539 bool committed = false;
3405 bool flushing = false; 3540 bool flushing = false;
3541 bool wait_ordered = false;
3406 3542
3407again: 3543again:
3408 ret = 0; 3544 ret = 0;
@@ -3419,7 +3555,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3555 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3556 * hold the current transaction open.
3421 */ 3557 */
3422 if (trans) 3558 if (current->journal_info)
3423 return -EAGAIN; 3559 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3560 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3561 !space_info->flush);
@@ -3431,9 +3567,9 @@ again:
3431 } 3567 }
3432 3568
3433 ret = -ENOSPC; 3569 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3570 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3571 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3572 space_info->bytes_may_use;
3437 3573
3438 /* 3574 /*
3439 * The idea here is that we've not already over-reserved the block group 3575 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3578,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3578 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3579 * our reservation.
3444 */ 3580 */
3445 if (unused <= space_info->total_bytes) { 3581 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3582 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3583 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3584 ret = 0;
3450 } else { 3585 } else {
3451 /* 3586 /*
@@ -3461,10 +3596,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3596 * amount plus the amount of bytes that we need for this
3462 * reservation. 3597 * reservation.
3463 */ 3598 */
3464 num_bytes = unused - space_info->total_bytes + 3599 wait_ordered = true;
3600 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3601 (orig_bytes * (retries + 1));
3466 } 3602 }
3467 3603
3604 if (ret) {
3605 u64 profile = btrfs_get_alloc_profile(root, 0);
3606 u64 avail;
3607
3608 /*
3609 * If we have a lot of space that's pinned, don't bother doing
3610 * the overcommit dance yet and just commit the transaction.
3611 */
3612 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3613 do_div(avail, 10);
3614 if (space_info->bytes_pinned >= avail && flush && !committed) {
3615 space_info->flush = 1;
3616 flushing = true;
3617 spin_unlock(&space_info->lock);
3618 ret = may_commit_transaction(root, space_info,
3619 orig_bytes, 1);
3620 if (ret)
3621 goto out;
3622 committed = true;
3623 goto again;
3624 }
3625
3626 spin_lock(&root->fs_info->free_chunk_lock);
3627 avail = root->fs_info->free_chunk_space;
3628
3629 /*
3630 * If we have dup, raid1 or raid10 then only half of the free
3631 * space is actually useable.
3632 */
3633 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3634 BTRFS_BLOCK_GROUP_RAID1 |
3635 BTRFS_BLOCK_GROUP_RAID10))
3636 avail >>= 1;
3637
3638 /*
3639 * If we aren't flushing don't let us overcommit too much, say
3640 * 1/8th of the space. If we can flush, let it overcommit up to
3641 * 1/2 of the space.
3642 */
3643 if (flush)
3644 avail >>= 3;
3645 else
3646 avail >>= 1;
3647 spin_unlock(&root->fs_info->free_chunk_lock);
3648
3649 if (used + num_bytes < space_info->total_bytes + avail) {
3650 space_info->bytes_may_use += orig_bytes;
3651 ret = 0;
3652 } else {
3653 wait_ordered = true;
3654 }
3655 }
3656
3468 /* 3657 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3658 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3659 * to reclaim space we can actually use it instead of somebody else
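
The rewritten reservation path above charges bytes_may_use directly and, once the space_info looks full, permits a bounded overcommit based on unallocated chunk space. A condensed user-space model of that decision; the counter names and the halving for DUP/RAID1/RAID10 follow the hunk, everything else is a simplified sketch rather than the kernel code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct space_info_model {
    uint64_t total_bytes;
    uint64_t bytes_used;
    uint64_t bytes_reserved;
    uint64_t bytes_pinned;
    uint64_t bytes_readonly;
    uint64_t bytes_may_use;
};

/* Returns true if orig_bytes can be reserved, possibly by overcommitting. */
static bool try_reserve(struct space_info_model *s, uint64_t free_chunk_space,
                        bool mirrored_profile, bool flush, uint64_t orig_bytes)
{
    uint64_t used = s->bytes_used + s->bytes_reserved + s->bytes_pinned +
                    s->bytes_readonly + s->bytes_may_use;

    /* Easy case: the reservation fits inside the allocated chunks. */
    if (used + orig_bytes <= s->total_bytes) {
        s->bytes_may_use += orig_bytes;
        return true;
    }

    /* Otherwise see how far we may overcommit into unallocated space. */
    uint64_t avail = free_chunk_space;
    if (mirrored_profile)            /* DUP/RAID1/RAID10: only half is usable */
        avail >>= 1;
    avail >>= flush ? 3 : 1;         /* mirror the hunk: >>3 when flush is set, >>1 otherwise */

    if (used + orig_bytes < s->total_bytes + avail) {
        s->bytes_may_use += orig_bytes;
        return true;
    }
    return false;                    /* caller would flush delalloc or commit */
}

int main(void)
{
    struct space_info_model s = { .total_bytes = 8ULL << 30,
                                  .bytes_used = 8ULL << 30 };
    printf("reserved: %d\n", try_reserve(&s, 4ULL << 30, false, true, 1ULL << 20));
    return 0;
}
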
@@ -3484,7 +3673,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3673 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3674 * metadata until after the IO is completed.
3486 */ 3675 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3676 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3677 if (ret < 0)
3489 goto out; 3678 goto out;
3490 3679
@@ -3496,35 +3685,17 @@ again:
3496 * so go back around and try again. 3685 * so go back around and try again.
3497 */ 3686 */
3498 if (retries < 2) { 3687 if (retries < 2) {
3688 wait_ordered = true;
3499 retries++; 3689 retries++;
3500 goto again; 3690 goto again;
3501 } 3691 }
3502 3692
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3693 ret = -ENOSPC;
3519 if (committed) 3694 if (committed)
3520 goto out; 3695 goto out;
3521 3696
3522 trans = btrfs_join_transaction(root); 3697 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3698 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3699 committed = true;
3529 goto again; 3700 goto again;
3530 } 3701 }
@@ -3542,10 +3713,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3713static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3714 struct btrfs_root *root)
3544{ 3715{
3545 struct btrfs_block_rsv *block_rsv; 3716 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3717
3718 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3719 block_rsv = trans->block_rsv;
3548 else 3720
3721 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3722 block_rsv = root->block_rsv;
3550 3723
3551 if (!block_rsv) 3724 if (!block_rsv)
@@ -3616,7 +3789,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3789 }
3617 if (num_bytes) { 3790 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3791 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3792 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3793 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3794 spin_unlock(&space_info->lock);
3622 } 3795 }
@@ -3640,9 +3813,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3813{
3641 memset(rsv, 0, sizeof(*rsv)); 3814 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3815 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3816}
3647 3817
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3818struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3833,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3833void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3834 struct btrfs_block_rsv *rsv)
3665{ 3835{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3836 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3837 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3838}
3686 3839
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3840static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3841 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3842 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3843{
3692 int ret; 3844 int ret;
3693 3845
3694 if (num_bytes == 0) 3846 if (num_bytes == 0)
3695 return 0; 3847 return 0;
3696 3848
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3849 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3850 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3851 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3852 return 0;
@@ -3703,55 +3855,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3855 return ret;
3704} 3856}
3705 3857
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3858int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3859 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3860 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3861{
3862 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3863}
3864
3865int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3866 struct btrfs_block_rsv *block_rsv,
3867 u64 num_bytes)
3868{
3869 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3870}
3871
3872int btrfs_block_rsv_check(struct btrfs_root *root,
3873 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3874{
3711 u64 num_bytes = 0; 3875 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3876 int ret = -ENOSPC;
3714 3877
3715 if (!block_rsv) 3878 if (!block_rsv)
3716 return 0; 3879 return 0;
3717 3880
3718 spin_lock(&block_rsv->lock); 3881 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3882 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3883 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3884 ret = 0;
3722 num_bytes = min_reserved; 3885 spin_unlock(&block_rsv->lock);
3723 3886
3724 if (block_rsv->reserved >= num_bytes) { 3887 return ret;
3888}
3889
3890static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3891 struct btrfs_block_rsv *block_rsv,
3892 u64 min_reserved, int flush)
3893{
3894 u64 num_bytes = 0;
3895 int ret = -ENOSPC;
3896
3897 if (!block_rsv)
3898 return 0;
3899
3900 spin_lock(&block_rsv->lock);
3901 num_bytes = min_reserved;
3902 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3903 ret = 0;
3726 } else { 3904 else
3727 num_bytes -= block_rsv->reserved; 3905 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3906 spin_unlock(&block_rsv->lock);
3907
3733 if (!ret) 3908 if (!ret)
3734 return 0; 3909 return 0;
3735 3910
3736 if (block_rsv->refill_used) { 3911 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3912 if (!ret) {
3738 num_bytes, 0); 3913 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3914 return 0;
3752 } 3915 }
3753 3916
3754 return -ENOSPC; 3917 return ret;
3918}
3919
3920int btrfs_block_rsv_refill(struct btrfs_root *root,
3921 struct btrfs_block_rsv *block_rsv,
3922 u64 min_reserved)
3923{
3924 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3925}
3926
3927int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3928 struct btrfs_block_rsv *block_rsv,
3929 u64 min_reserved)
3930{
3931 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3755} 3932}
3756 3933
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3934int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
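
btrfs_block_rsv_check() and the new __btrfs_block_rsv_refill() split the old combined helper in two: check only compares what is reserved against a factor of the target size, while refill tops the rsv up to an absolute minimum, optionally without flushing. A small model of the two predicates, assuming div_factor(x, n) means n tenths of x:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct block_rsv_model {
    uint64_t size;      /* how much this rsv is meant to hold */
    uint64_t reserved;  /* how much it currently holds */
};

static uint64_t div_factor(uint64_t num, int factor) { return num * factor / 10; }

/* btrfs_block_rsv_check(): is at least min_factor/10 of the size still reserved? */
static bool rsv_check(const struct block_rsv_model *rsv, int min_factor)
{
    return rsv->reserved >= div_factor(rsv->size, min_factor);
}

/* __btrfs_block_rsv_refill(): how many bytes would have to be reserved to
 * bring the rsv up to min_reserved (0 means it is already sufficient). */
static uint64_t rsv_refill_needed(const struct block_rsv_model *rsv, uint64_t min_reserved)
{
    return rsv->reserved >= min_reserved ? 0 : min_reserved - rsv->reserved;
}

int main(void)
{
    struct block_rsv_model rsv = { .size = 64 << 20, .reserved = 40 << 20 };
    printf("check(5/10): %d\n", rsv_check(&rsv, 5));          /* 40M >= 32M -> 1 */
    printf("refill to 48M needs %llu bytes\n",
           (unsigned long long)rsv_refill_needed(&rsv, 48 << 20));
    return 0;
}
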
@@ -3783,7 +3960,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3960 u64 num_bytes;
3784 u64 meta_used; 3961 u64 meta_used;
3785 u64 data_used; 3962 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3963 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3964
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3965 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3966 spin_lock(&sinfo->lock);
@@ -3827,12 +4004,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 4004 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 4005 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 4006 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 4007 sinfo->bytes_may_use += num_bytes;
3831 } 4008 }
3832 4009
3833 if (block_rsv->reserved >= block_rsv->size) { 4010 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 4011 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 4012 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4013 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4014 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4015 block_rsv->full = 1;
@@ -3848,16 +4025,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4025
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4026 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4027 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4028
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4029 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4030 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4031 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4032 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4033 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4034 fs_info->delayed_block_rsv.space_info = space_info;
3861 4035
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4036 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4037 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4039,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4039 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4040 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4041
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4042 update_global_block_rsv(fs_info);
3873} 4043}
3874 4044
@@ -3881,37 +4051,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4051 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4052 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4053 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4054 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4055 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4056}
3916 4057
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4058void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4061,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4061 if (!trans->bytes_reserved)
3921 return; 4062 return;
3922 4063
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4064 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4065 trans->bytes_reserved = 0;
3927} 4066}
3928 4067
@@ -3964,33 +4103,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4103 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4104}
3966 4105
4106/**
4107 * drop_outstanding_extent - drop an outstanding extent
4108 * @inode: the inode we're dropping the extent for
4109 *
 4110 * This is called when we are freeing up an outstanding extent, either
4111 * after an error or after an extent is written. This will return the number of
4112 * reserved extents that need to be freed. This must be called with
4113 * BTRFS_I(inode)->lock held.
4114 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4115static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4116{
4117 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4118 unsigned dropped_extents = 0;
3970 4119
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4120 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4121 BTRFS_I(inode)->outstanding_extents--;
3974 4122
4123 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4124 BTRFS_I(inode)->delalloc_meta_reserved) {
4125 drop_inode_space = 1;
4126 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4127 }
4128
3975 /* 4129 /*
3976 * If we have more or the same amount of outstanding extents than we have 4130
3977 * reserved then we need to leave the reserved extents count alone. 4131 * reserved then we need to leave the reserved extents count alone.
3978 */ 4132 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4133 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4134 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4135 return drop_inode_space;
3982 4136
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4137 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4138 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4139 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4140 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4141}
3990 4142
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4143/**
 4144 * calc_csum_metadata_size - return the amount of metadata space that must be
4145 * reserved/free'd for the given bytes.
4146 * @inode: the inode we're manipulating
4147 * @num_bytes: the number of bytes in question
4148 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4149 *
4150 * This adjusts the number of csum_bytes in the inode and then returns the
4151 * correct amount of metadata that must either be reserved or freed. We
4152 * calculate how many checksums we can fit into one leaf and then divide the
 4153 * number of bytes that will need to be checksummed by this value to figure out
4154 * how many checksums will be required. If we are adding bytes then the number
4155 * may go up and we will return the number of additional bytes that must be
4156 * reserved. If it is going down we will return the number of bytes that must
4157 * be freed.
4158 *
4159 * This must be called with BTRFS_I(inode)->lock held.
4160 */
4161static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4162 int reserve)
3992{ 4163{
3993 return num_bytes >>= 3; 4164 struct btrfs_root *root = BTRFS_I(inode)->root;
4165 u64 csum_size;
4166 int num_csums_per_leaf;
4167 int num_csums;
4168 int old_csums;
4169
4170 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4171 BTRFS_I(inode)->csum_bytes == 0)
4172 return 0;
4173
4174 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4175 if (reserve)
4176 BTRFS_I(inode)->csum_bytes += num_bytes;
4177 else
4178 BTRFS_I(inode)->csum_bytes -= num_bytes;
4179 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4180 num_csums_per_leaf = (int)div64_u64(csum_size,
4181 sizeof(struct btrfs_csum_item) +
4182 sizeof(struct btrfs_disk_key));
4183 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4184 num_csums = num_csums + num_csums_per_leaf - 1;
4185 num_csums = num_csums / num_csums_per_leaf;
4186
4187 old_csums = old_csums + num_csums_per_leaf - 1;
4188 old_csums = old_csums / num_csums_per_leaf;
4189
4190 /* No change, no need to reserve more */
4191 if (old_csums == num_csums)
4192 return 0;
4193
4194 if (reserve)
4195 return btrfs_calc_trans_metadata_size(root,
4196 num_csums - old_csums);
4197
4198 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4199}
3995 4200
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4201int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
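
calc_csum_metadata_size() boils down to "how many extra checksum leaves could this byte range need". A toy version of that arithmetic; the sector size, checksums-per-leaf and per-item cost below are made-up example values, not the real on-disk constants:

#include <stdint.h>
#include <stdio.h>

/* Example values only; the kernel derives these from the leaf size and the
 * sizes of struct btrfs_csum_item / struct btrfs_disk_key. */
#define SECTORSIZE        4096ULL
#define CSUMS_PER_LEAF     500ULL
#define BYTES_PER_ITEM  (3 * 16384ULL)   /* stand-in for btrfs_calc_trans_metadata_size(root, 1) */

static uint64_t csum_leaves(uint64_t csum_bytes)
{
    uint64_t csums = csum_bytes / SECTORSIZE;
    return (csums + CSUMS_PER_LEAF - 1) / CSUMS_PER_LEAF;    /* round up */
}

/* How much metadata must be reserved (reserve=1) or may be freed (reserve=0)
 * when csum_bytes changes by num_bytes. */
static uint64_t csum_metadata_delta(uint64_t *csum_bytes, uint64_t num_bytes, int reserve)
{
    uint64_t old_leaves = csum_leaves(*csum_bytes);

    if (reserve)
        *csum_bytes += num_bytes;
    else
        *csum_bytes -= num_bytes;

    uint64_t new_leaves = csum_leaves(*csum_bytes);

    if (new_leaves == old_leaves)
        return 0;                        /* still fits in the same leaves */
    uint64_t diff = new_leaves > old_leaves ? new_leaves - old_leaves
                                            : old_leaves - new_leaves;
    return diff * BYTES_PER_ITEM;
}

int main(void)
{
    uint64_t csum_bytes = 0;
    /* Reserving 8 MiB of data: 2048 csums -> 5 leaves with these example numbers. */
    printf("reserve: %llu bytes of metadata\n",
           (unsigned long long)csum_metadata_delta(&csum_bytes, 8ULL << 20, 1));
    printf("release: %llu bytes of metadata\n",
           (unsigned long long)csum_metadata_delta(&csum_bytes, 8ULL << 20, 0));
    return 0;
}
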
@@ -3998,10 +4203,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3998 struct btrfs_root *root = BTRFS_I(inode)->root; 4203 struct btrfs_root *root = BTRFS_I(inode)->root;
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4205 u64 to_reserve = 0;
4206 u64 csum_bytes;
4001 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int extra_reserve = 0;
4209 int flush = 1;
4002 int ret; 4210 int ret;
4003 4211
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4212 /* Need to be holding the i_mutex here if we aren't free space cache */
4213 if (btrfs_is_free_space_inode(root, inode))
4214 flush = 0;
4215 else
4216 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4217
4218 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4219 schedule_timeout(1);
4006 4220
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4221 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4010,33 +4224,74 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4010 BTRFS_I(inode)->outstanding_extents++; 4224 BTRFS_I(inode)->outstanding_extents++;
4011 4225
4012 if (BTRFS_I(inode)->outstanding_extents > 4226 if (BTRFS_I(inode)->outstanding_extents >
4013 BTRFS_I(inode)->reserved_extents) { 4227 BTRFS_I(inode)->reserved_extents)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4228 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4229 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents;
4017 4230
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4231 /*
4232 * Add an item to reserve for updating the inode when we complete the
4233 * delalloc io.
4234 */
4235 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4236 nr_extents++;
4237 extra_reserve = 1;
4019 } 4238 }
4239
4240 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4241 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4242 csum_bytes = BTRFS_I(inode)->csum_bytes;
4020 spin_unlock(&BTRFS_I(inode)->lock); 4243 spin_unlock(&BTRFS_I(inode)->lock);
4021 4244
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4245 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4246 if (ret) {
4247 u64 to_free = 0;
4025 unsigned dropped; 4248 unsigned dropped;
4249
4250 spin_lock(&BTRFS_I(inode)->lock);
4251 dropped = drop_outstanding_extent(inode);
 4026 /* 4252 * If the inode's csum_bytes is the same as the original
4027 * We don't need the return value since our reservation failed, 4253 * If the inodes csum_bytes is the same as the original
 4028 * we just need to clean up our counter. 4254 * so we can just reduce our inode's csum bytes and carry on.
4255 * so we can just reduce our inodes csum bytes and carry on.
4256 * Otherwise we have to do the normal free thing to account for
4257 * the case that the free side didn't free up its reserve
4258 * because of this outstanding reservation.
4029 */ 4259 */
4030 dropped = drop_outstanding_extent(inode); 4260 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4031 WARN_ON(dropped > 1); 4261 calc_csum_metadata_size(inode, num_bytes, 0);
4262 else
4263 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4264 spin_unlock(&BTRFS_I(inode)->lock);
4265 if (dropped)
4266 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4267
4268 if (to_free)
4269 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4270 return ret;
4033 } 4271 }
4034 4272
4273 spin_lock(&BTRFS_I(inode)->lock);
4274 if (extra_reserve) {
4275 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4276 nr_extents--;
4277 }
4278 BTRFS_I(inode)->reserved_extents += nr_extents;
4279 spin_unlock(&BTRFS_I(inode)->lock);
4280
4035 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4281 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4036 4282
4037 return 0; 4283 return 0;
4038} 4284}
4039 4285
4286/**
4287 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4288 * @inode: the inode to release the reservation for
4289 * @num_bytes: the number of bytes we're releasing
4290 *
4291 * This will release the metadata reservation for an inode. This can be called
4292 * once we complete IO for a given set of bytes to release their metadata
4293 * reservations.
4294 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4295void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4296{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4297 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4299,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4299 unsigned dropped;
4045 4300
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4301 num_bytes = ALIGN(num_bytes, root->sectorsize);
4302 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4303 dropped = drop_outstanding_extent(inode);
4048 4304
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4305 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4306 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4307 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4308 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4309
@@ -4054,6 +4311,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4311 to_free);
4055} 4312}
4056 4313
4314/**
4315 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4316 * @inode: inode we're writing to
4317 * @num_bytes: the number of bytes we want to allocate
4318 *
4319 * This will do the following things
4320 *
4321 * o reserve space in the data space info for num_bytes
4322 * o reserve space in the metadata space info based on number of outstanding
4323 * extents and how much csums will be needed
4324 * o add to the inodes ->delalloc_bytes
4325 * o add it to the fs_info's delalloc inodes list.
4326 *
4327 * This will return 0 for success and -ENOSPC if there is no space left.
4328 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4329int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4330{
4059 int ret; 4331 int ret;
@@ -4071,6 +4343,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4343 return 0;
4072} 4344}
4073 4345
4346/**
4347 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4348 * @inode: inode we're releasing space for
4349 * @num_bytes: the number of bytes we want to free up
4350 *
4351 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4352 * called in the case that we don't need the metadata AND data reservations
 4353 * anymore, for example if there is an error or we insert an inline extent.
4354 *
4355 * This function will release the metadata space that was not used and will
4356 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4357 * list if there are no delalloc bytes left.
4358 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4359void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4360{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4361 btrfs_delalloc_release_metadata(inode, num_bytes);
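
Taken together, btrfs_delalloc_reserve_metadata() and the release paths above keep two per-inode counters in step: outstanding_extents (what delalloc currently needs) and reserved_extents (what has been charged to the rsv), plus the new one-shot delalloc_meta_reserved flag for the inode update item. A rough model of that bookkeeping, ignoring csum bytes and locking:

#include <stdio.h>

struct inode_model {
    unsigned outstanding_extents;     /* extents delalloc currently needs */
    unsigned reserved_extents;        /* extents already charged to the rsv */
    int delalloc_meta_reserved;       /* one item reserved for the inode update */
};

/* Reserve path: returns how many items must be added to the reservation. */
static unsigned reserve_extent(struct inode_model *in)
{
    unsigned nr = 0;

    in->outstanding_extents++;
    if (in->outstanding_extents > in->reserved_extents)
        nr = in->outstanding_extents - in->reserved_extents;
    if (!in->delalloc_meta_reserved) {       /* extra item for the inode itself */
        nr++;
        in->delalloc_meta_reserved = 1;
    }
    in->reserved_extents = in->outstanding_extents;
    return nr;
}

/* Release path: mirrors drop_outstanding_extent(), returns items to free. */
static unsigned release_extent(struct inode_model *in)
{
    unsigned drop_inode_space = 0, dropped = 0;

    in->outstanding_extents--;
    if (in->outstanding_extents == 0 && in->delalloc_meta_reserved) {
        drop_inode_space = 1;
        in->delalloc_meta_reserved = 0;
    }
    if (in->outstanding_extents < in->reserved_extents) {
        dropped = in->reserved_extents - in->outstanding_extents;
        in->reserved_extents -= dropped;
    }
    return dropped + drop_inode_space;
}

int main(void)
{
    struct inode_model in = { 0 };
    printf("reserve: +%u items\n", reserve_extent(&in));   /* 1 extent + inode item */
    printf("reserve: +%u items\n", reserve_extent(&in));
    printf("release: -%u items\n", release_extent(&in));
    printf("release: -%u items\n", release_extent(&in));
    return 0;
}
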
@@ -4090,12 +4375,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4375
4091 /* block accounting for super block */ 4376 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4377 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4378 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4379 if (alloc)
4095 old_val += num_bytes; 4380 old_val += num_bytes;
4096 else 4381 else
4097 old_val -= num_bytes; 4382 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4383 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4384 spin_unlock(&info->delalloc_lock);
4100 4385
4101 while (total) { 4386 while (total) {
@@ -4123,7 +4408,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4408 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4409 spin_lock(&cache->lock);
4125 4410
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4411 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4412 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4413 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4414
@@ -4135,7 +4420,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4420 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4421 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4422 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4423 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4424 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4425 spin_unlock(&cache->lock);
@@ -4187,7 +4471,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4471 if (reserved) {
4188 cache->reserved -= num_bytes; 4472 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4473 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4474 }
4192 spin_unlock(&cache->lock); 4475 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4476 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4498,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4498}
4216 4499
4217/* 4500/*
4218 * update size of reserved extents. this function may return -EAGAIN 4501 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4502 */
4503int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4504 struct btrfs_root *root,
4505 u64 bytenr, u64 num_bytes)
4506{
4507 struct btrfs_block_group_cache *cache;
4508
4509 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4510 BUG_ON(!cache);
4511
4512 /*
4513 * pull in the free space cache (if any) so that our pin
4514 * removes the free space from the cache. We have load_only set
4515 * to one because the slow code to read in the free extents does check
4516 * the pinned extents.
4517 */
4518 cache_block_group(cache, trans, root, 1);
4519
4520 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4521
4522 /* remove us from the free space cache (if we're there at all) */
4523 btrfs_remove_free_space(cache, bytenr, num_bytes);
4524 btrfs_put_block_group(cache);
4525 return 0;
4526}
4527
4528/**
4529 * btrfs_update_reserved_bytes - update the block_group and space info counters
4530 * @cache: The cache we are manipulating
4531 * @num_bytes: The number of bytes in question
4532 * @reserve: One of the reservation enums
4533 *
4534 * This is called by the allocator when it reserves space, or by somebody who is
4535 * freeing space that was never actually used on disk. For example if you
4536 * reserve some space for a new leaf in transaction A and before transaction A
4537 * commits you free that leaf, you call this with reserve set to 0 in order to
4538 * clear the reservation.
4539 *
4540 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4541 * ENOSPC accounting. For data we handle the reservation through clearing the
4542 * delalloc bits in the io_tree. We have to do this since we could end up
4543 * allocating less disk space for the amount of data we have reserved in the
4544 * case of compression.
4545 *
4546 * If this is a reservation and the block group has become read only we cannot
4547 * make the reservation and return -EAGAIN, otherwise this function always
4548 * succeeds.
4220 */ 4549 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4550static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4551 u64 num_bytes, int reserve)
4223{ 4552{
4553 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4554 int ret = 0;
4225 if (sinfo) { 4555 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4556 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4557 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4558 if (cache->ro) {
4248 ret = -EAGAIN; 4559 ret = -EAGAIN;
4249 } else { 4560 } else {
4250 if (reserve) 4561 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4562 space_info->bytes_reserved += num_bytes;
4252 else 4563 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4564 BUG_ON(space_info->bytes_may_use < num_bytes);
4565 space_info->bytes_may_use -= num_bytes;
4566 }
4254 } 4567 }
4255 spin_unlock(&cache->lock); 4568 } else {
4569 if (cache->ro)
4570 space_info->bytes_readonly += num_bytes;
4571 cache->reserved -= num_bytes;
4572 space_info->bytes_reserved -= num_bytes;
4573 space_info->reservation_progress++;
4256 } 4574 }
4575 spin_unlock(&cache->lock);
4576 spin_unlock(&space_info->lock);
4257 return ret; 4577 return ret;
4258} 4578}
4259 4579
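
The kernel-doc above spells out the new reservation accounting: RESERVE_ALLOC moves bytes from bytes_may_use into bytes_reserved, RESERVE_ALLOC_NO_ACCOUNT only bumps the reserved counters, and RESERVE_FREE hands the bytes back, with -EAGAIN when the block group has gone read-only. A standalone userspace sketch of that state machine follows; the struct layout, the missing locking and the toy_* names are illustrative, not the kernel's.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

enum reserve_type { RESERVE_FREE = 0, RESERVE_ALLOC, RESERVE_ALLOC_NO_ACCOUNT };

struct toy_space_info {
	unsigned long long bytes_reserved;
	unsigned long long bytes_may_use;
	unsigned long long bytes_readonly;
};

struct toy_block_group {
	int ro;
	unsigned long long reserved;
	struct toy_space_info *space_info;
};

/* mirrors the accounting in btrfs_update_reserved_bytes(), minus locking */
static int toy_update_reserved_bytes(struct toy_block_group *bg,
				     unsigned long long bytes, int reserve)
{
	struct toy_space_info *si = bg->space_info;

	if (reserve != RESERVE_FREE) {
		if (bg->ro)
			return -EAGAIN;		/* caller moves on to another group */
		bg->reserved += bytes;
		si->bytes_reserved += bytes;
		if (reserve == RESERVE_ALLOC) {	/* ENOSPC-accounted path */
			assert(si->bytes_may_use >= bytes);
			si->bytes_may_use -= bytes;
		}
		return 0;
	}

	/* RESERVE_FREE: the space was reserved but never used on disk */
	if (bg->ro)
		si->bytes_readonly += bytes;
	bg->reserved -= bytes;
	si->bytes_reserved -= bytes;
	return 0;
}

int main(void)
{
	struct toy_space_info si = { .bytes_may_use = 4096 };
	struct toy_block_group bg = { .space_info = &si };

	toy_update_reserved_bytes(&bg, 4096, RESERVE_ALLOC);
	printf("after alloc: reserved=%llu may_use=%llu\n",
	       si.bytes_reserved, si.bytes_may_use);
	toy_update_reserved_bytes(&bg, 4096, RESERVE_FREE);
	printf("after free:  reserved=%llu\n", si.bytes_reserved);
	return 0;
}
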
@@ -4319,13 +4639,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4639 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4640 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4641 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4642 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4643 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4644 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4645 spin_unlock(&cache->space_info->lock);
4331 } 4646 }
@@ -4340,11 +4655,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4655{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4656 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4657 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4658 u64 start;
4346 u64 end; 4659 u64 end;
4347 int idx;
4348 int ret; 4660 int ret;
4349 4661
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4662 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4679,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4679 cond_resched();
4368 } 4680 }
4369 4681
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4682 return 0;
4395} 4683}
4396 4684
@@ -4668,7 +4956,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4956 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4957 u64 parent, int last_ref)
4670{ 4958{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4959 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4960 int ret;
4674 4961
@@ -4683,64 +4970,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4970 if (!last_ref)
4684 return; 4971 return;
4685 4972
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4973 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4974
4691 if (btrfs_header_generation(buf) == trans->transid) { 4975 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4976 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4977 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4978 if (!ret)
4695 goto pin; 4979 goto out;
4696 } 4980 }
4697 4981
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4982 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4983 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4984 goto out;
4701 } 4985 }
4702 4986
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4987 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4988
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4989 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4990 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4991 }
4745out: 4992out:
4746 /* 4993 /*
@@ -4876,17 +5123,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4876 struct btrfs_root *root = orig_root->fs_info->extent_root; 5123 struct btrfs_root *root = orig_root->fs_info->extent_root;
4877 struct btrfs_free_cluster *last_ptr = NULL; 5124 struct btrfs_free_cluster *last_ptr = NULL;
4878 struct btrfs_block_group_cache *block_group = NULL; 5125 struct btrfs_block_group_cache *block_group = NULL;
5126 struct btrfs_block_group_cache *used_block_group;
4879 int empty_cluster = 2 * 1024 * 1024; 5127 int empty_cluster = 2 * 1024 * 1024;
4880 int allowed_chunk_alloc = 0; 5128 int allowed_chunk_alloc = 0;
4881 int done_chunk_alloc = 0; 5129 int done_chunk_alloc = 0;
4882 struct btrfs_space_info *space_info; 5130 struct btrfs_space_info *space_info;
4883 int last_ptr_loop = 0;
4884 int loop = 0; 5131 int loop = 0;
4885 int index = 0; 5132 int index = 0;
5133 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5134 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5135 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5136 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5137 bool failed_alloc = false;
4889 bool use_cluster = true; 5138 bool use_cluster = true;
5139 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5140 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5141 u64 ideal_cache_offset = 0;
4892 5142
@@ -4939,6 +5189,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4939ideal_cache: 5189ideal_cache:
4940 block_group = btrfs_lookup_block_group(root->fs_info, 5190 block_group = btrfs_lookup_block_group(root->fs_info,
4941 search_start); 5191 search_start);
5192 used_block_group = block_group;
4942 /* 5193 /*
4943 * we don't want to use the block group if it doesn't match our 5194 * we don't want to use the block group if it doesn't match our
4944 * allocation bits, or if its not cached. 5195 * allocation bits, or if its not cached.
@@ -4969,12 +5220,14 @@ ideal_cache:
4969 } 5220 }
4970 } 5221 }
4971search: 5222search:
5223 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5224 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5225 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5226 list) {
4975 u64 offset; 5227 u64 offset;
4976 int cached; 5228 int cached;
4977 5229
5230 used_block_group = block_group;
4978 btrfs_get_block_group(block_group); 5231 btrfs_get_block_group(block_group);
4979 search_start = block_group->key.objectid; 5232 search_start = block_group->key.objectid;
4980 5233
@@ -4998,13 +5251,15 @@ search:
4998 } 5251 }
4999 5252
5000have_block_group: 5253have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5254 cached = block_group_cache_done(block_group);
5255 if (unlikely(!cached)) {
5002 u64 free_percent; 5256 u64 free_percent;
5003 5257
5258 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5259 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5260 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5261 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5262 goto alloc;
5008 5263
5009 free_percent = btrfs_block_group_used(&block_group->item); 5264 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5265 free_percent *= 100;
@@ -5026,7 +5281,6 @@ have_block_group:
5026 orig_root, 0); 5281 orig_root, 0);
5027 BUG_ON(ret); 5282 BUG_ON(ret);
5028 } 5283 }
5029 found_uncached_bg = true;
5030 5284
5031 /* 5285 /*
5032 * If loop is set for cached only, try the next block 5286 * If loop is set for cached only, try the next block
@@ -5036,94 +5290,80 @@ have_block_group:
5036 goto loop; 5290 goto loop;
5037 } 5291 }
5038 5292
5039 cached = block_group_cache_done(block_group); 5293alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5294 if (unlikely(block_group->ro))
5044 goto loop; 5295 goto loop;
5045 5296
5046 spin_lock(&block_group->free_space_ctl->tree_lock); 5297 spin_lock(&block_group->free_space_ctl->tree_lock);
5047 if (cached && 5298 if (cached &&
5048 block_group->free_space_ctl->free_space < 5299 block_group->free_space_ctl->free_space <
5049 num_bytes + empty_size) { 5300 num_bytes + empty_cluster + empty_size) {
5050 spin_unlock(&block_group->free_space_ctl->tree_lock); 5301 spin_unlock(&block_group->free_space_ctl->tree_lock);
5051 goto loop; 5302 goto loop;
5052 } 5303 }
5053 spin_unlock(&block_group->free_space_ctl->tree_lock); 5304 spin_unlock(&block_group->free_space_ctl->tree_lock);
5054 5305
5055 /* 5306 /*
5056 * Ok we want to try and use the cluster allocator, so lets look 5307 * Ok we want to try and use the cluster allocator, so
5057 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5308 * lets look there
5058 * have tried the cluster allocator plenty of times at this
5059 * point and not have found anything, so we are likely way too
5060 * fragmented for the clustering stuff to find anything, so lets
5061 * just skip it and let the allocator find whatever block it can
5062 * find
5063 */ 5309 */
5064 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5310 if (last_ptr) {
5065 /* 5311 /*
5066 * the refill lock keeps out other 5312 * the refill lock keeps out other
5067 * people trying to start a new cluster 5313 * people trying to start a new cluster
5068 */ 5314 */
5069 spin_lock(&last_ptr->refill_lock); 5315 spin_lock(&last_ptr->refill_lock);
5070 if (last_ptr->block_group && 5316 used_block_group = last_ptr->block_group;
5071 (last_ptr->block_group->ro || 5317 if (used_block_group != block_group &&
5072 !block_group_bits(last_ptr->block_group, data))) { 5318 (!used_block_group ||
5073 offset = 0; 5319 used_block_group->ro ||
5320 !block_group_bits(used_block_group, data))) {
5321 used_block_group = block_group;
5074 goto refill_cluster; 5322 goto refill_cluster;
5075 } 5323 }
5076 5324
5077 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5325 if (used_block_group != block_group)
5078 num_bytes, search_start); 5326 btrfs_get_block_group(used_block_group);
5327
5328 offset = btrfs_alloc_from_cluster(used_block_group,
5329 last_ptr, num_bytes, used_block_group->key.objectid);
5079 if (offset) { 5330 if (offset) {
5080 /* we have a block, we're done */ 5331 /* we have a block, we're done */
5081 spin_unlock(&last_ptr->refill_lock); 5332 spin_unlock(&last_ptr->refill_lock);
5082 goto checks; 5333 goto checks;
5083 } 5334 }
5084 5335
5085 spin_lock(&last_ptr->lock); 5336 WARN_ON(last_ptr->block_group != used_block_group);
5086 /* 5337 if (used_block_group != block_group) {
5087 * whoops, this cluster doesn't actually point to 5338 btrfs_put_block_group(used_block_group);
5088 * this block group. Get a ref on the block 5339 used_block_group = block_group;
5089 * group is does point to and try again
5090 */
5091 if (!last_ptr_loop && last_ptr->block_group &&
5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5095
5096 btrfs_put_block_group(block_group);
5097 block_group = last_ptr->block_group;
5098 btrfs_get_block_group(block_group);
5099 spin_unlock(&last_ptr->lock);
5100 spin_unlock(&last_ptr->refill_lock);
5101
5102 last_ptr_loop = 1;
5103 search_start = block_group->key.objectid;
5104 /*
5105 * we know this block group is properly
5106 * in the list because
5107 * btrfs_remove_block_group, drops the
5108 * cluster before it removes the block
5109 * group from the list
5110 */
5111 goto have_block_group;
5112 } 5340 }
5113 spin_unlock(&last_ptr->lock);
5114refill_cluster: 5341refill_cluster:
5342 BUG_ON(used_block_group != block_group);
5343 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
 5344	 * set up a new cluster, so let's just skip it
5345 * and let the allocator find whatever block
5346 * it can find. If we reach this point, we
5347 * will have tried the cluster allocator
5348 * plenty of times and not have found
5349 * anything, so we are likely way too
5350 * fragmented for the clustering stuff to find
5351 * anything. */
5352 if (loop >= LOOP_NO_EMPTY_SIZE) {
5353 spin_unlock(&last_ptr->refill_lock);
5354 goto unclustered_alloc;
5355 }
5356
5115 /* 5357 /*
5116 * this cluster didn't work out, free it and 5358 * this cluster didn't work out, free it and
5117 * start over 5359 * start over
5118 */ 5360 */
5119 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5361 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5120 5362
5121 last_ptr_loop = 0;
5122
5123 /* allocate a cluster in this block group */ 5363 /* allocate a cluster in this block group */
5124 ret = btrfs_find_space_cluster(trans, root, 5364 ret = btrfs_find_space_cluster(trans, root,
5125 block_group, last_ptr, 5365 block_group, last_ptr,
5126 offset, num_bytes, 5366 search_start, num_bytes,
5127 empty_cluster + empty_size); 5367 empty_cluster + empty_size);
5128 if (ret == 0) { 5368 if (ret == 0) {
5129 /* 5369 /*
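
The cluster-allocation rework in the hunk above introduces used_block_group: when last_ptr's cluster belongs to a different block group than the one the loop is visiting, that group is pinned with btrfs_get_block_group() for the duration of the attempt and released afterwards. A minimal userspace sketch of that reference dance, with purely illustrative types and names:

#include <stdio.h>

struct toy_group {
	int refs;
	const char *name;
};

static void toy_get_group(struct toy_group *g) { g->refs++; }
static void toy_put_group(struct toy_group *g) { g->refs--; }

/* allocate from the cluster's group when it differs from the iterated one */
static void toy_alloc_attempt(struct toy_group *block_group,
			      struct toy_group *cluster_group)
{
	struct toy_group *used_block_group = block_group;

	if (cluster_group && cluster_group != block_group) {
		used_block_group = cluster_group;
		toy_get_group(used_block_group);	/* extra ref while we use it */
	}

	printf("allocating from %s\n", used_block_group->name);

	if (used_block_group != block_group)
		toy_put_group(used_block_group);	/* drop the extra ref */
}

int main(void)
{
	struct toy_group a = { 1, "iterated group" };
	struct toy_group b = { 1, "cluster's group" };

	toy_alloc_attempt(&a, &b);
	printf("refs after attempt: a=%d b=%d\n", a.refs, b.refs);
	return 0;
}
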
@@ -5159,6 +5399,7 @@ refill_cluster:
5159 goto loop; 5399 goto loop;
5160 } 5400 }
5161 5401
5402unclustered_alloc:
5162 offset = btrfs_find_space_for_alloc(block_group, search_start, 5403 offset = btrfs_find_space_for_alloc(block_group, search_start,
5163 num_bytes, empty_size); 5404 num_bytes, empty_size);
5164 /* 5405 /*
@@ -5177,20 +5418,22 @@ refill_cluster:
5177 failed_alloc = true; 5418 failed_alloc = true;
5178 goto have_block_group; 5419 goto have_block_group;
5179 } else if (!offset) { 5420 } else if (!offset) {
5421 if (!cached)
5422 have_caching_bg = true;
5180 goto loop; 5423 goto loop;
5181 } 5424 }
5182checks: 5425checks:
5183 search_start = stripe_align(root, offset); 5426 search_start = stripe_align(root, offset);
5184 /* move on to the next group */ 5427 /* move on to the next group */
5185 if (search_start + num_bytes >= search_end) { 5428 if (search_start + num_bytes >= search_end) {
5186 btrfs_add_free_space(block_group, offset, num_bytes); 5429 btrfs_add_free_space(used_block_group, offset, num_bytes);
5187 goto loop; 5430 goto loop;
5188 } 5431 }
5189 5432
5190 /* move on to the next group */ 5433 /* move on to the next group */
5191 if (search_start + num_bytes > 5434 if (search_start + num_bytes >
5192 block_group->key.objectid + block_group->key.offset) { 5435 used_block_group->key.objectid + used_block_group->key.offset) {
5193 btrfs_add_free_space(block_group, offset, num_bytes); 5436 btrfs_add_free_space(used_block_group, offset, num_bytes);
5194 goto loop; 5437 goto loop;
5195 } 5438 }
5196 5439
@@ -5198,14 +5441,14 @@ checks:
5198 ins->offset = num_bytes; 5441 ins->offset = num_bytes;
5199 5442
5200 if (offset < search_start) 5443 if (offset < search_start)
5201 btrfs_add_free_space(block_group, offset, 5444 btrfs_add_free_space(used_block_group, offset,
5202 search_start - offset); 5445 search_start - offset);
5203 BUG_ON(offset > search_start); 5446 BUG_ON(offset > search_start);
5204 5447
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5448 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5449 alloc_type);
5207 if (ret == -EAGAIN) { 5450 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5451 btrfs_add_free_space(used_block_group, offset, num_bytes);
5209 goto loop; 5452 goto loop;
5210 } 5453 }
5211 5454
@@ -5214,19 +5457,26 @@ checks:
5214 ins->offset = num_bytes; 5457 ins->offset = num_bytes;
5215 5458
5216 if (offset < search_start) 5459 if (offset < search_start)
5217 btrfs_add_free_space(block_group, offset, 5460 btrfs_add_free_space(used_block_group, offset,
5218 search_start - offset); 5461 search_start - offset);
5219 BUG_ON(offset > search_start); 5462 BUG_ON(offset > search_start);
5463 if (used_block_group != block_group)
5464 btrfs_put_block_group(used_block_group);
5220 btrfs_put_block_group(block_group); 5465 btrfs_put_block_group(block_group);
5221 break; 5466 break;
5222loop: 5467loop:
5223 failed_cluster_refill = false; 5468 failed_cluster_refill = false;
5224 failed_alloc = false; 5469 failed_alloc = false;
5225 BUG_ON(index != get_block_group_index(block_group)); 5470 BUG_ON(index != get_block_group_index(block_group));
5471 if (used_block_group != block_group)
5472 btrfs_put_block_group(used_block_group);
5226 btrfs_put_block_group(block_group); 5473 btrfs_put_block_group(block_group);
5227 } 5474 }
5228 up_read(&space_info->groups_sem); 5475 up_read(&space_info->groups_sem);
5229 5476
5477 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5478 goto search;
5479
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5480 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5481 goto search;
5232 5482
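
Another change threaded through this function is have_caching_bg: if a full pass over the block groups finds nothing but at least one group was still caching its free space, the search is retried (the new "goto search" above) before escalating to the next RAID index or loop level. A toy userspace model of that retry, with made-up types:

#include <stdbool.h>
#include <stdio.h>

struct toy_group {
	bool cached;	/* free-space cache fully loaded? */
	bool has_space;
};

static long toy_search_pass(struct toy_group *g, int n, bool *have_caching_bg)
{
	*have_caching_bg = false;
	for (int i = 0; i < n; i++) {
		if (!g[i].cached) {
			*have_caching_bg = true;
			g[i].cached = true;	/* pretend caching completes */
			continue;
		}
		if (g[i].has_space)
			return i;		/* found an allocation */
	}
	return -1;
}

int main(void)
{
	struct toy_group groups[] = { { false, true }, { true, false } };
	bool have_caching_bg;
	long idx;

	do {
		idx = toy_search_pass(groups, 2, &have_caching_bg);
	} while (idx < 0 && have_caching_bg);	/* mirrors "goto search" */

	printf("allocated from group %ld\n", idx);
	return 0;
}
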
@@ -5325,7 +5575,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5575 int index = 0;
5326 5576
5327 spin_lock(&info->lock); 5577 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5578 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5579 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5580 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5581 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5582 info->bytes_readonly),
@@ -5411,7 +5662,8 @@ again:
5411 return ret; 5662 return ret;
5412} 5663}
5413 5664
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5665static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5666 u64 start, u64 len, int pin)
5415{ 5667{
5416 struct btrfs_block_group_cache *cache; 5668 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5669 int ret = 0;
@@ -5426,8 +5678,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5678 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5679 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5680
5429 btrfs_add_free_space(cache, start, len); 5681 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5682 pin_down_extent(root, cache, start, len, 1);
5683 else {
5684 btrfs_add_free_space(cache, start, len);
5685 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5686 }
5431 btrfs_put_block_group(cache); 5687 btrfs_put_block_group(cache);
5432 5688
5433 trace_btrfs_reserved_extent_free(root, start, len); 5689 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5691,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5691 return ret;
5436} 5692}
5437 5693
5694int btrfs_free_reserved_extent(struct btrfs_root *root,
5695 u64 start, u64 len)
5696{
5697 return __btrfs_free_reserved_extent(root, start, len, 0);
5698}
5699
5700int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5701 u64 start, u64 len)
5702{
5703 return __btrfs_free_reserved_extent(root, start, len, 1);
5704}
5705
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5706static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5707 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5708 u64 parent, u64 root_objectid,
@@ -5630,7 +5898,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5898 put_caching_control(caching_ctl);
5631 } 5899 }
5632 5900
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5901 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5902 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5903 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5904 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5905 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5956,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5956 block_rsv = get_block_rsv(trans, root);
5688 5957
5689 if (block_rsv->size == 0) { 5958 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5959 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5960 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5961 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5962 * the global reserve.
@@ -5708,13 +5976,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5976 if (!ret)
5709 return block_rsv; 5977 return block_rsv;
5710 if (ret) { 5978 if (ret) {
5711 WARN_ON(1); 5979 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5980 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5981 /*DEFAULT_RATELIMIT_BURST*/ 2);
5982 if (__ratelimit(&_rs)) {
5983 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5984 WARN_ON(1);
5985 }
5986 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5987 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5988 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5989 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5990 ret = block_rsv_use_bytes(global_rsv, blocksize);
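
use_block_rsv() now warns through a ratelimit instead of on every failed reservation: DEFINE_RATELIMIT_STATE() allows a small burst of messages per interval and __ratelimit() suppresses the rest. A rough userspace stand-in for that behaviour (the real helpers live in linux/ratelimit.h; the interval, burst and names below are illustrative):

#include <stdio.h>
#include <time.h>

struct toy_ratelimit {
	time_t window_start;
	int interval;	/* seconds per window */
	int burst;	/* messages allowed per window */
	int printed;
};

static int toy_ratelimit_ok(struct toy_ratelimit *rs)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= rs->interval) {
		rs->window_start = now;
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return 0;	/* suppressed */
	rs->printed++;
	return 1;
}

int main(void)
{
	struct toy_ratelimit rs = { .interval = 5, .burst = 2 };

	for (int i = 0; i < 10; i++)
		if (toy_ratelimit_ok(&rs))
			printf("btrfs: block rsv returned %d\n", -28);
	return 0;
}
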
@@ -6592,12 +6862,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6862 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6863
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6864 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6865 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6866 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6867 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6868 cache->ro = 1;
6602 ret = 0; 6869 ret = 0;
6603 } 6870 }
@@ -6964,7 +7231,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7231 struct btrfs_space_info,
6965 list); 7232 list);
6966 if (space_info->bytes_pinned > 0 || 7233 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7234 space_info->bytes_reserved > 0 ||
7235 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7236 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7237 dump_space_info(space_info, 0, 0);
6970 } 7238 }
@@ -7006,14 +7274,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7274 return -ENOMEM;
7007 path->reada = 1; 7275 path->reada = 1;
7008 7276
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7277 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7278 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7279 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7280 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7281 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7282 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7283
7018 while (1) { 7284 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7285 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7518,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7518 goto out;
7253 } 7519 }
7254 7520
7255 inode = lookup_free_space_inode(root, block_group, path); 7521 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7522 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7523 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7524 BUG_ON(ret);
@@ -7268,7 +7534,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7534 spin_unlock(&block_group->lock);
7269 } 7535 }
7270 /* One for our lookup ref */ 7536 /* One for our lookup ref */
7271 iput(inode); 7537 btrfs_add_delayed_iput(inode);
7272 } 7538 }
7273 7539
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7540 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7605,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7605 int mixed = 0;
7340 int ret; 7606 int ret;
7341 7607
7342 disk_super = &fs_info->super_copy; 7608 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7609 if (!btrfs_super_root(disk_super))
7344 return 1; 7610 return 1;
7345 7611
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,202 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
 899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) {
939 err = -ENOMEM;
940 goto out;
941 }
942 err = insert_state(tree, prealloc, start, end, &bits);
943 prealloc = NULL;
944 BUG_ON(err == -EEXIST);
945 goto out;
946 }
947 state = rb_entry(node, struct extent_state, rb_node);
948hit_next:
949 last_start = state->start;
950 last_end = state->end;
951
952 /*
953 * | ---- desired range ---- |
954 * | state |
955 *
 956	 * Just set the bits on what we found and keep going
957 */
958 if (state->start == start && state->end <= end) {
959 struct rb_node *next_node;
960
961 set_state_bits(tree, state, &bits);
962 clear_state_bit(tree, state, &clear_bits, 0);
963
964 merge_state(tree, state);
965 if (last_end == (u64)-1)
966 goto out;
967
968 start = last_end + 1;
969 next_node = rb_next(&state->rb_node);
970 if (next_node && start < end && prealloc && !need_resched()) {
971 state = rb_entry(next_node, struct extent_state,
972 rb_node);
973 if (state->start == start)
974 goto hit_next;
975 }
976 goto search_again;
977 }
978
979 /*
980 * | ---- desired range ---- |
981 * | state |
982 * or
983 * | ------------- state -------------- |
984 *
985 * We need to split the extent we found, and may flip bits on
986 * second half.
987 *
988 * If the extent we found extends past our
989 * range, we just split and search again. It'll get split
990 * again the next time though.
991 *
992 * If the extent we found is inside our range, we set the
993 * desired bit on it.
994 */
995 if (state->start < start) {
996 prealloc = alloc_extent_state_atomic(prealloc);
997 if (!prealloc) {
998 err = -ENOMEM;
999 goto out;
1000 }
1001 err = split_state(tree, state, prealloc, start);
1002 BUG_ON(err == -EEXIST);
1003 prealloc = NULL;
1004 if (err)
1005 goto out;
1006 if (state->end <= end) {
1007 set_state_bits(tree, state, &bits);
1008 clear_state_bit(tree, state, &clear_bits, 0);
1009 merge_state(tree, state);
1010 if (last_end == (u64)-1)
1011 goto out;
1012 start = last_end + 1;
1013 }
1014 goto search_again;
1015 }
1016 /*
1017 * | ---- desired range ---- |
1018 * | state | or | state |
1019 *
1020 * There's a hole, we need to insert something in it and
1021 * ignore the extent we found.
1022 */
1023 if (state->start > start) {
1024 u64 this_end;
1025 if (end < last_start)
1026 this_end = end;
1027 else
1028 this_end = last_start - 1;
1029
1030 prealloc = alloc_extent_state_atomic(prealloc);
1031 if (!prealloc) {
1032 err = -ENOMEM;
1033 goto out;
1034 }
1035
1036 /*
1037 * Avoid to free 'prealloc' if it can be merged with
1038 * the later extent.
1039 */
1040 err = insert_state(tree, prealloc, start, this_end,
1041 &bits);
1042 BUG_ON(err == -EEXIST);
1043 if (err) {
1044 free_extent_state(prealloc);
1045 prealloc = NULL;
1046 goto out;
1047 }
1048 prealloc = NULL;
1049 start = this_end + 1;
1050 goto search_again;
1051 }
1052 /*
1053 * | ---- desired range ---- |
1054 * | state |
1055 * We need to split the extent, and set the bit
1056 * on the first half
1057 */
1058 if (state->start <= end && state->end > end) {
1059 prealloc = alloc_extent_state_atomic(prealloc);
1060 if (!prealloc) {
1061 err = -ENOMEM;
1062 goto out;
1063 }
1064
1065 err = split_state(tree, state, prealloc, end + 1);
1066 BUG_ON(err == -EEXIST);
1067
1068 set_state_bits(tree, prealloc, &bits);
1069 clear_state_bit(tree, prealloc, &clear_bits, 0);
1070
1071 merge_state(tree, prealloc);
1072 prealloc = NULL;
1073 goto out;
1074 }
1075
1076 goto search_again;
1077
1078out:
1079 spin_unlock(&tree->lock);
1080 if (prealloc)
1081 free_extent_state(prealloc);
1082
1083 return err;
1084
1085search_again:
1086 if (start > end)
1087 goto out;
1088 spin_unlock(&tree->lock);
1089 if (mask & __GFP_WAIT)
1090 cond_resched();
1091 goto again;
1092}
1093
897/* wrappers around set/clear extent bit */ 1094/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1095int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1096 gfp_t mask)
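
The new convert_extent_bit() documented above sets one group of bits and clears another over [start, end] in a single pass, splitting and merging extent_state records at the range boundaries. A flat-array userspace model of the visible effect (the real code walks an rbtree of ranges; the flag values here are invented):

#include <stdio.h>

#define TOY_DELALLOC	(1u << 0)
#define TOY_DIRTY	(1u << 1)

/* every unit in [start, end] gains `bits` and loses `clear_bits` */
static void toy_convert_range(unsigned *flags, unsigned long start,
			      unsigned long end, unsigned bits,
			      unsigned clear_bits)
{
	for (unsigned long i = start; i <= end; i++) {
		flags[i] |= bits;
		flags[i] &= ~clear_bits;
	}
}

int main(void)
{
	unsigned flags[8] = { [2] = TOY_DELALLOC, [3] = TOY_DELALLOC };

	/* e.g. flip a delalloc range over to dirty in one pass */
	toy_convert_range(flags, 2, 3, TOY_DIRTY, TOY_DELALLOC);

	for (int i = 0; i < 8; i++)
		printf("%d:%u ", i, flags[i]);
	printf("\n");
	return 0;
}
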
@@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1116 struct extent_state **cached_state, gfp_t mask)
920{ 1117{
921 return set_extent_bit(tree, start, end, 1118 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1119 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1120 0, NULL, cached_state, mask);
924} 1121}
925 1122
@@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1796 return 0;
1600} 1797}
1601 1798
1799/*
1800 * When IO fails, either with EIO or csum verification fails, we
1801 * try other mirrors that might have a good copy of the data. This
1802 * io_failure_record is used to record state as we go through all the
1803 * mirrors. If another mirror has good data, the page is set up to date
1804 * and things continue. If a good mirror can't be found, the original
1805 * bio end_io callback is called to indicate things have failed.
1806 */
1807struct io_failure_record {
1808 struct page *page;
1809 u64 start;
1810 u64 len;
1811 u64 logical;
1812 unsigned long bio_flags;
1813 int this_mirror;
1814 int failed_mirror;
1815 int in_validation;
1816};
1817
1818static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1819 int did_repair)
1820{
1821 int ret;
1822 int err = 0;
1823 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1824
1825 set_state_private(failure_tree, rec->start, 0);
1826 ret = clear_extent_bits(failure_tree, rec->start,
1827 rec->start + rec->len - 1,
1828 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1829 if (ret)
1830 err = ret;
1831
1832 if (did_repair) {
1833 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1834 rec->start + rec->len - 1,
1835 EXTENT_DAMAGED, GFP_NOFS);
1836 if (ret && !err)
1837 err = ret;
1838 }
1839
1840 kfree(rec);
1841 return err;
1842}
1843
1844static void repair_io_failure_callback(struct bio *bio, int err)
1845{
1846 complete(bio->bi_private);
1847}
1848
1849/*
1850 * this bypasses the standard btrfs submit functions deliberately, as
1851 * the standard behavior is to write all copies in a raid setup. here we only
1852 * want to write the one bad copy. so we do the mapping for ourselves and issue
1853 * submit_bio directly.
 1854 * to avoid any synchronization issues, wait for the data after writing, which
1855 * actually prevents the read that triggered the error from finishing.
1856 * currently, there can be no more than two copies of every data bit. thus,
1857 * exactly one rewrite is required.
1858 */
1859int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1860 u64 length, u64 logical, struct page *page,
1861 int mirror_num)
1862{
1863 struct bio *bio;
1864 struct btrfs_device *dev;
1865 DECLARE_COMPLETION_ONSTACK(compl);
1866 u64 map_length = 0;
1867 u64 sector;
1868 struct btrfs_bio *bbio = NULL;
1869 int ret;
1870
1871 BUG_ON(!mirror_num);
1872
1873 bio = bio_alloc(GFP_NOFS, 1);
1874 if (!bio)
1875 return -EIO;
1876 bio->bi_private = &compl;
1877 bio->bi_end_io = repair_io_failure_callback;
1878 bio->bi_size = 0;
1879 map_length = length;
1880
1881 ret = btrfs_map_block(map_tree, WRITE, logical,
1882 &map_length, &bbio, mirror_num);
1883 if (ret) {
1884 bio_put(bio);
1885 return -EIO;
1886 }
1887 BUG_ON(mirror_num != bbio->mirror_num);
1888 sector = bbio->stripes[mirror_num-1].physical >> 9;
1889 bio->bi_sector = sector;
1890 dev = bbio->stripes[mirror_num-1].dev;
1891 kfree(bbio);
1892 if (!dev || !dev->bdev || !dev->writeable) {
1893 bio_put(bio);
1894 return -EIO;
1895 }
1896 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl);
1900
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1902 /* try to remap that extent elsewhere? */
1903 bio_put(bio);
1904 return -EIO;
1905 }
1906
1907 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1908 "sector %llu)\n", page->mapping->host->i_ino, start,
1909 dev->name, sector);
1910
1911 bio_put(bio);
1912 return 0;
1913}
1914
1915/*
1916 * each time an IO finishes, we do a fast check in the IO failure tree
1917 * to see if we need to process or clean up an io_failure_record
1918 */
1919static int clean_io_failure(u64 start, struct page *page)
1920{
1921 u64 private;
1922 u64 private_failure;
1923 struct io_failure_record *failrec;
1924 struct btrfs_mapping_tree *map_tree;
1925 struct extent_state *state;
1926 int num_copies;
1927 int did_repair = 0;
1928 int ret;
1929 struct inode *inode = page->mapping->host;
1930
1931 private = 0;
1932 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1933 (u64)-1, 1, EXTENT_DIRTY, 0);
1934 if (!ret)
1935 return 0;
1936
1937 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1938 &private_failure);
1939 if (ret)
1940 return 0;
1941
1942 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1943 BUG_ON(!failrec->this_mirror);
1944
1945 if (failrec->in_validation) {
1946 /* there was no real error, just free the record */
1947 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1948 failrec->start);
1949 did_repair = 1;
1950 goto out;
1951 }
1952
1953 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1954 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1955 failrec->start,
1956 EXTENT_LOCKED);
1957 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1958
1959 if (state && state->start == failrec->start) {
1960 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1961 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1962 failrec->len);
1963 if (num_copies > 1) {
1964 ret = repair_io_failure(map_tree, start, failrec->len,
1965 failrec->logical, page,
1966 failrec->failed_mirror);
1967 did_repair = !ret;
1968 }
1969 }
1970
1971out:
1972 if (!ret)
1973 ret = free_io_failure(inode, failrec, did_repair);
1974
1975 return ret;
1976}
1977
1978/*
1979 * this is a generic handler for readpage errors (default
1980 * readpage_io_failed_hook). if other copies exist, read those and write back
 1981 * good data to the failed position. it does not attempt to remap the
1982 * failed extent elsewhere, hoping the device will be smart enough to do this as
1983 * needed
1984 */
1985
1986static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1987 u64 start, u64 end, int failed_mirror,
1988 struct extent_state *state)
1989{
1990 struct io_failure_record *failrec = NULL;
1991 u64 private;
1992 struct extent_map *em;
1993 struct inode *inode = page->mapping->host;
1994 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1995 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1996 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1997 struct bio *bio;
1998 int num_copies;
1999 int ret;
2000 int read_mode;
2001 u64 logical;
2002
2003 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2004
2005 ret = get_state_private(failure_tree, start, &private);
2006 if (ret) {
2007 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2008 if (!failrec)
2009 return -ENOMEM;
2010 failrec->start = start;
2011 failrec->len = end - start + 1;
2012 failrec->this_mirror = 0;
2013 failrec->bio_flags = 0;
2014 failrec->in_validation = 0;
2015
2016 read_lock(&em_tree->lock);
2017 em = lookup_extent_mapping(em_tree, start, failrec->len);
2018 if (!em) {
2019 read_unlock(&em_tree->lock);
2020 kfree(failrec);
2021 return -EIO;
2022 }
2023
2024 if (em->start > start || em->start + em->len < start) {
2025 free_extent_map(em);
2026 em = NULL;
2027 }
2028 read_unlock(&em_tree->lock);
2029
2030 if (!em || IS_ERR(em)) {
2031 kfree(failrec);
2032 return -EIO;
2033 }
2034 logical = start - em->start;
2035 logical = em->block_start + logical;
2036 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2037 logical = em->block_start;
2038 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2039 extent_set_compress_type(&failrec->bio_flags,
2040 em->compress_type);
2041 }
2042 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2043 "len=%llu\n", logical, start, failrec->len);
2044 failrec->logical = logical;
2045 free_extent_map(em);
2046
2047 /* set the bits in the private failure tree */
2048 ret = set_extent_bits(failure_tree, start, end,
2049 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2050 if (ret >= 0)
2051 ret = set_state_private(failure_tree, start,
2052 (u64)(unsigned long)failrec);
2053 /* set the bits in the inode's tree */
2054 if (ret >= 0)
2055 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2056 GFP_NOFS);
2057 if (ret < 0) {
2058 kfree(failrec);
2059 return ret;
2060 }
2061 } else {
2062 failrec = (struct io_failure_record *)(unsigned long)private;
2063 pr_debug("bio_readpage_error: (found) logical=%llu, "
2064 "start=%llu, len=%llu, validation=%d\n",
2065 failrec->logical, failrec->start, failrec->len,
2066 failrec->in_validation);
2067 /*
2068 * when data can be on disk more than twice, add to failrec here
2069 * (e.g. with a list for failed_mirror) to make
2070 * clean_io_failure() clean all those errors at once.
2071 */
2072 }
2073 num_copies = btrfs_num_copies(
2074 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2075 failrec->logical, failrec->len);
2076 if (num_copies == 1) {
2077 /*
2078 * we only have a single copy of the data, so don't bother with
2079 * all the retry and error correction code that follows. no
2080 * matter what the error is, it is very likely to persist.
2081 */
2082 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2083 "state=%p, num_copies=%d, next_mirror %d, "
2084 "failed_mirror %d\n", state, num_copies,
2085 failrec->this_mirror, failed_mirror);
2086 free_io_failure(inode, failrec, 0);
2087 return -EIO;
2088 }
2089
2090 if (!state) {
2091 spin_lock(&tree->lock);
2092 state = find_first_extent_bit_state(tree, failrec->start,
2093 EXTENT_LOCKED);
2094 if (state && state->start != failrec->start)
2095 state = NULL;
2096 spin_unlock(&tree->lock);
2097 }
2098
2099 /*
 2100	 * there are two goals here:
2101 * a) deliver good data to the caller
2102 * b) correct the bad sectors on disk
2103 */
2104 if (failed_bio->bi_vcnt > 1) {
2105 /*
2106 * to fulfill b), we need to know the exact failing sectors, as
2107 * we don't want to rewrite any more than the failed ones. thus,
2108 * we need separate read requests for the failed bio
2109 *
2110 * if the following BUG_ON triggers, our validation request got
2111 * merged. we need separate requests for our algorithm to work.
2112 */
2113 BUG_ON(failrec->in_validation);
2114 failrec->in_validation = 1;
2115 failrec->this_mirror = failed_mirror;
2116 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2117 } else {
2118 /*
2119 * we're ready to fulfill a) and b) alongside. get a good copy
2120 * of the failed sector and if we succeed, we have setup
2121 * everything for repair_io_failure to do the rest for us.
2122 */
2123 if (failrec->in_validation) {
2124 BUG_ON(failrec->this_mirror != failed_mirror);
2125 failrec->in_validation = 0;
2126 failrec->this_mirror = 0;
2127 }
2128 failrec->failed_mirror = failed_mirror;
2129 failrec->this_mirror++;
2130 if (failrec->this_mirror == failed_mirror)
2131 failrec->this_mirror++;
2132 read_mode = READ_SYNC;
2133 }
2134
2135 if (!state || failrec->this_mirror > num_copies) {
2136 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2137 "next_mirror %d, failed_mirror %d\n", state,
2138 num_copies, failrec->this_mirror, failed_mirror);
2139 free_io_failure(inode, failrec, 0);
2140 return -EIO;
2141 }
2142
2143 bio = bio_alloc(GFP_NOFS, 1);
2144 bio->bi_private = state;
2145 bio->bi_end_io = failed_bio->bi_end_io;
2146 bio->bi_sector = failrec->logical >> 9;
2147 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2148 bio->bi_size = 0;
2149
2150 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2151
2152 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2153 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2154 failrec->this_mirror, num_copies, failrec->in_validation);
2155
2156 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2157 failrec->bio_flags, 0);
2158 return 0;
2159}
2160
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2161/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2162
1604/* 2163/*
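
bio_readpage_error() above retries a failed read from the other mirrors: this_mirror advances past the mirror that already failed and the record is freed with -EIO once every copy has been tried. A tiny userspace model of just that mirror-selection bookkeeping (mirror numbers are 1-based as in the patch, everything else is illustrative):

#include <stdio.h>

static int toy_next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)	/* never re-read the bad copy */
		this_mirror++;
	if (this_mirror > num_copies)
		return -1;			/* out of copies: report -EIO */
	return this_mirror;
}

int main(void)
{
	int mirror = 0, failed_mirror = 1, num_copies = 2;

	while ((mirror = toy_next_mirror(mirror, failed_mirror, num_copies)) > 0)
		printf("retrying read from mirror %d\n", mirror);
	return 0;
}
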
@@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2256 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2257 struct extent_state *state;
1699 2258
2259 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2260 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2261 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2262 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2263
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2264 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2289 state);
1728 if (ret) 2290 if (ret)
1729 uptodate = 0; 2291 uptodate = 0;
2292 else
2293 clean_io_failure(start, page);
1730 } 2294 }
1731 if (!uptodate && tree->ops && 2295 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2296 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2298 /*
2299 * The generic bio_readpage_error handles errors the
2300 * following way: If possible, new read requests are
2301 * created and submitted and will end up in
2302 * end_bio_extent_readpage as well (if we're lucky, not
2303 * in the !uptodate case). In that case it returns 0 and
2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
1735 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
1736 uptodate = 2312 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 if (err) 2314 if (err)
@@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1740 uncache_state(&cached); 2316 uncache_state(&cached);
1741 continue; 2317 continue;
1742 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
1743 } 2326 }
1744 2327
1745 if (uptodate) { 2328 if (uptodate) {
@@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2394 mirror_num, bio_flags, start);
1812 else 2395 else
1813 submit_bio(rw, bio); 2396 submit_bio(rw, bio);
2397
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2398 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2399 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2400 bio_put(bio);
@@ -2076,16 +2660,16 @@ out:
2076} 2660}
2077 2661
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2662int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2663 get_extent_t *get_extent, int mirror_num)
2080{ 2664{
2081 struct bio *bio = NULL; 2665 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2666 unsigned long bio_flags = 0;
2083 int ret; 2667 int ret;
2084 2668
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2669 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2670 &bio_flags);
2087 if (bio) 2671 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2672 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2673 return ret;
2090} 2674}
2091 2675
@@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2720 int compressed;
2137 int write_flags; 2721 int write_flags;
2138 unsigned long nr_written = 0; 2722 unsigned long nr_written = 0;
2723 bool fill_delalloc = true;
2139 2724
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2725 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2726 write_flags = WRITE_SYNC;
@@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2730 trace___extent_writepage(page, inode, wbc);
2146 2731
2147 WARN_ON(!PageLocked(page)); 2732 WARN_ON(!PageLocked(page));
2733
2734 ClearPageError(page);
2735
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2736 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2737 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2738 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2754
2167 set_page_extent_mapped(page); 2755 set_page_extent_mapped(page);
2168 2756
2757 if (!tree->ops || !tree->ops->fill_delalloc)
2758 fill_delalloc = false;
2759
2169 delalloc_start = start; 2760 delalloc_start = start;
2170 delalloc_end = 0; 2761 delalloc_end = 0;
2171 page_started = 0; 2762 page_started = 0;
2172 if (!epd->extent_locked) { 2763 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2764 u64 delalloc_to_write = 0;
2174 /* 2765 /*
2175 * make sure the wbc mapping index is at least updated 2766 * make sure the wbc mapping index is at least updated
@@ -2421,10 +3012,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 3012 * swizzled back from swapper_space to tmpfs file
2422 * mapping 3013 * mapping
2423 */ 3014 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3015 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 3016 tree->ops->write_cache_pages_lock_hook) {
2426 else 3017 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 3018 data, flush_fn);
3019 } else {
3020 if (!trylock_page(page)) {
3021 flush_fn(data);
3022 lock_page(page);
3023 }
3024 }
2428 3025
2429 if (unlikely(page->mapping != mapping)) { 3026 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3027 unlock_page(page);
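
The write_cache_pages path above no longer takes the page lock unconditionally: it first tries a non-blocking trylock, flushes the bios queued so far if that fails (they may be what holds the page), and only then blocks. A userspace pthread stand-in for that trylock-then-flush fallback; the mutex and the flush callback are stand-ins for lock_page() and flush_fn():

#include <pthread.h>
#include <stdio.h>

static void toy_flush_pending_io(void *data)
{
	(void)data;
	printf("flushing queued writeback before blocking on the lock\n");
}

static void toy_lock_page_or_flush(pthread_mutex_t *page_lock, void *data)
{
	if (pthread_mutex_trylock(page_lock) != 0) {
		toy_flush_pending_io(data);	/* may release the current holder */
		pthread_mutex_lock(page_lock);
	}
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	toy_lock_page_or_flush(&lock, NULL);
	pthread_mutex_unlock(&lock);
	return 0;
}
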
@@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3387 return -ENOMEM;
2791 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
2792 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
2793 /* 3393 /*
2794 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
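
The two ALIGN() calls added to extent_fiemap() round the requested start and length up to the filesystem sector size before the range is locked. A one-file illustration of that rounding for a power-of-two sector size (the 4096 and the sample offsets are made up):

#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long sectorsize = 4096;

	printf("start %llu -> %llu\n", 5000ULL, TOY_ALIGN(5000ULL, sectorsize));
	printf("len   %llu -> %llu\n", 10000ULL, TOY_ALIGN(10000ULL, sectorsize));
	return 0;
}
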
@@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
2839 3439
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3441 get_extent);
2842 if (!em) 3442 if (!em)
2843 goto out; 3443 goto out;
@@ -2926,7 +3526,7 @@ out:
2926 return ret; 3526 return ret;
2927} 3527}
2928 3528
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3529inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3530 unsigned long i)
2931{ 3531{
2932 struct page *p; 3532 struct page *p;
@@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3551 return p;
2952} 3552}
2953 3553
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3554inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3555{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3556 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3557 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3804 PAGECACHE_TAG_DIRTY);
3205 } 3805 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3806 spin_unlock_irq(&page->mapping->tree_lock);
3807 ClearPageError(page);
3207 unlock_page(page); 3808 unlock_page(page);
3208 } 3809 }
3209 return 0; 3810 return 0;
@@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3950}
3350 3951
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3952int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3953 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3954 get_extent_t *get_extent, int mirror_num)
3355{ 3955{
3356 unsigned long i; 3956 unsigned long i;
@@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3986 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3987 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3988 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3989 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3990 if (!trylock_page(page))
3391 goto unlock_exit; 3991 goto unlock_exit;
3392 } else { 3992 } else {
@@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4030 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4031 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4032
3433 if (ret || !wait) 4033 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4034 return ret;
3435 4035
3436 for (i = start_i; i < num_pages; i++) { 4036 for (i = start_i; i < num_pages; i++) {
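The write_cache_pages change above replaces an unconditional lock_page() with a try-lock that flushes the work queued so far before falling back to a blocking lock, and the lock hook gains the same data/flush_fn arguments so the hooked path can do the same. A minimal userspace sketch of that pattern, with a pthread mutex standing in for the page lock and flush_pending() as a hypothetical stand-in for flush_fn(data):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

    /* hypothetical stand-in for flush_fn(data): submit whatever work we
     * have batched up before we allow ourselves to block */
    static void flush_pending(void *data)
    {
            printf("flushing queued work before blocking\n");
    }

    static void lock_page_like(void *data)
    {
            if (pthread_mutex_trylock(&page_lock) != 0) {
                    flush_pending(data);            /* push out queued work */
                    pthread_mutex_lock(&page_lock); /* then wait for the lock */
            }
    }

    int main(void)
    {
            lock_page_like(NULL);
            pthread_mutex_unlock(&page_lock);
            return 0;
    }

(build with -lpthread) The point of the ordering is that anything this thread has already batched gets submitted before it can go to sleep on a contended lock.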
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
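num_extent_pages() and extent_buffer_page() stop being static in the hunks above, presumably so callers outside extent_io.c (reada.c and scrub.c are added or reworked in this merge) can use them. The page-count arithmetic is just the index one past the last byte minus the index of the first byte; a standalone sketch, assuming a 4K PAGE_CACHE_SIZE purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    static unsigned long num_extent_pages(uint64_t start, uint64_t len)
    {
            /* pages touched = (first index past the end) - (first index) */
            return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
                   (start >> PAGE_SHIFT);
    }

    int main(void)
    {
            /* a 16K buffer that begins 1K into a page touches 5 pages */
            printf("%lu\n", num_extent_pages(1024, 16384));
            return 0;
    }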
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..cc7492c823f3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1386,7 +1387,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1386 goto out; 1387 goto out;
1387 } 1388 }
1388 1389
1389 file_update_time(file); 1390 err = btrfs_update_time(file);
1391 if (err) {
1392 mutex_unlock(&inode->i_mutex);
1393 goto out;
1394 }
1390 BTRFS_I(inode)->sequence++; 1395 BTRFS_I(inode)->sequence++;
1391 1396
1392 start_pos = round_down(pos, root->sectorsize); 1397 start_pos = round_down(pos, root->sectorsize);
@@ -1615,10 +1620,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1620 goto out;
1616 } 1621 }
1617 1622
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1623 locked_end = alloc_end - 1;
1623 while (1) { 1624 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1625 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1665,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1665 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1666 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1667 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1668
1669 /*
1670 * Make sure we have enough space before we do the
1671 * allocation.
1672 */
1673 ret = btrfs_check_data_free_space(inode, last_byte -
1674 cur_offset);
1675 if (ret) {
1676 free_extent_map(em);
1677 break;
1678 }
1679
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1680 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1681 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1682 1 << inode->i_blkbits,
1670 offset + len, 1683 offset + len,
1671 &alloc_hint); 1684 &alloc_hint);
1685
1686 /* Let go of our reservation. */
1687 btrfs_free_reserved_data_space(inode, last_byte -
1688 cur_offset);
1672 if (ret < 0) { 1689 if (ret < 0) {
1673 free_extent_map(em); 1690 free_extent_map(em);
1674 break; 1691 break;
@@ -1694,8 +1711,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1711 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1712 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1713 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1714out:
1700 mutex_unlock(&inode->i_mutex); 1715 mutex_unlock(&inode->i_mutex);
1701 return ret; 1716 return ret;
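The fallocate hunks above stop reserving data space for the whole requested range before the loop and instead reserve just before each hole is preallocated, releasing the reservation once btrfs_prealloc_file_range() has been called, so regions of the range that are already allocated no longer need space reserved for them. A rough userspace sketch of that reserve-per-hole pattern; reserve(), release() and the byte counts are hypothetical stand-ins, not btrfs APIs:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t free_bytes = 1 << 20;    /* pretend pool of data space */

    static int reserve(uint64_t n)          /* ~ btrfs_check_data_free_space */
    {
            if ((int64_t)n > free_bytes)
                    return -1;              /* -ENOSPC in the real code */
            free_bytes -= n;
            return 0;
    }

    static void release(uint64_t n)         /* ~ btrfs_free_reserved_data_space */
    {
            free_bytes += n;
    }

    int main(void)
    {
            /* two holes to fill rather than one big up-front reservation */
            uint64_t holes[2][2] = { { 0, 65536 }, { 131072, 65536 } };

            for (int i = 0; i < 2; i++) {
                    uint64_t off = holes[i][0], len = holes[i][1];

                    if (reserve(len))       /* space for this hole only */
                            return 1;
                    printf("preallocate %llu bytes at %llu\n",
                           (unsigned long long)len, (unsigned long long)off);
                    release(len);           /* let go of our reservation */
            }
            return 0;
    }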
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
 426 		offset = sizeof(u32) * io_ctl->num_pages;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
 822 * on mount. This will return 0 if it was successful in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
 878 		printk(KERN_ERR "btrfs: failed to write free space cache " 1064 		printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1283{ 1470{
1284 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1285 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1286 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1287 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1288 1476
@@ -1662,7 +1850,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1851 1, 0);
1664 if (!info) { 1852 if (!info) {
1665 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1666 goto out_lock; 1860 goto out_lock;
1667 } 1861 }
1668 } 1862 }
@@ -1701,6 +1895,7 @@ again:
1701 ctl->total_bitmaps--; 1895 ctl->total_bitmaps--;
1702 } 1896 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1897 kmem_cache_free(btrfs_free_space_cachep, info);
1898 ret = 0;
1704 goto out_lock; 1899 goto out_lock;
1705 } 1900 }
1706 1901
@@ -1708,7 +1903,8 @@ again:
1708 unlink_free_space(ctl, info); 1903 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1904 info->offset += bytes;
1710 info->bytes -= bytes; 1905 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1906 ret = link_free_space(ctl, info);
1907 WARN_ON(ret);
1712 goto out_lock; 1908 goto out_lock;
1713 } 1909 }
1714 1910
@@ -2124,6 +2320,7 @@ again:
2124 2320
2125 if (!found) { 2321 if (!found) {
2126 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2127 found = true; 2324 found = true;
2128 } 2325 }
2129 2326
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2464{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2469
2273 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2471 return -ENOSPC;
2275 2472
2276 /* 2473 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2475 * is just its start offset.
2279 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2280 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2282 continue; 2486 continue;
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2491 }
2288 2492
2289 /* 2493 /*
2290 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2496 */
2294 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2498}
2323 2499
2324/* 2500/*
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2337{ 2513{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2517 u64 min_bytes;
2342 int ret; 2518 int ret;
2343 2519
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2552 goto out;
2377 } 2553 }
2378 2554
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2556 bytes, min_bytes);
2382 if (ret) 2557 if (ret)
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2647 spin_unlock(&ctl->tree_lock);
2473 2648
2474 if (bytes >= minlen) { 2649 if (bytes >= minlen) {
2475 int update_ret; 2650 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2651 int update = 0;
2477 bytes, 1, 1); 2652
2653 space_info = block_group->space_info;
2654 spin_lock(&space_info->lock);
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2478 2663
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2665 start,
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2667 &actually_trimmed);
2483 2668
2484 btrfs_add_free_space(block_group, start, bytes); 2669 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2670 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2671 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2488 2680
2489 if (ret) 2681 if (ret)
2490 break; 2682 break;
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2835 return 0;
2644 2836
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2837 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2838 if (ret) {
2839 btrfs_delalloc_release_metadata(inode, inode->i_size);
2840#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2841 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2842 "for root %llu\n", root->root_key.objectid);
2843#endif
2844 }
2649 2845
2650 iput(inode); 2846 iput(inode);
2651 return ret; 2847 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if needed
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
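The comment in btrfs_save_ino_cache adds up to the 8 items handed to btrfs_calc_trans_metadata_size(). As a ballpark, that helper charges roughly one full tree-path COW per item; the stand-alone sketch below uses that approximation with made-up 4K leaf/node sizes, so calc_metadata_size, MAX_LEVEL and the numbers are stand-ins, not the kernel's exact formula.

#include <stdio.h>

#define MAX_LEVEL 8	/* stand-in for BTRFS_MAX_LEVEL */

/* Rough stand-in for btrfs_calc_trans_metadata_size(): assume each item
 * may COW one leaf plus a node per remaining level, tripled to leave
 * room for splits.  An approximation, not the kernel's exact formula. */
static unsigned long long calc_metadata_size(unsigned long long leafsize,
					     unsigned long long nodesize,
					     unsigned items)
{
	return (leafsize + nodesize * (MAX_LEVEL - 1)) * 3 * items;
}

int main(void)
{
	/* 1 inode item insert + 3 inode item update (worst case)
	 * + 1 free space object + 3 pre-allocation items = 8 */
	unsigned items = 1 + 3 + 1 + 3;

	printf("%u items -> about %llu bytes reserved\n",
	       items, calc_metadata_size(4096, 4096, items));
	return 0;
}
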
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..13b0542015ff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -45,10 +46,10 @@
45#include "btrfs_inode.h" 46#include "btrfs_inode.h"
46#include "ioctl.h" 47#include "ioctl.h"
47#include "print-tree.h" 48#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 49#include "ordered-data.h"
50#include "xattr.h" 50#include "xattr.h"
51#include "tree-log.h" 51#include "tree-log.h"
52#include "volumes.h"
52#include "compression.h" 53#include "compression.h"
53#include "locking.h" 54#include "locking.h"
54#include "free-space-cache.h" 55#include "free-space-cache.h"
@@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 94 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 96 unsigned long *nr_written, int unlock);
97static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root, struct inode *inode);
96 99
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 101 struct inode *inode, struct inode *dir,
@@ -393,7 +396,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 396 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 397 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 398 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 399 if (!pages) {
400 /* just bail out to the uncompressed code */
401 goto cont;
402 }
397 403
398 if (BTRFS_I(inode)->force_compress) 404 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 405 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +430,7 @@ again:
424 will_compress = 1; 430 will_compress = 1;
425 } 431 }
426 } 432 }
433cont:
427 if (start == 0) { 434 if (start == 0) {
428 trans = btrfs_join_transaction(root); 435 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 436 BUG_ON(IS_ERR(trans));
@@ -820,7 +827,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 827 }
821 828
822 BUG_ON(disk_num_bytes > 829 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 830 btrfs_super_total_bytes(root->fs_info->super_copy));
824 831
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 832 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 833 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1744 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1745 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1746 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1747 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1748 BUG_ON(ret);
1742 } 1749 }
1743 goto out; 1750 goto out;
@@ -1787,17 +1794,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1794
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1795 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1796 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1797 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1798 BUG_ON(ret);
1792 } 1799 }
1793 ret = 0; 1800 ret = 0;
1794out: 1801out:
1795 if (nolock) { 1802 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1803 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1804 if (trans) {
1805 if (nolock)
1806 btrfs_end_transaction_nolock(trans, root);
1807 else
1801 btrfs_end_transaction(trans, root); 1808 btrfs_end_transaction(trans, root);
1802 } 1809 }
1803 1810
@@ -1819,153 +1826,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1826}
1820 1827
1821/* 1828/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1829 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1830 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1831 * extent_io.c will try to find good copies for us.
1969 */ 1832 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1833static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1834 struct extent_state *state)
@@ -2011,10 +1874,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1874
2012 kunmap_atomic(kaddr, KM_USER0); 1875 kunmap_atomic(kaddr, KM_USER0);
2013good: 1876good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1877 return 0;
2019 1878
2020zeroit: 1879zeroit:
@@ -2079,89 +1938,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1938 up_read(&root->fs_info->cleanup_work_sem);
2080} 1939}
2081 1940
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1941enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1942 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1943 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2023,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2023 }
2248 spin_unlock(&root->orphan_lock); 2024 spin_unlock(&root->orphan_lock);
2249 2025
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2026 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2027 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2028 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2259,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2259 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2260 if (insert >= 1) { 2033 if (insert >= 1) {
2261 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2262 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2263 } 2036 }
2264 2037
2265 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2316,6 +2089,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2089 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2090 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2091 struct inode *inode;
2092 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2093 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2094
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2095 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2141,81 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2141 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2142 * offset of the orphan item.
2369 */ 2143 */
2144
2145 if (found_key.offset == last_objectid) {
2146 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2147 "stopping orphan cleanup\n");
2148 ret = -EINVAL;
2149 goto out;
2150 }
2151
2152 last_objectid = found_key.offset;
2153
2370 found_key.objectid = found_key.offset; 2154 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2155 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2156 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2157 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2158 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2159 if (ret && ret != -ESTALE)
2376 goto out; 2160 goto out;
2377 }
2378 2161
2379 /* 2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2380 * add this inode to the orphan list so btrfs_orphan_del does 2163 struct btrfs_root *dead_root;
2381 * the proper thing when we hit it 2164 struct btrfs_fs_info *fs_info = root->fs_info;
2382 */ 2165 int is_dead_root = 0;
2383 spin_lock(&root->orphan_lock);
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2385 spin_unlock(&root->orphan_lock);
2386 2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2387 /* 2194 /*
2388 * if this is a bad inode, means we actually succeeded in 2195 * Inode is already gone but the orphan item is still there,
2389 * removing the inode, but not the orphan record, which means 2196 * kill the orphan item.
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */ 2197 */
2393 if (is_bad_inode(inode)) { 2198 if (ret == -ESTALE) {
2394 trans = btrfs_start_transaction(root, 0); 2199 trans = btrfs_start_transaction(root, 1);
2395 if (IS_ERR(trans)) { 2200 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2201 ret = PTR_ERR(trans);
2397 goto out; 2202 goto out;
2398 } 2203 }
2399 btrfs_orphan_del(trans, inode); 2204 ret = btrfs_del_orphan_item(trans, root,
2205 found_key.objectid);
2206 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2207 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2208 continue;
2403 } 2209 }
2404 2210
2211 /*
2212 * add this inode to the orphan list so btrfs_orphan_del does
2213 * the proper thing when we hit it
2214 */
2215 spin_lock(&root->orphan_lock);
2216 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2217 spin_unlock(&root->orphan_lock);
2218
2405 /* if we have links, this was a truncate, let's do that */ 2219 /* if we have links, this was a truncate, let's do that */
2406 if (inode->i_nlink) { 2220 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2221 if (!S_ISREG(inode->i_mode)) {
@@ -2410,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2410 continue; 2224 continue;
2411 } 2225 }
2412 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the i_mutex for reservation purposes; not
2229 * a huge deal here, but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2413 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2414 } else { 2235 } else {
2415 nr_unlink++; 2236 nr_unlink++;
2416 } 2237 }
@@ -2420,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2241 if (ret)
2421 goto out; 2242 goto out;
2422 } 2243 }
2244 /* release the path since we're done with it */
2245 btrfs_release_path(path);
2246
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2247 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2248
2425 if (root->orphan_block_rsv) 2249 if (root->orphan_block_rsv)
@@ -2647,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2471/*
2648 * copy everything in the in-memory inode into the btree. 2472 * copy everything in the in-memory inode into the btree.
2649 */ 2473 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2474static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2475 struct btrfs_root *root, struct inode *inode)
2652{ 2476{
2653 struct btrfs_inode_item *inode_item; 2477 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2479 struct extent_buffer *leaf;
2656 int ret; 2480 int ret;
2657 2481
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2482 path = btrfs_alloc_path();
2674 if (!path) 2483 if (!path)
2675 return -ENOMEM; 2484 return -ENOMEM;
@@ -2698,6 +2507,43 @@ failed:
2698} 2507}
2699 2508
2700/* 2509/*
2510 * copy everything in the in-memory inode into the btree.
2511 */
2512noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2513 struct btrfs_root *root, struct inode *inode)
2514{
2515 int ret;
2516
2517 /*
2518 * If the inode is a free space inode, we can deadlock during commit
2519 * if we put it into the delayed code.
2520 *
2521 * The data relocation inode should also be directly updated
2522 * without delay
2523 */
2524 if (!btrfs_is_free_space_inode(root, inode)
2525 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2526 ret = btrfs_delayed_update_inode(trans, root, inode);
2527 if (!ret)
2528 btrfs_set_inode_last_trans(trans, inode);
2529 return ret;
2530 }
2531
2532 return btrfs_update_inode_item(trans, root, inode);
2533}
2534
2535static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2536 struct btrfs_root *root, struct inode *inode)
2537{
2538 int ret;
2539
2540 ret = btrfs_update_inode(trans, root, inode);
2541 if (ret == -ENOSPC)
2542 return btrfs_update_inode_item(trans, root, inode);
2543 return ret;
2544}
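Taken together, btrfs_update_inode() keeps preferring the delayed-inode path, btrfs_update_inode_item() always rewrites the item directly, and the new fallback variant retries the direct path only on ENOSPC, which is what the ordered-io callers above rely on. A minimal stand-alone sketch of that shape; the helpers here are placeholders, not the kernel functions.

#include <errno.h>
#include <stdio.h>

static int delayed_update(int out_of_space)	/* delayed-inode path        */
{
	return out_of_space ? -ENOSPC : 0;
}

static int direct_item_update(void)		/* rewrite the item in place */
{
	return 0;
}

static int update_inode_fallback(int out_of_space)
{
	int ret = delayed_update(out_of_space);

	if (ret == -ENOSPC)		/* only ENOSPC triggers the direct retry */
		ret = direct_item_update();
	return ret;
}

int main(void)
{
	printf("plenty of space: %d, under ENOSPC: %d\n",
	       update_inode_fallback(0), update_inode_fallback(1));
	return 0;
}
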
2545
2546/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2547 * unlink helper that gets used here in inode.c and in the tree logging
2702 * recovery code. It remove a link in a directory with a given name, and 2548 * recovery code. It remove a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2549 * also drops the back refs in the inode to the directory
@@ -2835,7 +2681,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2681 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2682 u64 dir_ino = btrfs_ino(dir);
2837 2683
2838 trans = btrfs_start_transaction(root, 10); 2684 /*
2685 * 1 for the possible orphan item
2686 * 1 for the dir item
2687 * 1 for the dir index
2688 * 1 for the inode ref
2689 * 1 for the inode ref in the tree log
2690 * 2 for the dir entries in the log
2691 * 1 for the inode
2692 */
2693 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2694 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2695 return trans;
2841 2696
@@ -2858,7 +2713,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2713 return ERR_PTR(-ENOMEM);
2859 } 2714 }
2860 2715
2861 trans = btrfs_start_transaction(root, 0); 2716 /* 1 for the orphan item */
2717 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2718 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2719 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2720 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2819,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2819 err = 0;
2964out: 2820out:
2965 btrfs_free_path(path); 2821 btrfs_free_path(path);
2822 /* Migrate the orphan reservation over */
2823 if (!err)
2824 err = btrfs_block_rsv_migrate(trans->block_rsv,
2825 &root->fs_info->global_block_rsv,
2826 trans->bytes_reserved);
2827
2966 if (err) { 2828 if (err) {
2967 btrfs_end_transaction(trans, root); 2829 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2830 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2839,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2839 struct btrfs_root *root)
2978{ 2840{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2841 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2842 btrfs_block_rsv_release(root, trans->block_rsv,
2843 trans->bytes_reserved);
2844 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2845 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2846 root->fs_info->enospc_unlink = 0;
2982 } 2847 }
@@ -3368,6 +3233,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3233 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3234 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3235 struct page *page;
3236 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3237 int ret = 0;
3372 u64 page_start; 3238 u64 page_start;
3373 u64 page_end; 3239 u64 page_end;
@@ -3380,7 +3246,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3246
3381 ret = -ENOMEM; 3247 ret = -ENOMEM;
3382again: 3248again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3249 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3250 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3251 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3252 goto out;
@@ -3501,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3501 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3502 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3503 3369
3504 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3505 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3506 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3507 break; 3373 break;
@@ -3511,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3511 cur_offset + hole_size, 3377 cur_offset + hole_size,
3512 &hint_byte, 1); 3378 &hint_byte, 1);
3513 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3514 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3515 break; 3382 break;
3516 } 3383 }
@@ -3520,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3520 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3521 0, 0, 0); 3388 0, 0, 0);
3522 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3523 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3524 break; 3392 break;
3525 } 3393 }
@@ -3527,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3527 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3528 last_byte - 1, 0); 3396 last_byte - 1, 0);
3529 3397
3398 btrfs_update_inode(trans, root, inode);
3530 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3531 } 3400 }
3532 free_extent_map(em); 3401 free_extent_map(em);
@@ -3544,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3544 3413
3545static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3546{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3547 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3548 int ret; 3419 int ret;
3549 3420
@@ -3551,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3551 return 0; 3422 return 0;
3552 3423
3553 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3554 i_size_write(inode, newsize);
3555 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3556 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3557 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3558 if (ret) { 3427 if (ret)
3559 btrfs_setsize(inode, oldsize);
3560 return ret; 3428 return ret;
3561 }
3562 3429
3563 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3564 } else { 3438 } else {
3565 3439
3566 /* 3440 /*
@@ -3600,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3600 3474
3601 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3602 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3603 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3604 3478
3605 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3606 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3607 } 3481 }
3608 3482
@@ -3613,6 +3487,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3487{
3614 struct btrfs_trans_handle *trans; 3488 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3489 struct btrfs_root *root = BTRFS_I(inode)->root;
3490 struct btrfs_block_rsv *rsv, *global_rsv;
3491 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3492 unsigned long nr;
3617 int ret; 3493 int ret;
3618 3494
@@ -3640,22 +3516,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3516 goto no_delete;
3641 } 3517 }
3642 3518
3519 rsv = btrfs_alloc_block_rsv(root);
3520 if (!rsv) {
3521 btrfs_orphan_del(NULL, inode);
3522 goto no_delete;
3523 }
3524 rsv->size = min_size;
3525 global_rsv = &root->fs_info->global_block_rsv;
3526
3643 btrfs_i_size_write(inode, 0); 3527 btrfs_i_size_write(inode, 0);
3644 3528
3529 /*
3530 * This is a bit simpler than btrfs_truncate since
3531 *
3532 * 1) We've already reserved our space for our orphan item in the
3533 * unlink.
3534 * 2) We're going to delete the inode item, so we don't need to update
3535 * it at all.
3536 *
3537 * So we just need to reserve some slack space in case we add bytes when
3538 * doing the truncate.
3539 */
3645 while (1) { 3540 while (1) {
3646 trans = btrfs_join_transaction(root); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3542
3648 trans->block_rsv = root->orphan_block_rsv; 3543 /*
3544 * Try and steal from the global reserve since we will
3545 * likely not use this space anyway, we want to try as
3546 * hard as possible to get this to work.
3547 */
3548 if (ret)
3549 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3550
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3551 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3552 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3553 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3554 btrfs_orphan_del(NULL, inode);
3656 continue; 3555 btrfs_free_block_rsv(root, rsv);
3556 goto no_delete;
3557 }
3558
3559 trans = btrfs_start_transaction(root, 0);
3560 if (IS_ERR(trans)) {
3561 btrfs_orphan_del(NULL, inode);
3562 btrfs_free_block_rsv(root, rsv);
3563 goto no_delete;
3657 } 3564 }
3658 3565
3566 trans->block_rsv = rsv;
3567
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3568 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3569 if (ret != -EAGAIN)
3661 break; 3570 break;
@@ -3664,14 +3573,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3573 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3574 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3575 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3576 }
3669 3577
3578 btrfs_free_block_rsv(root, rsv);
3579
3670 if (ret == 0) { 3580 if (ret == 0) {
3581 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3582 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3583 BUG_ON(ret);
3673 } 3584 }
3674 3585
3586 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3587 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3588 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3589 btrfs_return_ino(root, btrfs_ino(inode));
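The eviction loop now reserves min_size of slack for every truncate pass, falls back to the global reserve when the normal refill fails, and skips the delete entirely when both fail. A toy model of that ordering; the pools and sizes are invented.

#include <errno.h>
#include <stdio.h>

struct pool { long long bytes; };

static int take(struct pool *from, struct pool *to, long long want)
{
	if (from->bytes < want)
		return -ENOSPC;
	from->bytes -= want;
	to->bytes += want;
	return 0;
}

int main(void)
{
	struct pool free_space = { 0 };		/* filesystem is full  */
	struct pool global_rsv = { 1 << 20 };	/* emergency reserve   */
	struct pool rsv        = { 0 };		/* per-evict reserve   */
	long long min_size = 64 * 1024;

	int ret = take(&free_space, &rsv, min_size);	  /* normal refill     */
	if (ret)
		ret = take(&global_rsv, &rsv, min_size);  /* steal from global */

	if (ret)
		printf("no space: skip the delete, truncate on next mount\n");
	else
		printf("got %lld bytes of slack for one truncate pass\n", rsv.bytes);
	return 0;
}
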
@@ -4340,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4340 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4341 * to keep or drop this code. 4253 * to keep or drop this code.
4342 */ 4254 */
4343void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4344{ 4256{
4345 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4346 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4347 int ret; 4259 int ret;
4348 4260
4349 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4350 return; 4262 return 0;
4351 4263
4352 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4353 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4354 4267
4355 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4356 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4357 /* whoops, let's try again with the full transaction */ 4270 /* whoops, let's try again with the full transaction */
4358 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4359 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4360 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4361 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4362 "dirty inode %llu error %ld\n",
4363 (unsigned long long)btrfs_ino(inode),
4364 PTR_ERR(trans));
4365 return;
4366 }
4367 4275
4368 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4369 if (ret) {
4370 printk_ratelimited(KERN_ERR "btrfs: fail to "
4371 "dirty inode %llu error %d\n",
4372 (unsigned long long)btrfs_ino(inode),
4373 ret);
4374 }
4375 } 4277 }
4376 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4377 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4378 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need this so we can return an error
4287 * on ENOSPC when updating the inode during file writes and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4379} 4329}
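btrfs_update_time only starts an inode update when something actually changed: it builds a small mask of stale fields (mtime, ctime, i_version) and returns early when the mask stays empty. A toy version of that bookkeeping with invented timestamps.

#include <stdio.h>

enum { SYNC_MTIME = 1, SYNC_CTIME = 2, SYNC_VERSION = 4 };

int main(void)
{
	long now = 1000, i_mtime = 1000, i_ctime = 900;	/* invented times */
	int has_iversion = 1;
	int sync_it = 0;

	if (i_mtime != now)
		sync_it |= SYNC_MTIME;
	if (i_ctime != now)
		sync_it |= SYNC_CTIME;
	if (has_iversion)
		sync_it |= SYNC_VERSION;

	if (!sync_it)
		printf("nothing stale: skip the inode update entirely\n");
	else
		printf("sync_it = 0x%x: dirty the inode\n", sync_it);
	return 0;
}
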
4380 4330
4381/* 4331/*
@@ -4640,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4640 int err = btrfs_add_link(trans, dir, inode, 4590 int err = btrfs_add_link(trans, dir, inode,
4641 dentry->d_name.name, dentry->d_name.len, 4591 dentry->d_name.name, dentry->d_name.len,
4642 backref, index); 4592 backref, index);
4643 if (!err) {
4644 d_instantiate(dentry, inode);
4645 return 0;
4646 }
4647 if (err > 0) 4593 if (err > 0)
4648 err = -EEXIST; 4594 err = -EEXIST;
4649 return err; 4595 return err;
@@ -4691,13 +4637,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4691 goto out_unlock; 4637 goto out_unlock;
4692 } 4638 }
4693 4639
4640 /*
4641 * If the active LSM wants to access the inode during
4642 * d_instantiate it needs these. Smack checks to see
4643 * if the filesystem supports xattrs by looking at the
4644 * ops vector.
4645 */
4646
4647 inode->i_op = &btrfs_special_inode_operations;
4694 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4648 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4695 if (err) 4649 if (err)
4696 drop_inode = 1; 4650 drop_inode = 1;
4697 else { 4651 else {
4698 inode->i_op = &btrfs_special_inode_operations;
4699 init_special_inode(inode, inode->i_mode, rdev); 4652 init_special_inode(inode, inode->i_mode, rdev);
4700 btrfs_update_inode(trans, root, inode); 4653 btrfs_update_inode(trans, root, inode);
4654 d_instantiate(dentry, inode);
4701 } 4655 }
4702out_unlock: 4656out_unlock:
4703 nr = trans->blocks_used; 4657 nr = trans->blocks_used;
@@ -4749,15 +4703,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4749 goto out_unlock; 4703 goto out_unlock;
4750 } 4704 }
4751 4705
4706 /*
4707 * If the active LSM wants to access the inode during
4708 * d_instantiate it needs these. Smack checks to see
4709 * if the filesystem supports xattrs by looking at the
4710 * ops vector.
4711 */
4712 inode->i_fop = &btrfs_file_operations;
4713 inode->i_op = &btrfs_file_inode_operations;
4714
4752 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4715 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4753 if (err) 4716 if (err)
4754 drop_inode = 1; 4717 drop_inode = 1;
4755 else { 4718 else {
4756 inode->i_mapping->a_ops = &btrfs_aops; 4719 inode->i_mapping->a_ops = &btrfs_aops;
4757 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4720 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4758 inode->i_fop = &btrfs_file_operations;
4759 inode->i_op = &btrfs_file_inode_operations;
4760 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4721 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4722 d_instantiate(dentry, inode);
4761 } 4723 }
4762out_unlock: 4724out_unlock:
4763 nr = trans->blocks_used; 4725 nr = trans->blocks_used;
@@ -4815,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4815 struct dentry *parent = dentry->d_parent; 4777 struct dentry *parent = dentry->d_parent;
4816 err = btrfs_update_inode(trans, root, inode); 4778 err = btrfs_update_inode(trans, root, inode);
4817 BUG_ON(err); 4779 BUG_ON(err);
4780 d_instantiate(dentry, inode);
4818 btrfs_log_new_name(trans, inode, NULL, parent); 4781 btrfs_log_new_name(trans, inode, NULL, parent);
4819 } 4782 }
4820 4783
@@ -5795,8 +5758,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5758 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5759 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5760 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5761 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5762 goto out;
5801 } 5763 }
5802 5764
@@ -5834,7 +5796,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5796 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5797 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5798 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5799 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5800 ret = 0;
5839out_unlock: 5801out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5802 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6251,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6251{
6290 struct extent_io_tree *tree; 6252 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6253 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6254 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6255}
6294 6256
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6257static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6440,7 +6402,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6440 u64 page_start; 6402 u64 page_start;
6441 u64 page_end; 6403 u64 page_end;
6442 6404
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6443 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file);
6444 if (ret) { 6411 if (ret) {
6445 if (ret == -ENOMEM) 6412 if (ret == -ENOMEM)
6446 ret = VM_FAULT_OOM; 6413 ret = VM_FAULT_OOM;
@@ -6541,6 +6508,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6508 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6509 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6510 u64 mask = root->sectorsize - 1;
6511 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6512
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6513 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6514 if (ret)
@@ -6588,19 +6556,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6556 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6557 if (!rsv)
6590 return -ENOMEM; 6558 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6559 rsv->size = min_size;
6592 6560
6561 /*
6562 * 1 for the truncate slack space
6563 * 1 for the orphan item we're going to add
6564 * 1 for the orphan item deletion
6565 * 1 for updating the inode.
6566 */
6593 trans = btrfs_start_transaction(root, 4); 6567 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6568 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6569 err = PTR_ERR(trans);
6596 goto out; 6570 goto out;
6597 } 6571 }
6598 6572
6599 /* 6573 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6574 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6575 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6576 BUG_ON(ret);
6605 6577
6606 ret = btrfs_orphan_add(trans, inode); 6578 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6581,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6581 goto out;
6610 } 6582 }
6611 6583
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6584 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6585 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6586 * but that is only tested during the last file release. That
@@ -6645,20 +6602,31 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6602 btrfs_add_ordered_operation(trans, root, inode);
6646 6603
6647 while (1) { 6604 while (1) {
6605 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6606 if (ret) {
6607 /*
6608 * This can only happen with the original transaction we
6609 * started above; every other time we shouldn't have a
6610 * transaction started yet.
6611 */
6612 if (ret == -EAGAIN)
6613 goto end_trans;
6614 err = ret;
6615 break;
6616 }
6617
6648 if (!trans) { 6618 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6619 /* Just need the 1 for updating the inode */
6620 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6621 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6622 ret = err = PTR_ERR(trans);
6652 goto out; 6623 trans = NULL;
6624 break;
6653 } 6625 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6626 }
6661 6627
6628 trans->block_rsv = rsv;
6629
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6630 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6631 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6632 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6641,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6641 err = ret;
6674 break; 6642 break;
6675 } 6643 }
6676 6644end_trans:
6677 nr = trans->blocks_used; 6645 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6646 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6647 trans = NULL;
@@ -6693,14 +6661,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6661 ret = btrfs_orphan_del(NULL, inode);
6694 } 6662 }
6695 6663
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6664 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6665 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6666 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6667 if (ret && !err)
6668 err = ret;
6700 6669
6701 nr = trans->blocks_used; 6670 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6671 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6672 btrfs_btree_balance_dirty(root, nr);
6673 }
6704 6674
6705out: 6675out:
6706 btrfs_free_block_rsv(root, rsv); 6676 btrfs_free_block_rsv(root, rsv);
@@ -6755,9 +6725,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6725 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6726 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6727 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6728 ei->disk_i_size = 0;
6760 ei->flags = 0; 6729 ei->flags = 0;
6730 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6731 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6732 ei->last_unlink_trans = 0;
6763 6733
@@ -6769,6 +6739,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6739 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6740 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6741 ei->in_defrag = 0;
6742 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6743 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6744
6774 ei->delayed_node = NULL; 6745 ei->delayed_node = NULL;
@@ -6803,6 +6774,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6774 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6775 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6776 WARN_ON(BTRFS_I(inode)->reserved_extents);
6777 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6778 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6779
6807 /* 6780 /*
6808 * This can happen where we create an inode, but somebody else also 6781 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6899,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6899 struct dentry *dentry, struct kstat *stat)
6927{ 6900{
6928 struct inode *inode = dentry->d_inode; 6901 struct inode *inode = dentry->d_inode;
6902 u32 blocksize = inode->i_sb->s_blocksize;
6903
6929 generic_fillattr(inode, stat); 6904 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6905 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6906 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6907 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6908 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6909 return 0;
6935} 6910}
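The getattr change computes st_blocks from byte counts that are first rounded up to the filesystem block size and then expressed in 512-byte sectors, so in-flight delalloc bytes are charged as whole blocks as well. A worked example with an invented 4K block size and byte counts.

#include <stdio.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096;	/* s_blocksize (invented)          */
	uint64_t disk_bytes = 5000;	/* what inode_get_bytes() reports  */
	uint64_t delalloc_bytes = 100;	/* dirty data not yet written back */

	uint64_t blocks = (ALIGN_UP(disk_bytes, blocksize) +
			   ALIGN_UP(delalloc_bytes, blocksize)) >> 9;

	/* (8192 + 4096) bytes -> 24 sectors of 512 bytes */
	printf("st_blocks = %llu\n", (unsigned long long)blocks);
	return 0;
}
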
6936 6911
@@ -7206,14 +7181,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7206 goto out_unlock; 7181 goto out_unlock;
7207 } 7182 }
7208 7183
7184 /*
7185 * If the active LSM wants to access the inode during
7186 * d_instantiate it needs these. Smack checks to see
7187 * if the filesystem supports xattrs by looking at the
7188 * ops vector.
7189 */
7190 inode->i_fop = &btrfs_file_operations;
7191 inode->i_op = &btrfs_file_inode_operations;
7192
7209 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7193 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7210 if (err) 7194 if (err)
7211 drop_inode = 1; 7195 drop_inode = 1;
7212 else { 7196 else {
7213 inode->i_mapping->a_ops = &btrfs_aops; 7197 inode->i_mapping->a_ops = &btrfs_aops;
7214 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7198 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7215 inode->i_fop = &btrfs_file_operations;
7216 inode->i_op = &btrfs_file_inode_operations;
7217 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7199 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7218 } 7200 }
7219 if (drop_inode) 7201 if (drop_inode)
@@ -7262,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7262 drop_inode = 1; 7244 drop_inode = 1;
7263 7245
7264out_unlock: 7246out_unlock:
7247 if (!err)
7248 d_instantiate(dentry, inode);
7265 nr = trans->blocks_used; 7249 nr = trans->blocks_used;
7266 btrfs_end_transaction_throttle(trans, root); 7250 btrfs_end_transaction_throttle(trans, root);
7267 if (drop_inode) { 7251 if (drop_inode) {
@@ -7420,7 +7404,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7404 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7405 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7406 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7407 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7408 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7409 .merge_extent_hook = btrfs_merge_extent_hook,
@@ -7484,6 +7467,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7484 .follow_link = page_follow_link_light, 7467 .follow_link = page_follow_link_light,
7485 .put_link = page_put_link, 7468 .put_link = page_put_link,
7486 .getattr = btrfs_getattr, 7469 .getattr = btrfs_getattr,
7470 .setattr = btrfs_setattr,
7487 .permission = btrfs_permission, 7471 .permission = btrfs_permission,
7488 .setxattr = btrfs_setxattr, 7472 .setxattr = btrfs_setxattr,
7489 .getxattr = btrfs_getxattr, 7473 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
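
With the rewrite above, a new inode no longer inherits the parent directory's whole flag word: only the compression state (COMPRESS/NOCOMPRESS) and NODATACOW propagate, while bits such as NOATIME or NODUMP stay on the parent. A compressed restatement of the logic, as a sketch rather than the kernel code:

/* parent = BTRFS_I(dir)->flags, child = BTRFS_I(inode)->flags */
if (parent & BTRFS_INODE_NOCOMPRESS) {
	child &= ~BTRFS_INODE_COMPRESS;
	child |= BTRFS_INODE_NOCOMPRESS;
} else if (parent & BTRFS_INODE_COMPRESS) {
	child &= ~BTRFS_INODE_NOCOMPRESS;
	child |= BTRFS_INODE_COMPRESS;
}
if (parent & BTRFS_INODE_NODATACOW)
	child |= BTRFS_INODE_NODATACOW;
/* nothing else from the parent's flags is copied */
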
@@ -246,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
246 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
247 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
248 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
249 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
250 BUG_ON(ret); 258 BUG_ON(ret);
251 259
252 btrfs_update_iflags(inode);
253 inode->i_ctime = CURRENT_TIME;
254 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
255 261
256 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
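
The added checks bound a FITRIM request to the filesystem: a start offset past the end is rejected, and the length is clamped so start + len never walks past the total size. A quick sketch with made-up numbers on a 100 GiB filesystem:

/* total_bytes = 100 GiB, read from the superblock as above           */
/* request: start = 90 GiB, len = ULLONG_MAX -> len clamped to 10 GiB */
/* request: start = 150 GiB                  -> -EINVAL, nothing trimmed */
if (range.start > total_bytes)
	return -EINVAL;
range.len = min(range.len, total_bytes - range.start);
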
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
849 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
850 860
861 mutex_lock(&inode->i_mutex);
851 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
852 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
853 if (ret) 865 if (ret)
854 return ret; 866 return ret;
855again: 867again:
@@ -860,7 +872,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 872 for (i = 0; i < num_pages; i++) {
861 struct page *page; 873 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 874 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 875 start_index + i, mask);
864 if (!page) 876 if (!page)
865 break; 877 break;
866 878
@@ -972,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 984 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 985 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 986 unsigned long last_index;
987 u64 isize = i_size_read(inode);
975 u64 features; 988 u64 features;
976 u64 last_len = 0; 989 u64 last_len = 0;
977 u64 skip = 0; 990 u64 skip = 0;
978 u64 defrag_end = 0; 991 u64 defrag_end = 0;
979 u64 newer_off = range->start; 992 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 993 unsigned long i;
994 unsigned long ra_index = 0;
982 int ret; 995 int ret;
983 int defrag_count = 0; 996 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 997 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 998 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 999 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1000 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 1001 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1002 struct page **pages = NULL;
989 1003
@@ -997,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1011 compress_type = range->compress_type;
998 } 1012 }
999 1013
1000 if (inode->i_size == 0) 1014 if (isize == 0)
1001 return 0; 1015 return 0;
1002 1016
1003 /* 1017 /*
@@ -1013,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1027 ra = &file->f_ra;
1014 } 1028 }
1015 1029
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1030 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1031 GFP_NOFS);
1018 if (!pages) { 1032 if (!pages) {
1019 ret = -ENOMEM; 1033 ret = -ENOMEM;
@@ -1022,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1036
1023 /* find the last page to defrag */ 1037 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1038 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1039 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1040 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1041 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1042 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1043 }
1030 1044
1031 if (newer_than) { 1045 if (newer_than) {
@@ -1038,14 +1052,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1052 * the extents in the file evenly spaced
1039 */ 1053 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1054 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1055 } else
1043 goto out_ra; 1056 goto out_ra;
1044 } else { 1057 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1058 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1059 }
1047 if (!max_to_defrag) 1060 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1061 max_to_defrag = last_index;
1049 1062
1050 /* 1063 /*
1051 * make writeback starts from i, so the defrag range can be 1064 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1092 i = max(i + 1, next);
1080 continue; 1093 continue;
1081 } 1094 }
1095
1096 if (!newer_than) {
1097 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1098 PAGE_CACHE_SHIFT) - i;
1099 cluster = min(cluster, max_cluster);
1100 } else {
1101 cluster = max_cluster;
1102 }
1103
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1104 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1105 BTRFS_I(inode)->force_compress = compress_type;
1084 1106
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1107 if (i + cluster > ra_index) {
1108 ra_index = max(i, ra_index);
1109 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1110 cluster);
1111 ra_index += max_cluster;
1112 }
1086 1113
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1114 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1115 if (ret < 0)
1089 goto out_ra; 1116 goto out_ra;
1090 1117
1091 defrag_count += ret; 1118 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1119 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1120
1095 if (newer_than) { 1121 if (newer_than) {
1096 if (newer_off == (u64)-1) 1122 if (newer_off == (u64)-1)
@@ -1105,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1131 if (!ret) {
1106 range->start = newer_off; 1132 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1133 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1134 } else {
1110 break; 1135 break;
1111 } 1136 }
1112 } else { 1137 } else {
1113 i++; 1138 if (ret > 0) {
1139 i += ret;
1140 last_len += ret << PAGE_CACHE_SHIFT;
1141 } else {
1142 i++;
1143 last_len = 0;
1144 }
1114 } 1145 }
1115 } 1146 }
1116 1147
@@ -1136,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1167 mutex_unlock(&inode->i_mutex);
1137 } 1168 }
1138 1169
1139 disk_super = &root->fs_info->super_copy; 1170 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1171 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1172 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1173 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1174 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1175 }
1145 1176
1146 if (!file) 1177 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1178
1150out_ra: 1179out_ra:
1151 if (!file) 1180 if (!file)
@@ -1189,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1218 *devstr = '\0';
1190 devstr = vol_args->name; 1219 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1222 (unsigned long long)devid);
1194 } 1223 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1225 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1227 (unsigned long long)devid);
1199 ret = -EINVAL; 1228 ret = -EINVAL;
1200 goto out_unlock; 1229 goto out_unlock;
@@ -1240,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1242 1271
1243 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1245 1274
1246 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1251,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1251 } 1280 }
1252 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1253 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1254 } else { 1283 } else if (new_size < old_size) {
1255 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1256 } 1285 }
1257 1286
@@ -2587,7 +2616,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2616 return PTR_ERR(trans);
2588 } 2617 }
2589 2618
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2619 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2620 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2621 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2622 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2632,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2632 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2633 btrfs_free_path(path);
2605 2634
2606 disk_super = &root->fs_info->super_copy; 2635 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2636 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2637 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2638 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2893,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2893 return ret;
2865} 2894}
2866 2895
2896static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2897{
2898 int ret = 0;
2899 int i;
2900 u64 rel_ptr;
2901 int size;
2902 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2903 struct inode_fs_paths *ipath = NULL;
2904 struct btrfs_path *path;
2905
2906 if (!capable(CAP_SYS_ADMIN))
2907 return -EPERM;
2908
2909 path = btrfs_alloc_path();
2910 if (!path) {
2911 ret = -ENOMEM;
2912 goto out;
2913 }
2914
2915 ipa = memdup_user(arg, sizeof(*ipa));
2916 if (IS_ERR(ipa)) {
2917 ret = PTR_ERR(ipa);
2918 ipa = NULL;
2919 goto out;
2920 }
2921
2922 size = min_t(u32, ipa->size, 4096);
2923 ipath = init_ipath(size, root, path);
2924 if (IS_ERR(ipath)) {
2925 ret = PTR_ERR(ipath);
2926 ipath = NULL;
2927 goto out;
2928 }
2929
2930 ret = paths_from_inode(ipa->inum, ipath);
2931 if (ret < 0)
2932 goto out;
2933
2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2937 ipath->fspath->val[i] = rel_ptr;
2938 }
2939
2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2942 if (ret) {
2943 ret = -EFAULT;
2944 goto out;
2945 }
2946
2947out:
2948 btrfs_free_path(path);
2949 free_ipath(ipath);
2950 kfree(ipa);
2951
2952 return ret;
2953}
2954
2955static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2956{
2957 struct btrfs_data_container *inodes = ctx;
2958 const size_t c = 3 * sizeof(u64);
2959
2960 if (inodes->bytes_left >= c) {
2961 inodes->bytes_left -= c;
2962 inodes->val[inodes->elem_cnt] = inum;
2963 inodes->val[inodes->elem_cnt + 1] = offset;
2964 inodes->val[inodes->elem_cnt + 2] = root;
2965 inodes->elem_cnt += 3;
2966 } else {
2967 inodes->bytes_missing += c - inodes->bytes_left;
2968 inodes->bytes_left = 0;
2969 inodes->elem_missed += 3;
2970 }
2971
2972 return 0;
2973}
2974
2975static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2976 void __user *arg)
2977{
2978 int ret = 0;
2979 int size;
2980 u64 extent_offset;
2981 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL;
2984 struct btrfs_key key;
2985
2986 if (!capable(CAP_SYS_ADMIN))
2987 return -EPERM;
2988
2989 loi = memdup_user(arg, sizeof(*loi));
2990 if (IS_ERR(loi)) {
2991 ret = PTR_ERR(loi);
2992 loi = NULL;
2993 goto out;
2994 }
2995
2996 path = btrfs_alloc_path();
2997 if (!path) {
2998 ret = -ENOMEM;
2999 goto out;
3000 }
3001
3002 size = min_t(u32, loi->size, 4096);
3003 inodes = init_data_container(size);
3004 if (IS_ERR(inodes)) {
3005 ret = PTR_ERR(inodes);
3006 inodes = NULL;
3007 goto out;
3008 }
3009
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3011
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT;
3014 if (ret < 0)
3015 goto out;
3016
3017 extent_offset = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes);
3020
3021 if (ret < 0)
3022 goto out;
3023
3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3026 if (ret)
3027 ret = -EFAULT;
3028
3029out:
3030 btrfs_free_path(path);
3031 kfree(inodes);
3032 kfree(loi);
3033
3034 return ret;
3035}
3036
2867long btrfs_ioctl(struct file *file, unsigned int 3037long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3038 cmd, unsigned long arg)
2869{ 3039{
@@ -2921,6 +3091,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3091 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3092 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3093 return btrfs_ioctl_ino_lookup(file, argp);
3094 case BTRFS_IOC_INO_PATHS:
3095 return btrfs_ioctl_ino_to_path(root, argp);
3096 case BTRFS_IOC_LOGICAL_INO:
3097 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3098 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3099 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3100 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
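
Together with the handlers added in ioctl.c above, these give user space two resolvers: INO_PATHS turns an inode number into all of its paths, and LOGICAL_INO turns a logical disk offset into (inode, offset, root) triples packed three u64s at a time (see build_ino_list). A rough user-space sketch for INO_PATHS; the struct copies and the 0x94 magic mirror fs/btrfs/ioctl.h, while the buffer size and argument handling are arbitrary choices:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* local copies of the structures added above */
struct btrfs_data_container {
	uint32_t bytes_left;
	uint32_t bytes_missing;
	uint32_t elem_cnt;
	uint32_t elem_missed;
	uint64_t val[0];
};

struct btrfs_ioctl_ino_path_args {
	uint64_t inum;		/* in */
	uint32_t size;		/* in */
	uint64_t reserved[4];
	uint64_t fspath;	/* out: user buffer for a btrfs_data_container */
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
				  struct btrfs_ioctl_ino_path_args)

int main(int argc, char **argv)
{
	/* usage: ino-paths <any file on the btrfs mount> <inode number> */
	uint64_t buf[512];	/* 4096 bytes, matches the kernel's cap */
	struct btrfs_data_container *paths = (struct btrfs_data_container *)buf;
	struct btrfs_ioctl_ino_path_args args;
	uint32_t i;
	int fd;

	if (argc != 3)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.inum = strtoull(argv[2], NULL, 0);
	args.size = sizeof(buf);
	args.fspath = (uintptr_t)buf;

	if (ioctl(fd, BTRFS_IOC_INO_PATHS, &args) < 0) {
		perror("BTRFS_IOC_INO_PATHS");
		return 1;
	}

	/* each val[i] is a byte offset of a NUL-terminated path string,
	 * relative to the start of val[] (the rel_ptr conversion above) */
	for (i = 0; i < paths->elem_cnt; i++)
		printf("%s\n", (char *)paths->val + paths->val[i]);
	return 0;
}

Error handling is deliberately minimal; a real tool would also check bytes_missing/elem_missed to detect truncation (the kernel caps the container at 4 KiB in this version).
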
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
 151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
 644 * a contiguous block of extents, we could also coalesce them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
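
To make the interface above concrete, a minimal sketch of how a kernel-side caller might drive it (scrub is the in-tree user in this series); the choice of the csum tree and of the key range is purely illustrative:

static void prefetch_csums(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
	struct btrfs_key key_start, key_end;
	struct reada_control *rc;

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = start;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = end;

	/* enqueue readahead for the csum tree nodes covering the key range */
	rc = btrfs_reada_add(fs_info->csum_root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;			/* readahead is only an optimization */

	/* block until everything enqueued has been read ... */
	btrfs_reada_wait(rc);

	/* ... or let it finish in the background instead:
	 * btrfs_reada_detach(rc);
	 */
}
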
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2946 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2947 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2948 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2949 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2950 if (ret) 2953 if (ret)
2951 goto out; 2954 goto out;
2952 2955
@@ -2956,7 +2959,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2959 ra, NULL, index,
2957 last_index + 1 - index); 2960 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2961 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2962 mask);
2960 if (!page) { 2963 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2964 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2965 PAGE_CACHE_SIZE);
@@ -3323,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3326 }
3324 3327
3325 key.objectid = ref_objectid; 3328 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3329 key.type = BTRFS_EXTENT_DATA_KEY;
3330 if (ref_offset > ((u64)-1 << 32))
3331 key.offset = 0;
3332 else
3333 key.offset = ref_offset;
3328 3334
3329 path->search_commit_root = 1; 3335 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3336 path->skip_locking = 1;
@@ -3645,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3651 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3652 * is no reservation in transaction handle.
3647 */ 3653 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3654 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3655 rc->extent_root->nodesize * 256);
3650 if (ret) 3656 if (ret)
3651 return ret; 3657 return ret;
3652 3658
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3659 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3660 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3661 rc->extents_found = 0;
@@ -3777,8 +3780,7 @@ restart:
3777 } 3780 }
3778 } 3781 }
3779 3782
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3783 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3784 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3785 if (ret != -EAGAIN) {
3784 err = ret; 3786 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
264 ret = paths_from_inode(inum, ipath);
265
266 if (ret < 0)
267 goto err;
268
269 /*
270 * we deliberately ignore the fact that ipath might have been too
271 * small to hold all of the paths here
272 */
273 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
274 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
275 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
276 "length %llu, links %u (path: %s)\n", swarn->errstr,
277 swarn->logical, swarn->dev->name,
278 (unsigned long long)swarn->sector, root, inum, offset,
279 min(isize - offset, (u64)PAGE_SIZE), nlink,
280 (char *)(unsigned long)ipath->fspath->val[i]);
281
282 free_ipath(ipath);
283 return 0;
284
285err:
286 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
287 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
288 "resolving failed with ret=%d\n", swarn->errstr,
289 swarn->logical, swarn->dev->name,
290 (unsigned long long)swarn->sector, root, inum, offset, ret);
291
292 free_ipath(ipath);
293 return 0;
294}
295
296static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
297 int ix)
298{
299 struct btrfs_device *dev = sbio->sdev->dev;
300 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
301 struct btrfs_path *path;
302 struct btrfs_key found_key;
303 struct extent_buffer *eb;
304 struct btrfs_extent_item *ei;
305 struct scrub_warning swarn;
306 u32 item_size;
307 int ret;
308 u64 ref_root;
309 u8 ref_level;
310 unsigned long ptr = 0;
311 const int bufsize = 4096;
312 u64 extent_offset;
313
314 path = btrfs_alloc_path();
315
316 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
317 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
319 swarn.logical = sbio->logical + ix * PAGE_SIZE;
320 swarn.errstr = errstr;
321 swarn.dev = dev;
322 swarn.msg_bufsize = bufsize;
323 swarn.scratch_bufsize = bufsize;
324
325 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
326 goto out;
327
328 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
329 if (ret < 0)
330 goto out;
331
332 extent_offset = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset;
334
335 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]);
338
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do {
341 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
342 &ref_root, &ref_level);
343 printk(KERN_WARNING "%s at logical %llu on dev %s, "
344 "sector %llu: metadata %s (level %d) in tree "
345 "%llu\n", errstr, swarn.logical, dev->name,
346 (unsigned long long)swarn.sector,
347 ref_level ? "node" : "leaf",
348 ret < 0 ? -1 : ref_level,
349 ret < 0 ? -1 : ref_root);
350 } while (ret != 1);
351 } else {
352 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset,
355 scrub_print_warning_inode, &swarn);
356 }
357
358out:
359 btrfs_free_path(path);
360 kfree(swarn.scratch_buf);
361 kfree(swarn.msg_buf);
362}
363
364static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
365{
366 struct page *page = NULL;
367 unsigned long index;
368 struct scrub_fixup_nodatasum *fixup = ctx;
369 int ret;
370 int corrected = 0;
371 struct btrfs_key key;
372 struct inode *inode = NULL;
373 u64 end = offset + PAGE_SIZE - 1;
374 struct btrfs_root *local_root;
375
376 key.objectid = root;
377 key.type = BTRFS_ROOT_ITEM_KEY;
378 key.offset = (u64)-1;
379 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
380 if (IS_ERR(local_root))
381 return PTR_ERR(local_root);
382
383 key.type = BTRFS_INODE_ITEM_KEY;
384 key.objectid = inum;
385 key.offset = 0;
386 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
387 if (IS_ERR(inode))
388 return PTR_ERR(inode);
389
390 index = offset >> PAGE_CACHE_SHIFT;
391
392 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
393 if (!page) {
394 ret = -ENOMEM;
395 goto out;
396 }
397
398 if (PageUptodate(page)) {
399 struct btrfs_mapping_tree *map_tree;
400 if (PageDirty(page)) {
401 /*
402 * we need to write the data to the defective sector. the
403 * data that was in that sector is not in memory,
404 * because the page was modified. we must not write the
405 * modified page to that sector.
406 *
407 * TODO: what could be done here: wait for the delalloc
408 * runner to write out that page (might involve
409 * COW) and see whether the sector is still
410 * referenced afterwards.
411 *
412 * For the time being, we'll treat this error as
413 * uncorrectable, although there is a chance that a
414 * later scrub will find the bad sector again and that
415 * there will be no dirty page in memory by then.
416 */
417 ret = -EIO;
418 goto out;
419 }
420 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
421 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
422 fixup->logical, page,
423 fixup->mirror_num);
424 unlock_page(page);
425 corrected = !ret;
426 } else {
427 /*
428 * we need to get good data first. the general readpage path
429 * will call repair_io_failure for us, we just have to make
430 * sure we read the bad mirror.
431 */
432 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
433 EXTENT_DAMAGED, GFP_NOFS);
434 if (ret) {
435 /* set_extent_bits should give proper error */
436 WARN_ON(ret > 0);
437 if (ret > 0)
438 ret = -EFAULT;
439 goto out;
440 }
441
442 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
443 btrfs_get_extent,
444 fixup->mirror_num);
445 wait_on_page_locked(page);
446
447 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
448 end, EXTENT_DAMAGED, 0, NULL);
449 if (!corrected)
450 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
451 EXTENT_DAMAGED, GFP_NOFS);
452 }
453
454out:
455 if (page)
456 put_page(page);
457 if (inode)
458 iput(inode);
459
460 if (ret < 0)
461 return ret;
462
463 if (ret == 0 && corrected) {
464 /*
465 * we only need to call readpage for one of the inodes belonging
466 * to this extent. so make iterate_extent_inodes stop
467 */
468 return 1;
469 }
470
471 return -EIO;
472}
473
474static void scrub_fixup_nodatasum(struct btrfs_work *work)
475{
476 int ret;
477 struct scrub_fixup_nodatasum *fixup;
478 struct scrub_dev *sdev;
479 struct btrfs_trans_handle *trans = NULL;
480 struct btrfs_fs_info *fs_info;
481 struct btrfs_path *path;
482 int uncorrectable = 0;
483
484 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
485 sdev = fixup->sdev;
486 fs_info = fixup->root->fs_info;
487
488 path = btrfs_alloc_path();
489 if (!path) {
490 spin_lock(&sdev->stat_lock);
491 ++sdev->stat.malloc_errors;
492 spin_unlock(&sdev->stat_lock);
493 uncorrectable = 1;
494 goto out;
495 }
496
497 trans = btrfs_join_transaction(fixup->root);
498 if (IS_ERR(trans)) {
499 uncorrectable = 1;
500 goto out;
501 }
502
503 /*
504 * the idea is to trigger a regular read through the standard path. we
505 * read a page from the (failed) logical address by specifying the
506 * corresponding copynum of the failed sector. thus, that readpage is
507 * expected to fail.
508 * that is the point where on-the-fly error correction will kick in
509 * (once it's finished) and rewrite the failed sector if a good copy
510 * can be found.
511 */
512 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
513 path, scrub_fixup_readpage,
514 fixup);
515 if (ret < 0) {
516 uncorrectable = 1;
517 goto out;
518 }
519 WARN_ON(ret != 1);
520
521 spin_lock(&sdev->stat_lock);
522 ++sdev->stat.corrected_errors;
523 spin_unlock(&sdev->stat_lock);
524
525out:
526 if (trans && !IS_ERR(trans))
527 btrfs_end_transaction(trans, fixup->root);
528 if (uncorrectable) {
529 spin_lock(&sdev->stat_lock);
530 ++sdev->stat.uncorrectable_errors;
531 spin_unlock(&sdev->stat_lock);
532 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
533 "(nodatasum) error at logical %llu\n",
534 fixup->logical);
535 }
536
537 btrfs_free_path(path);
538 kfree(fixup);
539
540 /* see the caller for why we pretend to be paused in the scrub counters */
541 mutex_lock(&fs_info->scrub_lock);
542 atomic_dec(&fs_info->scrubs_running);
543 atomic_dec(&fs_info->scrubs_paused);
544 mutex_unlock(&fs_info->scrub_lock);
545 atomic_dec(&sdev->fixup_cnt);
546 wake_up(&fs_info->scrub_pause_wait);
547 wake_up(&sdev->list_wait);
548}
549
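
The counter dance at the end of scrub_fixup_nodatasum pairs with the increments done in scrub_fixup when the worker is queued: a pending fixup is counted as both running (so cancellation waits for it) and paused (so pause requests issued around transaction commits do not wait for it). A schematic userspace model of that pairing, with made-up names, just to show the bookkeeping:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int scrubs_running, scrubs_paused, fixup_cnt;

    static void fixup_queue(void)   /* what scrub_fixup does before queueing */
    {
            atomic_fetch_add(&scrubs_running, 1);
            atomic_fetch_add(&scrubs_paused, 1);
            atomic_fetch_add(&fixup_cnt, 1);
    }

    static void fixup_done(void)    /* what the worker does when finished */
    {
            atomic_fetch_sub(&scrubs_running, 1);
            atomic_fetch_sub(&scrubs_paused, 1);
            atomic_fetch_sub(&fixup_cnt, 1);
    }

    int main(void)
    {
            fixup_queue();
            fixup_done();
            printf("running=%d paused=%d fixups=%d\n",
                   atomic_load(&scrubs_running),
                   atomic_load(&scrubs_paused),
                   atomic_load(&fixup_cnt));
            return 0;
    }
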
198/* 550/*
199 * scrub_recheck_error gets called when either verification of the page 551 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 552 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 553 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 554 * one may be bad
203 */ 555 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 556static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 557{
558 struct scrub_dev *sdev = sbio->sdev;
559 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
560 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
561 DEFAULT_RATELIMIT_BURST);
562
206 if (sbio->err) { 563 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 564 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 565 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 566 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 567 return 0;
212 } 568 }
569 if (__ratelimit(&_rs))
570 scrub_print_warning("i/o error", sbio, ix);
571 } else {
572 if (__ratelimit(&_rs))
573 scrub_print_warning("checksum error", sbio, ix);
213 } 574 }
214 575
576 spin_lock(&sdev->stat_lock);
577 ++sdev->stat.read_errors;
578 spin_unlock(&sdev->stat_lock);
579
215 scrub_fixup(sbio, ix); 580 scrub_fixup(sbio, ix);
581 return 1;
216} 582}
217 583
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 584static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 616 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 617 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 618 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 619 struct btrfs_bio *bbio = NULL;
620 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 621 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 622 u64 length;
256 int i; 623 int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 626
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 627 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 628 (sbio->spag[ix].have_csum == 0)) {
629 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
630 if (!fixup)
631 goto uncorrectable;
632 fixup->sdev = sdev;
633 fixup->logical = logical;
634 fixup->root = fs_info->extent_root;
635 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 636 /*
263 * nodatasum, don't try to fix anything 637 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 638 * completing as long as a fixup worker is running. we must also
265 * writeback 639 * increment scrubs_paused to prevent deadlocking on pause
640 * requests used for transaction commits (as the worker uses a
641 * transaction context). it is safe to regard the fixup worker
642 * as paused for all practical purposes. effectively, we only
643 * avoid cancellation requests from completing.
266 */ 644 */
267 goto uncorrectable; 645 mutex_lock(&fs_info->scrub_lock);
646 atomic_inc(&fs_info->scrubs_running);
647 atomic_inc(&fs_info->scrubs_paused);
648 mutex_unlock(&fs_info->scrub_lock);
649 atomic_inc(&sdev->fixup_cnt);
650 fixup->work.func = scrub_fixup_nodatasum;
651 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
652 return;
268 } 653 }
269 654
270 length = PAGE_SIZE; 655 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 656 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 657 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 658 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 659 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 660 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 661 (unsigned long long)logical);
277 WARN_ON(1); 662 WARN_ON(1);
663 kfree(bbio);
278 return; 664 return;
279 } 665 }
280 666
281 if (multi->num_stripes == 1) 667 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 668 /* there aren't any replicas */
283 goto uncorrectable; 669 goto uncorrectable;
284 670
285 /* 671 /*
286 * first find a good copy 672 * first find a good copy
287 */ 673 */
288 for (i = 0; i < multi->num_stripes; ++i) { 674 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 675 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 676 continue;
291 677
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 678 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 679 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 680 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 681 /* I/O-error, this is not a good copy */
296 continue; 682 continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 685 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 686 break;
301 } 687 }
302 if (i == multi->num_stripes) 688 if (i == bbio->num_stripes)
303 goto uncorrectable; 689 goto uncorrectable;
304 690
305 if (!sdev->readonly) { 691 if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 700 }
315 } 701 }
316 702
317 kfree(multi); 703 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 704 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 705 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
321 707
322 if (printk_ratelimit()) 708 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 709 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 710 return;
326 711
327uncorrectable: 712uncorrectable:
328 kfree(multi); 713 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 714 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 715 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 716 spin_unlock(&sdev->stat_lock);
332 717
333 if (printk_ratelimit()) 718 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 719 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 720}
337 721
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 722static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 766 int ret;
383 767
384 if (sbio->err) { 768 if (sbio->err) {
769 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 770 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 771 ret |= scrub_recheck_error(sbio, i);
772 if (!ret) {
773 spin_lock(&sdev->stat_lock);
774 ++sdev->stat.unverified_errors;
775 spin_unlock(&sdev->stat_lock);
776 }
387 777
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 778 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 779 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 786 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 787 bi->bv_len = PAGE_SIZE;
398 } 788 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 789 goto out;
404 } 790 }
405 for (i = 0; i < sbio->count; ++i) { 791 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 806 WARN_ON(1);
421 } 807 }
422 kunmap_atomic(buffer, KM_USER0); 808 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 809 if (ret) {
424 scrub_recheck_error(sbio, i); 810 ret = scrub_recheck_error(sbio, i);
811 if (!ret) {
812 spin_lock(&sdev->stat_lock);
813 ++sdev->stat.unverified_errors;
814 spin_unlock(&sdev->stat_lock);
815 }
816 }
425 } 817 }
426 818
427out: 819out:
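
With scrub_recheck_error now returning whether an error was actually confirmed, the bio-error path in scrub_checksum ORs the per-page results and bumps the new unverified_errors counter only when every recheck came back clean; the per-page checksum path bumps the same counter whenever a single failing page rechecks clean. A small self-contained model of the aggregation (data and names invented for illustration):

    #include <stdio.h>

    /* 1 = the page still fails its recheck, 0 = recheck came back clean */
    static int recheck_page(int still_bad)
    {
            return still_bad ? 1 : 0;
    }

    int main(void)
    {
            int pages[] = { 0, 0, 1, 0 };
            int unverified_errors = 0;
            int ret = 0;
            unsigned int i;

            for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
                    ret |= recheck_page(pages[i]);
            if (!ret)
                    unverified_errors++;    /* nothing confirmed: unverified */

            printf("ret=%d unverified_errors=%d\n", ret, unverified_errors);
            return 0;
    }
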
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
558{ 950{
559 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 952
563 if (sdev->curr == -1) 953 if (sdev->curr == -1)
564 return 0; 954 return 0;
565 955
566 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 957 sbio->err = 0;
593 sdev->curr = -1; 958 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
595 960
596 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
597 962
598 return 0; 963 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 964}
605 965
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 967 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 968 u8 *csum, int force)
609{ 969{
610 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
611 973
612again: 974again:
613 /* 975 /*
@@ -628,12 +990,22 @@ again:
628 } 990 }
629 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
631 sbio->physical = physical; 995 sbio->physical = physical;
632 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
638 if (ret) 1010 if (ret)
639 return ret; 1011 return ret;
@@ -643,6 +1015,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
646 if (csum) { 1032 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1087
702/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1091{
706 int ret; 1092 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1127 int slot;
742 int i; 1128 int i;
743 u64 nstripes; 1129 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1130 struct extent_buffer *l;
746 struct btrfs_key key; 1131 struct btrfs_key key;
747 u64 physical; 1132 u64 physical;
748 u64 logical; 1133 u64 logical;
749 u64 generation; 1134 u64 generation;
750 u64 mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
751 1140
752 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
753 u64 offset; 1142 u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
779 if (!path) 1168 if (!path)
780 return -ENOMEM; 1169 return -ENOMEM;
781 1170
782 path->reada = 2;
783 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
784 path->skip_locking = 1; 1172 path->skip_locking = 1;
785 1173
786 /* 1174 /*
787 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for the extent tree and csum tree and wait for
788 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
790 */ 1178 */
791 logical = base + offset; 1179 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802 1180
803 /* 1181 wait_event(sdev->list_wait,
804 * we might miss half an extent here, but that doesn't matter, 1182 atomic_read(&sdev->in_flight) == 0);
805 * as it's only the prefetch 1183 atomic_inc(&fs_info->scrubs_paused);
806 */ 1184 wake_up(&fs_info->scrub_pause_wait);
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816
817 break;
818 }
819 btrfs_item_key_to_cpu(l, &key, slot);
820 1185
821 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
822 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
823 1207
824 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
825 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
830 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
831 1218
832 /* 1219 /*
833 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1221 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1222 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1224
847 logical += increment;
848 cond_resched();
849 }
850 /* 1225 /*
851 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
852 */ 1227 */
853 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
855 ret = 0; 1230 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
857 /* 1232 /*
858 * canceled? 1233 * canceled?
859 */ 1234 */
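
One detail worth calling out in this hunk: mirror_num switches from a 0-based stripe index to the 1-based numbering used by the mapping code (0 generally being reserved for "pick any copy"), which is also why scrub_fixup above now skips the bad copy with i + 1 == mirror_num. A tiny illustration of the old versus new numbering (standalone, not from the patch):

    #include <stdio.h>

    int main(void)
    {
            int num_stripes = 2;    /* e.g. a two-copy RAID1 chunk */
            int num;

            for (num = 0; num < num_stripes; num++) {
                    int old_mirror = num % num_stripes;        /* was 0-based */
                    int new_mirror = num % num_stripes + 1;    /* now 1-based */
                    printf("stripe %d: old mirror_num=%d, new mirror_num=%d\n",
                           num, old_mirror, new_mirror);
            }
            return 0;
    }
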
@@ -882,11 +1257,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1260 }
889 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
890 key.objectid = logical; 1268 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
982 1360
983out: 1361out:
984 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1363 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
988} 1365}
@@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1158static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1159{ 1536{
1160 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1161 1539
1162 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1163 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1164 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1165 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1166 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1167 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1168 } 1548 }
1169 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1170 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1171 1552
1172 return 0; 1553 return ret;
1173} 1554}
1174 1555
1175static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
@@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1634 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1635
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1636 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1637 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1638 wake_up(&fs_info->scrub_pause_wait);
1259 1639
1640 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1641
1260 if (progress) 1642 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1643 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1644
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..34a8b6112ea4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,8 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
43#include "compat.h" 45#include "compat.h"
44#include "delayed-inode.h" 46#include "delayed-inode.h"
45#include "ctree.h" 47#include "ctree.h"
@@ -58,6 +60,7 @@
58#include <trace/events/btrfs.h> 60#include <trace/events/btrfs.h>
59 61
60static const struct super_operations btrfs_super_ops; 62static const struct super_operations btrfs_super_ops;
63static struct file_system_type btrfs_fs_type;
61 64
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 65static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 66 char nbuf[16])
@@ -162,7 +165,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 165 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 166 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 167 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 168 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 169};
167 170
168static match_table_t tokens = { 171static match_table_t tokens = {
@@ -195,6 +198,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 198 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 199 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 200 {Opt_inode_cache, "inode_cache"},
201 {Opt_no_space_cache, "nospace_cache"},
202 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 203 {Opt_err, NULL},
199}; 204};
200 205
@@ -206,14 +211,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 211{
207 struct btrfs_fs_info *info = root->fs_info; 212 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 213 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 214 char *p, *num, *orig = NULL;
215 u64 cache_gen;
210 int intarg; 216 int intarg;
211 int ret = 0; 217 int ret = 0;
212 char *compress_type; 218 char *compress_type;
213 bool compress_force = false; 219 bool compress_force = false;
214 220
221 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
222 if (cache_gen)
223 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
224
215 if (!options) 225 if (!options)
216 return 0; 226 goto out;
217 227
218 /* 228 /*
219 * strsep changes the string, duplicate it because parse_options 229 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +370,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 370 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 371 break;
362 case Opt_space_cache: 372 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 373 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 374 break;
375 case Opt_no_space_cache:
376 printk(KERN_INFO "btrfs: disabling disk space caching\n");
377 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
378 break;
366 case Opt_inode_cache: 379 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 380 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 381 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +394,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 394 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 395 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 396 break;
397 case Opt_recovery:
398 printk(KERN_INFO "btrfs: enabling auto recovery");
399 btrfs_set_opt(info->mount_opt, RECOVERY);
400 break;
384 case Opt_err: 401 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 402 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 403 "'%s'\n", p);
@@ -391,6 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 408 }
392 } 409 }
393out: 410out:
411 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
412 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 413 kfree(orig);
395 return ret; 414 return ret;
396} 415}
@@ -406,12 +425,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 425 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 426{
408 substring_t args[MAX_OPT_ARGS]; 427 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 428 char *device_name, *opts, *orig, *p;
410 int error = 0; 429 int error = 0;
411 int intarg; 430 int intarg;
412 431
413 if (!options) 432 if (!options)
414 goto out; 433 return 0;
415 434
416 /* 435 /*
417 * strsep changes the string, duplicate it because parse_options 436 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 449 token = match_token(p, tokens, args);
431 switch (token) { 450 switch (token) {
432 case Opt_subvol: 451 case Opt_subvol:
452 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 453 *subvol_name = match_strdup(&args[0]);
434 break; 454 break;
435 case Opt_subvolid: 455 case Opt_subvolid:
@@ -457,29 +477,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 477 }
458 break; 478 break;
459 case Opt_device: 479 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 480 device_name = match_strdup(&args[0]);
481 if (!device_name) {
482 error = -ENOMEM;
483 goto out;
484 }
485 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 486 flags, holder, fs_devices);
487 kfree(device_name);
462 if (error) 488 if (error)
463 goto out_free_opts; 489 goto out;
464 break; 490 break;
465 default: 491 default:
466 break; 492 break;
467 } 493 }
468 } 494 }
469 495
470 out_free_opts: 496out:
471 kfree(orig); 497 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 498 return error;
484} 499}
485 500
@@ -492,7 +507,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 507 struct btrfs_path *path;
493 struct btrfs_key location; 508 struct btrfs_key location;
494 struct inode *inode; 509 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 510 u64 dir_id;
497 int new = 0; 511 int new = 0;
498 512
@@ -517,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 531 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 532 * to mount.
519 */ 533 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 534 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 535 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 536 if (IS_ERR(di)) {
523 btrfs_free_path(path); 537 btrfs_free_path(path);
@@ -566,29 +580,7 @@ setup_root:
566 return dget(sb->s_root); 580 return dget(sb->s_root);
567 } 581 }
568 582
569 if (new) { 583 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 584}
593 585
594static int btrfs_fill_super(struct super_block *sb, 586static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +711,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 711 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 712 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 713 seq_puts(seq, ",space_cache");
714 else
715 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 716 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 717 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 718 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +747,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 747 return set_anon_super(s, data);
754} 748}
755 749
750/*
751 * subvolumes are identified by ino 256
752 */
753static inline int is_subvolume_inode(struct inode *inode)
754{
755 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
756 return 1;
757 return 0;
758}
759
760/*
761 * This will strip out the subvol=%s argument for an argument string and add
762 * subvolid=0 to make sure we get the actual tree root for path walking to the
763 * subvol we want.
764 */
765static char *setup_root_args(char *args)
766{
767 unsigned copied = 0;
768 unsigned len = strlen(args) + 2;
769 char *pos;
770 char *ret;
771
772 /*
773 * We need the same args as before, but minus
774 *
775 * subvol=a
776 *
777 * and add
778 *
779 * subvolid=0
780 *
781 * which is a difference of 2 characters, so we allocate strlen(args) +
782 * 2 characters.
783 */
784 ret = kzalloc(len * sizeof(char), GFP_NOFS);
785 if (!ret)
786 return NULL;
787 pos = strstr(args, "subvol=");
788
789 /* This shouldn't happen, but just in case.. */
790 if (!pos) {
791 kfree(ret);
792 return NULL;
793 }
794
795 /*
796 * The subvol=<> arg is not at the front of the string, so copy everything
797 * up to it into ret.
798 */
799 if (pos != args) {
800 *pos = '\0';
801 strcpy(ret, args);
802 copied += strlen(args);
803 pos++;
804 }
805
806 strncpy(ret + copied, "subvolid=0", len - copied);
807
808 /* Length of subvolid=0 */
809 copied += 10;
810
811 /*
812 * If there is no , after the subvol= option then we know there's no
813 * other options and we can just return.
814 */
815 pos = strchr(pos, ',');
816 if (!pos)
817 return ret;
818
819 /* Copy the rest of the arguments into our buffer */
820 strncpy(ret + copied, pos, len - copied);
821 copied += strlen(pos);
822
823 return ret;
824}
825
826static struct dentry *mount_subvol(const char *subvol_name, int flags,
827 const char *device_name, char *data)
828{
829 struct super_block *s;
830 struct dentry *root;
831 struct vfsmount *mnt;
832 struct mnt_namespace *ns_private;
833 char *newargs;
834 struct path path;
835 int error;
836
837 newargs = setup_root_args(data);
838 if (!newargs)
839 return ERR_PTR(-ENOMEM);
840 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
841 newargs);
842 kfree(newargs);
843 if (IS_ERR(mnt))
844 return ERR_CAST(mnt);
845
846 ns_private = create_mnt_ns(mnt);
847 if (IS_ERR(ns_private)) {
848 mntput(mnt);
849 return ERR_CAST(ns_private);
850 }
851
852 /*
853 * This will trigger the automount of the subvol so we can just
854 * drop the mnt we have here and return the dentry that we
855 * found.
856 */
857 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
858 LOOKUP_FOLLOW, &path);
859 put_mnt_ns(ns_private);
860 if (error)
861 return ERR_PTR(error);
862
863 if (!is_subvolume_inode(path.dentry->d_inode)) {
864 path_put(&path);
865 mntput(mnt);
866 error = -EINVAL;
867 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
868 subvol_name);
869 return ERR_PTR(-EINVAL);
870 }
871
872 /* Get a ref to the sb and the dentry we found and return it */
873 s = path.mnt->mnt_sb;
874 atomic_inc(&s->s_active);
875 root = dget(path.dentry);
876 path_put(&path);
877 down_write(&s->s_umount);
878
879 return root;
880}
756 881
757/* 882/*
758 * Find a superblock for the given device / mount point. 883 * Find a superblock for the given device / mount point.
@@ -767,7 +892,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 892 struct super_block *s;
768 struct dentry *root; 893 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 894 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 895 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 896 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 897 char *subvol_name = NULL;
@@ -781,21 +905,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 905 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 906 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 907 &subvol_rootid, &fs_devices);
784 if (error) 908 if (error) {
909 kfree(subvol_name);
785 return ERR_PTR(error); 910 return ERR_PTR(error);
911 }
786 912
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 913 if (subvol_name) {
788 if (error) 914 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 915 kfree(subvol_name);
916 return root;
917 }
790 918
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 919 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 920 if (error)
793 goto error_free_subvol_name; 921 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 922
800 /* 923 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 924 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +927,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 927 * then open_ctree will properly initialize everything later.
805 */ 928 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 929 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 930 if (!fs_info)
808 if (!fs_info || !tree_root) { 931 return ERR_PTR(-ENOMEM);
932
933 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
934 if (!fs_info->tree_root) {
809 error = -ENOMEM; 935 error = -ENOMEM;
810 goto error_close_devices; 936 goto error_fs_info;
811 } 937 }
812 fs_info->tree_root = tree_root; 938 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 939 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 940
941 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
942 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
943 if (!fs_info->super_copy || !fs_info->super_for_commit) {
944 error = -ENOMEM;
945 goto error_fs_info;
946 }
947
948 error = btrfs_open_devices(fs_devices, mode, fs_type);
949 if (error)
950 goto error_fs_info;
951
952 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
953 error = -EACCES;
954 goto error_close_devices;
955 }
815 956
816 bdev = fs_devices->latest_bdev; 957 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 958 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 959 fs_info->tree_root);
819 goto error_s; 960 if (IS_ERR(s)) {
961 error = PTR_ERR(s);
962 goto error_close_devices;
963 }
820 964
821 if (s->s_root) { 965 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 966 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +970,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 970 }
827 971
828 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 973 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 974 } else {
832 char b[BDEVNAME_SIZE]; 975 char b[BDEVNAME_SIZE];
833 976
834 s->s_flags = flags | MS_NOSEC; 977 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 978 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
979 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 980 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 981 flags & MS_SILENT ? 1 : 0);
838 if (error) { 982 if (error) {
839 deactivate_locked_super(s); 983 deactivate_locked_super(s);
840 goto error_free_subvol_name; 984 return ERR_PTR(error);
841 } 985 }
842 986
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 987 s->s_flags |= MS_ACTIVE;
845 } 988 }
846 989
847 /* if they gave us a subvolume name bind mount into that */ 990 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 991 if (IS_ERR(root)) {
849 struct dentry *new_root; 992 deactivate_locked_super(s);
850 993 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 994 }
886 995
887 kfree(subvol_name);
888 return root; 996 return root;
889 997
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 998error_close_devices:
893 btrfs_close_devices(fs_devices); 999 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 1000error_fs_info:
895 kfree(tree_root); 1001 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 1002 return ERR_PTR(error);
899} 1003}
900 1004
@@ -919,7 +1023,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1023 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1024 return -EACCES;
921 1025
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1026 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1027 return -EINVAL;
924 1028
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1029 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -976,11 +1080,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
976 u64 avail_space; 1080 u64 avail_space;
977 u64 used_space; 1081 u64 used_space;
978 u64 min_stripe_size; 1082 u64 min_stripe_size;
979 int min_stripes = 1; 1083 int min_stripes = 1, num_stripes = 1;
980 int i = 0, nr_devices; 1084 int i = 0, nr_devices;
981 int ret; 1085 int ret;
982 1086
983 nr_devices = fs_info->fs_devices->rw_devices; 1087 nr_devices = fs_info->fs_devices->open_devices;
984 BUG_ON(!nr_devices); 1088 BUG_ON(!nr_devices);
985 1089
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1090 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -990,20 +1094,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
990 1094
991 /* calc min stripe number for data space allocation */ 1095 /* calc min stripe number for data space allocation */
992 type = btrfs_get_alloc_profile(root, 1); 1096 type = btrfs_get_alloc_profile(root, 1);
993 if (type & BTRFS_BLOCK_GROUP_RAID0) 1097 if (type & BTRFS_BLOCK_GROUP_RAID0) {
994 min_stripes = 2; 1098 min_stripes = 2;
995 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1099 num_stripes = nr_devices;
1100 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
996 min_stripes = 2; 1101 min_stripes = 2;
997 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1102 num_stripes = 2;
1103 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
998 min_stripes = 4; 1104 min_stripes = 4;
1105 num_stripes = 4;
1106 }
999 1107
1000 if (type & BTRFS_BLOCK_GROUP_DUP) 1108 if (type & BTRFS_BLOCK_GROUP_DUP)
1001 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1109 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1002 else 1110 else
1003 min_stripe_size = BTRFS_STRIPE_LEN; 1111 min_stripe_size = BTRFS_STRIPE_LEN;
1004 1112
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1113 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1006 if (!device->in_fs_metadata) 1114 if (!device->in_fs_metadata || !device->bdev)
1007 continue; 1115 continue;
1008 1116
1009 avail_space = device->total_bytes - device->bytes_used; 1117 avail_space = device->total_bytes - device->bytes_used;
@@ -1064,13 +1172,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1064 i = nr_devices - 1; 1172 i = nr_devices - 1;
1065 avail_space = 0; 1173 avail_space = 0;
1066 while (nr_devices >= min_stripes) { 1174 while (nr_devices >= min_stripes) {
1175 if (num_stripes > nr_devices)
1176 num_stripes = nr_devices;
1177
1067 if (devices_info[i].max_avail >= min_stripe_size) { 1178 if (devices_info[i].max_avail >= min_stripe_size) {
1068 int j; 1179 int j;
1069 u64 alloc_size; 1180 u64 alloc_size;
1070 1181
1071 avail_space += devices_info[i].max_avail * min_stripes; 1182 avail_space += devices_info[i].max_avail * num_stripes;
1072 alloc_size = devices_info[i].max_avail; 1183 alloc_size = devices_info[i].max_avail;
1073 for (j = i + 1 - min_stripes; j <= i; j++) 1184 for (j = i + 1 - num_stripes; j <= i; j++)
1074 devices_info[j].max_avail -= alloc_size; 1185 devices_info[j].max_avail -= alloc_size;
1075 } 1186 }
1076 i--; 1187 i--;
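
The estimator above is easier to follow outside the kernel: the device list is sorted with the largest free space first, and each pass carves one virtual stripe set off the tail of the list, clamping num_stripes to however many devices are left. A minimal userspace sketch of that loop, assuming a max_avail[] array already sorted largest-first (which also keeps the subtraction from underflowing) and purely illustrative names:

#include <stdio.h>

static unsigned long long estimate_free(unsigned long long *max_avail,
					int nr_devices, int min_stripes,
					int num_stripes,
					unsigned long long min_stripe_size)
{
	unsigned long long avail = 0;
	int i = nr_devices - 1;	/* walk from the device with the least free space */

	while (nr_devices >= min_stripes) {
		if (num_stripes > nr_devices)
			num_stripes = nr_devices;

		if (max_avail[i] >= min_stripe_size) {
			unsigned long long alloc = max_avail[i];
			int j;

			/* one stripe set: alloc bytes from each of num_stripes devices */
			avail += alloc * num_stripes;
			for (j = i + 1 - num_stripes; j <= i; j++)
				max_avail[j] -= alloc;
		}
		i--;
		nr_devices--;
	}
	return avail;
}

int main(void)
{
	/* three devices sorted largest-first: 4G, 2G, 1G free; RAID0-like profile */
	unsigned long long avail[] = { 4ULL << 30, 2ULL << 30, 1ULL << 30 };

	printf("%llu bytes usable\n",
	       estimate_free(avail, 3, 2, 3, 64ULL << 10));
	return 0;
}
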
@@ -1085,7 +1196,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1196static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1197{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1198 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1199 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1200 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1201 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1202 u64 total_used = 0;
@@ -1187,6 +1298,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1187 return 0; 1298 return 0;
1188} 1299}
1189 1300
1301static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1302{
1303 int ret;
1304
1305 ret = btrfs_dirty_inode(inode);
1306 if (ret)
1307 printk_ratelimited(KERN_ERR "btrfs: failed to dirty inode %Lu "
1308 "error %d\n", btrfs_ino(inode), ret);
1309}
1310
1190static const struct super_operations btrfs_super_ops = { 1311static const struct super_operations btrfs_super_ops = {
1191 .drop_inode = btrfs_drop_inode, 1312 .drop_inode = btrfs_drop_inode,
1192 .evict_inode = btrfs_evict_inode, 1313 .evict_inode = btrfs_evict_inode,
@@ -1194,7 +1315,7 @@ static const struct super_operations btrfs_super_ops = {
1194 .sync_fs = btrfs_sync_fs, 1315 .sync_fs = btrfs_sync_fs,
1195 .show_options = btrfs_show_options, 1316 .show_options = btrfs_show_options,
1196 .write_inode = btrfs_write_inode, 1317 .write_inode = btrfs_write_inode,
1197 .dirty_inode = btrfs_dirty_inode, 1318 .dirty_inode = btrfs_fs_dirty_inode,
1198 .alloc_inode = btrfs_alloc_inode, 1319 .alloc_inode = btrfs_alloc_inode,
1199 .destroy_inode = btrfs_destroy_inode, 1320 .destroy_inode = btrfs_destroy_inode,
1200 .statfs = btrfs_statfs, 1321 .statfs = btrfs_statfs,
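
The btrfs_fs_dirty_inode() wrapper added above exists because the ->dirty_inode callback in struct super_operations does not return an error, so the only thing the filesystem can do with a failure from btrfs_dirty_inode() is log it. A stripped-down userspace sketch of the same wrapper shape; the names and the fake error value are illustrative, not kernel APIs:

#include <stdio.h>

/* stand-in for the real update routine, which can fail */
static int my_dirty_inode(unsigned long long ino)
{
	return ino == 13 ? -5 /* pretend -EIO */ : 0;
}

/* void wrapper matching a callback signature that cannot return errors */
static void my_fs_dirty_inode(unsigned long long ino)
{
	int ret = my_dirty_inode(ino);

	if (ret)
		fprintf(stderr, "fs: failed to dirty inode %llu, error %d\n",
			ino, ret);
}

int main(void)
{
	my_fs_dirty_inode(12);	/* succeeds silently */
	my_fs_dirty_inode(13);	/* failure is logged, not propagated */
	return 0;
}
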
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
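
The "goto loop" rework in join_transaction() above is the usual drop-the-lock-to-allocate pattern: the allocation can sleep, so the spinlock is released around it, and if another thread installed a running transaction in the meantime the loser frees its copy and re-runs every check under the lock (including the trans_no_join test) instead of blindly attaching to the winner. A userspace sketch of that shape, assuming illustrative names and a pthread mutex in place of the spinlock:

#include <pthread.h>
#include <stdlib.h>

struct trans { int users; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct trans *running;
static int no_join;

static struct trans *join_transaction(void)
{
	struct trans *t;

	pthread_mutex_lock(&lock);
loop:
	if (no_join) {
		pthread_mutex_unlock(&lock);
		return NULL;
	}
	if (running) {
		running->users++;		/* join the existing transaction */
		pthread_mutex_unlock(&lock);
		return running;
	}
	pthread_mutex_unlock(&lock);

	t = calloc(1, sizeof(*t));		/* allocation may sleep: lock dropped */
	if (!t)
		return NULL;

	pthread_mutex_lock(&lock);
	if (running) {
		/* someone raced in; discard ours and redo all the checks */
		free(t);
		goto loop;
	}
	t->users = 1;
	running = t;
	pthread_mutex_unlock(&lock);
	return t;
}

int main(void)
{
	return join_transaction() ? 0 : 1;
}
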
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv gets used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
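
The rewrite of btrfs_write_marked_extents() and btrfs_wait_marked_extents() above replaces the hand-rolled page walk with two passes over the dirty_pages ranges: the write pass converts the dirty mark into EXTENT_NEED_WAIT and starts writeback with filemap_fdatawrite_range(), and the wait pass clears EXTENT_NEED_WAIT and calls filemap_fdatawait_range(), with the first recorded error winning. A toy userspace model of that two-pass flag conversion, using a plain array of ranges instead of an extent tree (all names illustrative):

#include <stdio.h>

#define FLAG_DIRTY     0x1
#define FLAG_NEED_WAIT 0x2

struct range { unsigned long long start, end; int flags; };

/* pass 1: convert DIRTY -> NEED_WAIT and start writeback on each range */
static int write_marked(struct range *r, int n)
{
	int i, werr = 0;

	for (i = 0; i < n; i++) {
		if (!(r[i].flags & FLAG_DIRTY))
			continue;
		r[i].flags = FLAG_NEED_WAIT;
		printf("start writeback %llu-%llu\n", r[i].start, r[i].end);
	}
	return werr;
}

/* pass 2: clear NEED_WAIT and wait for writeback on each range */
static int wait_marked(struct range *r, int n)
{
	int i, werr = 0;

	for (i = 0; i < n; i++) {
		if (!(r[i].flags & FLAG_NEED_WAIT))
			continue;
		r[i].flags &= ~FLAG_NEED_WAIT;
		printf("wait on %llu-%llu\n", r[i].start, r[i].end);
	}
	return werr;
}

int main(void)
{
	struct range rs[] = {
		{ 0,    4095,  FLAG_DIRTY },
		{ 8192, 12287, FLAG_DIRTY },
	};
	int ret  = write_marked(rs, 2);
	int ret2 = wait_marked(rs, 2);

	return ret ? ret : ret2;	/* report the first failure, as above */
}
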
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1150 * any running procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
@@ -366,6 +372,14 @@ static noinline int device_list_add(const char *path,
366 } 372 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 373 INIT_LIST_HEAD(&device->dev_alloc_list);
368 374
375 /* init readahead state */
376 spin_lock_init(&device->reada_lock);
377 device->reada_curr_zone = NULL;
378 atomic_set(&device->reada_in_flight, 0);
379 device->reada_next = 0;
380 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
381 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
382
369 mutex_lock(&fs_devices->device_list_mutex); 383 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 384 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 385 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +611,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 611 set_blocksize(bdev, 4096);
598 612
599 bh = btrfs_read_dev_super(bdev); 613 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 614 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 615 goto error_close;
603 }
604 616
605 disk_super = (struct btrfs_super_block *)bh->b_data; 617 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 618 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +667,7 @@ error:
655 continue; 667 continue;
656 } 668 }
657 if (fs_devices->open_devices == 0) { 669 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 670 ret = -EINVAL;
659 goto out; 671 goto out;
660 } 672 }
661 fs_devices->seeding = seeding; 673 fs_devices->seeding = seeding;
@@ -993,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 1005 key.objectid = device->devid;
994 key.offset = start; 1006 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1007 key.type = BTRFS_DEV_EXTENT_KEY;
996 1008again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1009 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1010 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1011 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1018 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1019 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1020 btrfs_dev_extent_length(leaf, extent) < start);
1021 key = found_key;
1022 btrfs_release_path(path);
1023 goto again;
1009 } else if (ret == 0) { 1024 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1025 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1026 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1028,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1028 }
1014 BUG_ON(ret); 1029 BUG_ON(ret);
1015 1030
1016 if (device->bytes_used > 0) 1031 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1032 u64 len = btrfs_dev_extent_length(leaf, extent);
1033 device->bytes_used -= len;
1034 spin_lock(&root->fs_info->free_chunk_lock);
1035 root->fs_info->free_chunk_space += len;
1036 spin_unlock(&root->fs_info->free_chunk_lock);
1037 }
1018 ret = btrfs_del_item(trans, root, path); 1038 ret = btrfs_del_item(trans, root, path);
1019 1039
1020out: 1040out:
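
Several hunks in this file (freeing a device extent here, plus device add, remove, shrink, chunk allocation and read_one_dev() below) keep the new fs_info->free_chunk_space counter in step: every change to a device's unallocated space is mirrored into one number under free_chunk_lock so statfs can read it cheaply. A userspace sketch of that accounting pattern, assuming illustrative names and a pthread spinlock standing in for the kernel one:

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t free_chunk_lock;
static unsigned long long free_chunk_space;

/* a device extent of 'len' bytes was released back to the device */
static void account_extent_freed(unsigned long long len)
{
	pthread_spin_lock(&free_chunk_lock);
	free_chunk_space += len;
	pthread_spin_unlock(&free_chunk_lock);
}

/* a new chunk consumed 'stripe_size' bytes on each of 'num_stripes' devices */
static void account_chunk_allocated(unsigned long long stripe_size,
				    int num_stripes)
{
	pthread_spin_lock(&free_chunk_lock);
	free_chunk_space -= stripe_size * num_stripes;
	pthread_spin_unlock(&free_chunk_lock);
}

int main(void)
{
	pthread_spin_init(&free_chunk_lock, PTHREAD_PROCESS_PRIVATE);
	account_extent_freed(1ULL << 30);
	account_chunk_allocated(256ULL << 20, 2);
	printf("free_chunk_space = %llu\n", free_chunk_space);
	pthread_spin_destroy(&free_chunk_lock);
	return 0;
}
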
@@ -1356,6 +1376,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1376 if (ret)
1357 goto error_undo; 1377 goto error_undo;
1358 1378
1379 spin_lock(&root->fs_info->free_chunk_lock);
1380 root->fs_info->free_chunk_space = device->total_bytes -
1381 device->bytes_used;
1382 spin_unlock(&root->fs_info->free_chunk_lock);
1383
1359 device->in_fs_metadata = 0; 1384 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1385 btrfs_scrub_cancel_dev(root, device);
1361 1386
@@ -1387,8 +1412,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1412 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1413 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1414
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1415 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1416 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1417
1393 if (cur_devices->open_devices == 0) { 1418 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1419 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1475,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1476 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1477 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1478 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1479 struct btrfs_device *device;
1455 u64 super_flags; 1480 u64 super_flags;
1456 1481
@@ -1592,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1592 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1593 return -EINVAL; 1618 return -EINVAL;
1594 1619
1595 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1596 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1597 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1598 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -1691,15 +1716,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1716 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1717 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1718
1719 spin_lock(&root->fs_info->free_chunk_lock);
1720 root->fs_info->free_chunk_space += device->total_bytes;
1721 spin_unlock(&root->fs_info->free_chunk_lock);
1722
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1723 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1724 root->fs_info->fs_devices->rotating = 1;
1696 1725
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1726 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1727 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1728 total_bytes + device->total_bytes);
1700 1729
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1730 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1731 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1732 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1733 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1734
@@ -1790,7 +1819,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1819 struct btrfs_device *device, u64 new_size)
1791{ 1820{
1792 struct btrfs_super_block *super_copy = 1821 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1822 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1823 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1824 u64 diff = new_size - device->total_bytes;
1796 1825
@@ -1849,7 +1878,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1878static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1879 chunk_offset)
1851{ 1880{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1881 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1882 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1883 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1884 u8 *ptr;
@@ -2175,7 +2204,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2204 bool retried = false;
2176 struct extent_buffer *l; 2205 struct extent_buffer *l;
2177 struct btrfs_key key; 2206 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2207 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2208 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2209 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2210 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2221,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2221 lock_chunks(root);
2193 2222
2194 device->total_bytes = new_size; 2223 device->total_bytes = new_size;
2195 if (device->writeable) 2224 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2225 device->fs_devices->total_rw_bytes -= diff;
2226 spin_lock(&root->fs_info->free_chunk_lock);
2227 root->fs_info->free_chunk_space -= diff;
2228 spin_unlock(&root->fs_info->free_chunk_lock);
2229 }
2197 unlock_chunks(root); 2230 unlock_chunks(root);
2198 2231
2199again: 2232again:
@@ -2257,6 +2290,9 @@ again:
2257 device->total_bytes = old_size; 2290 device->total_bytes = old_size;
2258 if (device->writeable) 2291 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2292 device->fs_devices->total_rw_bytes += diff;
2293 spin_lock(&root->fs_info->free_chunk_lock);
2294 root->fs_info->free_chunk_space += diff;
2295 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2296 unlock_chunks(root);
2261 goto done; 2297 goto done;
2262 } 2298 }
@@ -2292,7 +2328,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2328 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2329 struct btrfs_chunk *chunk, int item_size)
2294{ 2330{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2331 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2332 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2333 u32 array_size;
2298 u8 *ptr; 2334 u8 *ptr;
@@ -2615,6 +2651,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2651 index++;
2616 } 2652 }
2617 2653
2654 spin_lock(&extent_root->fs_info->free_chunk_lock);
2655 extent_root->fs_info->free_chunk_space -= (stripe_size *
2656 map->num_stripes);
2657 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2658
2618 index = 0; 2659 index = 0;
2619 stripe = &chunk->stripe; 2660 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2661 while (index < map->num_stripes) {
@@ -2848,7 +2889,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2889
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2890static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2891 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2892 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2893 int mirror_num)
2853{ 2894{
2854 struct extent_map *em; 2895 struct extent_map *em;
@@ -2866,18 +2907,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2907 int i;
2867 int num_stripes; 2908 int num_stripes;
2868 int max_errors = 0; 2909 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2910 struct btrfs_bio *bbio = NULL;
2870 2911
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2913 stripes_allocated = 1;
2873again: 2914again:
2874 if (multi_ret) { 2915 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2917 GFP_NOFS);
2877 if (!multi) 2918 if (!bbio)
2878 return -ENOMEM; 2919 return -ENOMEM;
2879 2920
2880 atomic_set(&multi->error, 0); 2921 atomic_set(&bbio->error, 0);
2881 } 2922 }
2882 2923
2883 read_lock(&em_tree->lock); 2924 read_lock(&em_tree->lock);
@@ -2898,7 +2939,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2939 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2940 mirror_num = 0;
2900 2941
2901 /* if our multi bio struct is too small, back off and try again */ 2942 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2943 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2945 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2958,11 @@ again:
2917 stripes_required = map->num_stripes; 2958 stripes_required = map->num_stripes;
2918 } 2959 }
2919 } 2960 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2962 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2963 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2964 free_extent_map(em);
2924 kfree(multi); 2965 kfree(bbio);
2925 goto again; 2966 goto again;
2926 } 2967 }
2927 stripe_nr = offset; 2968 stripe_nr = offset;
@@ -2950,7 +2991,7 @@ again:
2950 *length = em->len - offset; 2991 *length = em->len - offset;
2951 } 2992 }
2952 2993
2953 if (!multi_ret) 2994 if (!bbio_ret)
2954 goto out; 2995 goto out;
2955 2996
2956 num_stripes = 1; 2997 num_stripes = 1;
@@ -2975,13 +3016,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3016 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3017 map->num_stripes,
2977 current->pid % map->num_stripes); 3018 current->pid % map->num_stripes);
3019 mirror_num = stripe_index + 1;
2978 } 3020 }
2979 3021
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3022 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3023 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3024 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3025 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3026 stripe_index = mirror_num - 1;
3027 } else {
3028 mirror_num = 1;
3029 }
2985 3030
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3031 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3032 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3046,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3046 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3047 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3048 current->pid % map->sub_stripes);
3049 mirror_num = stripe_index + 1;
3004 } 3050 }
3005 } else { 3051 } else {
3006 /* 3052 /*
@@ -3009,15 +3055,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3055 * stripe_index is the number of our device in the stripe array
3010 */ 3056 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3057 stripe_index = do_div(stripe_nr, map->num_stripes);
3058 mirror_num = stripe_index + 1;
3012 } 3059 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3060 BUG_ON(stripe_index >= map->num_stripes);
3014 3061
3015 if (rw & REQ_DISCARD) { 3062 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3063 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3064 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3065 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3066 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3068
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3070 u64 stripes;
@@ -3038,16 +3085,16 @@ again:
3038 } 3085 }
3039 stripes = stripe_nr_end - 1 - j; 3086 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3087 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3088 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3089 (stripes - stripe_nr + 1);
3043 3090
3044 if (i == 0) { 3091 if (i == 0) {
3045 multi->stripes[i].length -= 3092 bbio->stripes[i].length -=
3046 stripe_offset; 3093 stripe_offset;
3047 stripe_offset = 0; 3094 stripe_offset = 0;
3048 } 3095 }
3049 if (stripe_index == last_stripe) 3096 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3097 bbio->stripes[i].length -=
3051 stripe_end_offset; 3098 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3100 u64 stripes;
@@ -3072,11 +3119,11 @@ again:
3072 } 3119 }
3073 stripes = stripe_nr_end - 1 - j; 3120 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3121 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3122 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3123 (stripes - stripe_nr + 1);
3077 3124
3078 if (i < map->sub_stripes) { 3125 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3126 bbio->stripes[i].length -=
3080 stripe_offset; 3127 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3128 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3129 stripe_offset = 0;
@@ -3084,11 +3131,11 @@ again:
3084 if (stripe_index >= last_stripe && 3131 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3132 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3133 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3134 bbio->stripes[i].length -=
3088 stripe_end_offset; 3135 stripe_end_offset;
3089 } 3136 }
3090 } else 3137 } else
3091 multi->stripes[i].length = *length; 3138 bbio->stripes[i].length = *length;
3092 3139
3093 stripe_index++; 3140 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3141 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3146,20 @@ again:
3099 } 3146 }
3100 } else { 3147 } else {
3101 for (i = 0; i < num_stripes; i++) { 3148 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3149 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3150 map->stripes[stripe_index].physical +
3104 stripe_offset + 3151 stripe_offset +
3105 stripe_nr * map->stripe_len; 3152 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3153 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3154 map->stripes[stripe_index].dev;
3108 stripe_index++; 3155 stripe_index++;
3109 } 3156 }
3110 } 3157 }
3111 if (multi_ret) { 3158 if (bbio_ret) {
3112 *multi_ret = multi; 3159 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3160 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3161 bbio->max_errors = max_errors;
3162 bbio->mirror_num = mirror_num;
3115 } 3163 }
3116out: 3164out:
3117 free_extent_map(em); 3165 free_extent_map(em);
@@ -3120,9 +3168,9 @@ out:
3120 3168
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3170 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3171 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3172{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3173 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3174 mirror_num);
3127} 3175}
3128 3176
@@ -3191,30 +3239,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3239 return 0;
3192} 3240}
3193 3241
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3242static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3243{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3244 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3245 int is_orig_bio = 0;
3198 3246
3199 if (err) 3247 if (err)
3200 atomic_inc(&multi->error); 3248 atomic_inc(&bbio->error);
3201 3249
3202 if (bio == multi->orig_bio) 3250 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3251 is_orig_bio = 1;
3204 3252
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3253 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3254 if (!is_orig_bio) {
3207 bio_put(bio); 3255 bio_put(bio);
3208 bio = multi->orig_bio; 3256 bio = bbio->orig_bio;
3209 } 3257 }
3210 bio->bi_private = multi->private; 3258 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3259 bio->bi_end_io = bbio->end_io;
3260 bio->bi_bdev = (struct block_device *)
3261 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3262 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3263 * beyond the tolerance of the multi-bio
3214 */ 3264 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3266 err = -EIO;
3217 } else if (err) { 3267 } else {
3218 /* 3268 /*
3219 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3220 * go over the max number of errors 3270 * go over the max number of errors
@@ -3222,7 +3272,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3272 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3273 err = 0;
3224 } 3274 }
3225 kfree(multi); 3275 kfree(bbio);
3226 3276
3227 bio_endio(bio, err); 3277 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3278 } else if (!is_orig_bio) {
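
The renamed btrfs_end_bio() above keeps the fan-out/fan-in scheme of its predecessor: the original bio is cloned once per stripe, each completion decrements stripes_pending, and whichever completion drops the count to zero ends the original bio, reporting an error only when the error count exceeds max_errors; it now also stashes the mirror number in the completed bio so the caller can tell which copy serviced the request. A userspace sketch of just the counting part with C11 atomics; there is no real bio plumbing here and all names are illustrative:

#include <stdatomic.h>
#include <stdio.h>

struct multi_done {
	atomic_int stripes_pending;
	atomic_int errors;
	int max_errors;
};

/* called once per cloned request as it completes */
static void end_one(struct multi_done *m, int err)
{
	if (err)
		atomic_fetch_add(&m->errors, 1);

	/* the last completion finishes the original request */
	if (atomic_fetch_sub(&m->stripes_pending, 1) == 1) {
		int final = atomic_load(&m->errors) > m->max_errors
				? -5 /* -EIO */ : 0;
		printf("original completes, err=%d\n", final);
	}
}

int main(void)
{
	struct multi_done m;

	m.max_errors = 1;			/* tolerate one failed stripe */
	atomic_init(&m.stripes_pending, 3);	/* three cloned requests */
	atomic_init(&m.errors, 0);

	end_one(&m, 0);
	end_one(&m, -5);	/* one mirror failed: still within tolerance */
	end_one(&m, 0);
	return 0;
}
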
@@ -3302,20 +3352,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3352 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3353 u64 length = 0;
3304 u64 map_length; 3354 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3355 int ret;
3307 int dev_nr = 0; 3356 int dev_nr = 0;
3308 int total_devs = 1; 3357 int total_devs = 1;
3358 struct btrfs_bio *bbio = NULL;
3309 3359
3310 length = bio->bi_size; 3360 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3361 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3362 map_length = length;
3313 3363
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3364 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3365 mirror_num);
3316 BUG_ON(ret); 3366 BUG_ON(ret);
3317 3367
3318 total_devs = multi->num_stripes; 3368 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3369 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3370 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3371 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3373,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3373 (unsigned long long)map_length);
3324 BUG(); 3374 BUG();
3325 } 3375 }
3326 multi->end_io = first_bio->bi_end_io; 3376
3327 multi->private = first_bio->bi_private; 3377 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3378 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3379 bbio->end_io = first_bio->bi_end_io;
3380 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3381
3331 while (dev_nr < total_devs) { 3382 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3383 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3384 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3385 BUG_ON(!bio);
3335 BUG_ON(!bio); 3386 } else {
3336 } else { 3387 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3388 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3389 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3390 bio->bi_end_io = btrfs_end_bio;
3391 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3392 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3393 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3394 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3395 "(%s id %llu), size=%u\n", rw,
3396 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3397 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3398 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3399 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3400 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3407,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3407 }
3355 dev_nr++; 3408 dev_nr++;
3356 } 3409 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3410 return 0;
3360} 3411}
3361 3412
@@ -3616,15 +3667,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3667 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3668 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3669 device->in_fs_metadata = 1;
3619 if (device->writeable) 3670 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3671 device->fs_devices->total_rw_bytes += device->total_bytes;
3672 spin_lock(&root->fs_info->free_chunk_lock);
3673 root->fs_info->free_chunk_space += device->total_bytes -
3674 device->bytes_used;
3675 spin_unlock(&root->fs_info->free_chunk_lock);
3676 }
3621 ret = 0; 3677 ret = 0;
3622 return ret; 3678 return ret;
3623} 3679}
3624 3680
3625int btrfs_read_sys_array(struct btrfs_root *root) 3681int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3682{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3683 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3684 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3685 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3686 struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
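
The btrfs_multi_bio to btrfs_bio rename above keeps the same allocation trick: a fixed header followed by a flexible array of btrfs_bio_stripe slots, with btrfs_bio_size(n) computing the combined size. A standalone sketch of that layout, assuming illustrative structure and field names:

#include <stdio.h>
#include <stdlib.h>

struct stripe {
	unsigned long long physical;
	unsigned long long length;
};

struct my_bio {
	int num_stripes;
	int mirror_num;
	struct stripe stripes[];	/* flexible array member */
};

/* header plus n per-stripe slots, mirroring btrfs_bio_size(n) */
#define my_bio_size(n) (sizeof(struct my_bio) + sizeof(struct stripe) * (n))

int main(void)
{
	int i, n = 3;
	struct my_bio *b = calloc(1, my_bio_size(n));

	if (!b)
		return 1;
	b->num_stripes = n;
	for (i = 0; i < n; i++)
		b->stripes[i].physical = (unsigned long long)i * 65536;

	printf("allocated %zu bytes for %d stripes\n", my_bio_size(n), n);
	free(b);
	return 0;
}
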
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
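
The comment block added above spells out the trick: inserting a brand-new xattr whose value is near the size limit can make the insert path return -EOVERFLOW even though nothing is really too large, so the error is folded into the existing -EEXIST handling, which looks the item up again and updates it in place. A tiny userspace sketch of that errno remapping, with made-up helper names:

#include <errno.h>
#include <stdio.h>

/* stand-in for the insert path: pretend the leaf-split check tripped */
static int insert_item(int spurious_overflow)
{
	return spurious_overflow ? -EOVERFLOW : 0;
}

static int set_xattr(int spurious_overflow)
{
	int ret = insert_item(spurious_overflow);

	if (ret == -EOVERFLOW)
		ret = -EEXIST;		/* let the "already exists" path handle it */
	if (ret == -EEXIST)
		return 0;		/* would look the item up and update in place */
	return ret;
}

int main(void)
{
	printf("plain insert: %d, remapped overflow: %d\n",
	       set_xattr(0), set_xattr(1));
	return 0;
}
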