author     Li Zefan <lizf@cn.fujitsu.com>  2012-01-10 20:54:49 -0500
committer  Li Zefan <lizf@cn.fujitsu.com>  2012-01-10 20:54:49 -0500
commit     d25223a0d22f7ec4203ec285dc6e51f696591ba3 (patch)
tree       f54428e64f692edfa5bf75f8eb301329e32a895f
parent     396e6e49c58bb23d1814d3c240c736c9f01523c5 (diff)
parent     08c422c27f855d27b0b3d9fa30ebd938d4ae6f1f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs into for-linus
-rw-r--r--  Documentation/filesystems/btrfs.txt |    4
-rw-r--r--  fs/btrfs/Makefile                   |    3
-rw-r--r--  fs/btrfs/acl.c                      |   17
-rw-r--r--  fs/btrfs/async-thread.c             |  120
-rw-r--r--  fs/btrfs/async-thread.h             |    4
-rw-r--r--  fs/btrfs/backref.c                  |  776
-rw-r--r--  fs/btrfs/backref.h                  |   62
-rw-r--r--  fs/btrfs/btrfs_inode.h              |   21
-rw-r--r--  fs/btrfs/compression.c              |    3
-rw-r--r--  fs/btrfs/ctree.c                    |   27
-rw-r--r--  fs/btrfs/ctree.h                    |  209
-rw-r--r--  fs/btrfs/delayed-inode.c            |  108
-rw-r--r--  fs/btrfs/disk-io.c                  |  662
-rw-r--r--  fs/btrfs/disk-io.h                  |    4
-rw-r--r--  fs/btrfs/extent-tree.c              | 1126
-rw-r--r--  fs/btrfs/extent_io.c                |  640
-rw-r--r--  fs/btrfs/extent_io.h                |   23
-rw-r--r--  fs/btrfs/file-item.c                |   17
-rw-r--r--  fs/btrfs/file.c                     |   31
-rw-r--r--  fs/btrfs/free-space-cache.c         |  994
-rw-r--r--  fs/btrfs/inode-map.c                |   34
-rw-r--r--  fs/btrfs/inode.c                    |  732
-rw-r--r--  fs/btrfs/ioctl.c                    |  244
-rw-r--r--  fs/btrfs/ioctl.h                    |   29
-rw-r--r--  fs/btrfs/print-tree.c               |    8
-rw-r--r--  fs/btrfs/reada.c                    |  951
-rw-r--r--  fs/btrfs/relocation.c               |   28
-rw-r--r--  fs/btrfs/scrub.c                    |  668
-rw-r--r--  fs/btrfs/super.c                    |  373
-rw-r--r--  fs/btrfs/transaction.c              |  156
-rw-r--r--  fs/btrfs/tree-log.c                 |   19
-rw-r--r--  fs/btrfs/volumes.c                  |  222
-rw-r--r--  fs/btrfs/volumes.h                  |   24
-rw-r--r--  fs/btrfs/xattr.c                    |   11
34 files changed, 6339 insertions(+), 2011 deletions(-)
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index 64087c34327f..7671352216f1 100644
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -63,8 +63,8 @@ IRC network.
 Userspace tools for creating and manipulating Btrfs file systems are
 available from the git repository at the following location:
 
-	http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git
-	git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git
+	http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git
+	git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git
 
 These include the following tools:
 
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..0b394580d860 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
 	int idle;
 };
 
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
 /*
  * btrfs_start_workers uses kthread_run, which can block waiting for memory
  * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
 {
 	struct worker_start *start;
 	start = container_of(work, struct worker_start, work);
-	btrfs_start_workers(start->queue, 1);
+	__btrfs_start_workers(start->queue);
 	kfree(start);
 }
 
-static int start_new_worker(struct btrfs_workers *queue)
-{
-	struct worker_start *start;
-	int ret;
-
-	start = kzalloc(sizeof(*start), GFP_NOFS);
-	if (!start)
-		return -ENOMEM;
-
-	start->work.func = start_new_worker_func;
-	start->queue = queue;
-	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
-	if (ret)
-		kfree(start);
-	return ret;
-}
-
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
 	struct btrfs_workers *workers = worker->workers;
+	struct worker_start *start;
 	unsigned long flags;
 
 	rmb();
 	if (!workers->atomic_start_pending)
 		return;
 
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return;
+
+	start->work.func = start_new_worker_func;
+	start->queue = workers;
+
 	spin_lock_irqsave(&workers->lock, flags);
 	if (!workers->atomic_start_pending)
 		goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 
 	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	start_new_worker(workers);
+	btrfs_queue_worker(workers->atomic_worker_start, &start->work);
 	return;
 
 out:
+	kfree(start);
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
@@ -331,7 +325,7 @@ again:
 		run_ordered_completions(worker->workers, work);
 
 		check_pending_worker_creates(worker);
-
+		cond_resched();
 	}
 
 	spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
  * starts new worker threads. This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
-				 int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
-	int i;
 
-	for (i = 0; i < num_workers; i++) {
-		worker = kzalloc(sizeof(*worker), GFP_NOFS);
-		if (!worker) {
-			ret = -ENOMEM;
-			goto fail;
-		}
+	worker = kzalloc(sizeof(*worker), GFP_NOFS);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
-		INIT_LIST_HEAD(&worker->pending);
-		INIT_LIST_HEAD(&worker->prio_pending);
-		INIT_LIST_HEAD(&worker->worker_list);
-		spin_lock_init(&worker->lock);
+	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
+	INIT_LIST_HEAD(&worker->worker_list);
+	spin_lock_init(&worker->lock);
 
-		atomic_set(&worker->num_pending, 0);
-		atomic_set(&worker->refs, 1);
-		worker->workers = workers;
-		worker->task = kthread_run(worker_loop, worker,
-					   "btrfs-%s-%d", workers->name,
-					   workers->num_workers + i);
-		if (IS_ERR(worker->task)) {
-			ret = PTR_ERR(worker->task);
-			kfree(worker);
-			goto fail;
-		}
-		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->idle_list);
-		worker->idle = 1;
-		workers->num_workers++;
-		workers->num_workers_starting--;
-		WARN_ON(workers->num_workers_starting < 0);
-		spin_unlock_irq(&workers->lock);
-	}
+	atomic_set(&worker->num_pending, 0);
+	atomic_set(&worker->refs, 1);
+	worker->workers = workers;
+	worker->task = kthread_run(worker_loop, worker,
+				   "btrfs-%s-%d", workers->name,
+				   workers->num_workers + 1);
+	if (IS_ERR(worker->task)) {
+		ret = PTR_ERR(worker->task);
+		kfree(worker);
+		goto fail;
+	}
+	spin_lock_irq(&workers->lock);
+	list_add_tail(&worker->worker_list, &workers->idle_list);
+	worker->idle = 1;
+	workers->num_workers++;
+	workers->num_workers_starting--;
+	WARN_ON(workers->num_workers_starting < 0);
+	spin_unlock_irq(&workers->lock);
+
 	return 0;
 fail:
-	btrfs_stop_workers(workers);
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting--;
+	spin_unlock_irq(&workers->lock);
 	return ret;
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
 {
 	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting += num_workers;
+	workers->num_workers_starting++;
 	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers, num_workers);
+	return __btrfs_start_workers(workers);
 }
 
 /*
@@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
 	struct list_head *fallback;
+	int ret;
 
-again:
 	spin_lock_irqsave(&workers->lock, flags);
+again:
 	worker = next_worker(workers);
 
 	if (!worker) {
@@ -584,7 +578,10 @@ again:
 			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			__btrfs_start_workers(workers, 1);
+			ret = __btrfs_start_workers(workers);
+			spin_lock_irqsave(&workers->lock, flags);
+			if (ret)
+				goto fallback;
 			goto again;
 		}
 	}
@@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
@@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	/* don't requeue something already on a list */
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
@@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
-	return 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..22c64fff1bd5
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * it has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
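
Reading aid (not part of the commit): the file above introduces a backref iterator that calls a user-supplied function once per (inode, offset, root) that references an extent. The following is a minimal sketch of how a caller might use it, assuming kernel context; the function names print_extent_inode and show_inodes_for_logical are illustrative only, everything else uses the declarations added in this patch.

	/* Illustrative only -- a callback matching the iterate_extent_inodes_t
	 * typedef from backref.h; it is invoked once per referencing inode.
	 * Returning non-zero would stop the iteration. */
	static int print_extent_inode(u64 inum, u64 offset, u64 root, void *ctx)
	{
		printk(KERN_INFO "extent referenced by inode %llu root %llu offset %llu\n",
		       (unsigned long long)inum, (unsigned long long)root,
		       (unsigned long long)offset);
		return 0;
	}

	/* Sketch of a caller: resolve all inodes referencing the extent that
	 * covers 'logical'. 'fs_info' is assumed to be a valid btrfs_fs_info. */
	static int show_inodes_for_logical(struct btrfs_fs_info *fs_info, u64 logical)
	{
		struct btrfs_path *path = btrfs_alloc_path();
		int ret;

		if (!path)
			return -ENOMEM;
		ret = iterate_inodes_from_logical(logical, fs_info, path,
						  print_extent_inode, NULL);
		btrfs_free_path(path);
		return ret;
	}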
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
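
Reading aid (not part of the commit): the ipath helpers declared above resolve all file system paths of an inode into a pre-sized buffer. A rough usage sketch, assuming kernel context, is shown below; dump_paths_of_inode and the 4096-byte buffer size are illustrative, error handling is abbreviated.

	/* Illustrative only -- allocate room for the paths, resolve them,
	 * then read them back out of ipath->fspath->val[]. */
	static int dump_paths_of_inode(struct btrfs_root *fs_root, u64 inum)
	{
		struct btrfs_path *path = btrfs_alloc_path();
		struct inode_fs_paths *ipath;
		u32 i;
		int ret;

		if (!path)
			return -ENOMEM;
		ipath = init_ipath(4096, fs_root, path);	/* 4096 bytes for path data */
		if (IS_ERR(ipath)) {
			btrfs_free_path(path);
			return PTR_ERR(ipath);
		}
		ret = paths_from_inode(inum, ipath);
		for (i = 0; !ret && i < ipath->fspath->elem_cnt; i++)
			printk(KERN_INFO "inode %llu path: %s\n",
			       (unsigned long long)inum,
			       (char *)(unsigned long)ipath->fspath->val[i]);
		free_ipath(ipath);
		btrfs_free_path(path);
		return ret;
	}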
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 	       ((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during committing the transaction,
+	 *    after we've finished copying src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2241 3 * num_items;
2138} 2242}
2139 2243
2244/*
2245 * Doing a truncate won't result in new nodes or leaves, just what we need for
2246 * COW.
2247 */
2248static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2249 unsigned num_items)
2250{
2251 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2252 num_items;
2253}
2254
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2255void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2256int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2257 struct btrfs_root *root, unsigned long count);
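
The new btrfs_calc_trunc_metadata_size() drops the factor of three that btrfs_calc_trans_metadata_size() applies, following the comment above: a truncate only has to COW the existing path, it never adds nodes or leaves. Below is a worked example of the two sizes, assuming 4 KiB leaves and nodes and a maximum tree level of 8; the values are assumptions for illustration, not read from a real filesystem.

#include <stdio.h>

#define MAX_LEVEL 8	/* stand-in for BTRFS_MAX_LEVEL */

static unsigned long long trunc_metadata_size(unsigned long leafsize,
					      unsigned long nodesize,
					      unsigned num_items)
{
	/* one full path worth of COW per item, as in the new helper */
	return (unsigned long long)(leafsize + nodesize * (MAX_LEVEL - 1)) *
	       num_items;
}

static unsigned long long trans_metadata_size(unsigned long leafsize,
					      unsigned long nodesize,
					      unsigned num_items)
{
	/* the transaction helper keeps the 3x cushion seen in the hunk above */
	return (unsigned long long)(leafsize + nodesize * (MAX_LEVEL - 1)) *
	       3 * num_items;
}

int main(void)
{
	printf("truncate: %llu bytes, transaction: %llu bytes\n",
	       trunc_metadata_size(4096, 4096, 1),
	       trans_metadata_size(4096, 4096, 1));
	return 0;
}
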
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2261 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2262int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2263 u64 bytenr, u64 num, int reserved);
2264int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2265 struct btrfs_root *root,
2266 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2267int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2268 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2269 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2314 u64 root_objectid, u64 owner, u64 offset);
2197 2315
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2318 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2319int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2320 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2321int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2377 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2378void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2379 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2380 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2381int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2382 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2383int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2498 smp_mb();
2380 return fs_info->closing; 2499 return fs_info->closing;
2381} 2500}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{
2503 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root);
2506 kfree(fs_info->chunk_root);
2507 kfree(fs_info->dev_root);
2508 kfree(fs_info->csum_root);
2509 kfree(fs_info->super_copy);
2510 kfree(fs_info->super_for_commit);
2511 kfree(fs_info);
2512}
2382 2513
2383/* root-item.c */ 2514/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2515int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2561,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2561int btrfs_readpage(struct file *file, struct page *page); 2692int btrfs_readpage(struct file *file, struct page *page);
2562void btrfs_evict_inode(struct inode *inode); 2693void btrfs_evict_inode(struct inode *inode);
2563int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2694int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2564void btrfs_dirty_inode(struct inode *inode, int flags); 2695int btrfs_dirty_inode(struct inode *inode);
2696int btrfs_update_time(struct file *file);
2565struct inode *btrfs_alloc_inode(struct super_block *sb); 2697struct inode *btrfs_alloc_inode(struct super_block *sb);
2566void btrfs_destroy_inode(struct inode *inode); 2698void btrfs_destroy_inode(struct inode *inode);
2567int btrfs_drop_inode(struct inode *inode); 2699int btrfs_drop_inode(struct inode *inode);
@@ -2579,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2711int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2712int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2713int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2714void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2715 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2716int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2824int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2825 struct btrfs_scrub_progress *progress);
2699 2826
2827/* reada.c */
2828struct reada_control {
2829 struct btrfs_root *root; /* tree to prefetch */
2830 struct btrfs_key key_start;
2831 struct btrfs_key key_end; /* exclusive */
2832 atomic_t elems;
2833 struct kref refcnt;
2834 wait_queue_head_t wait;
2835};
2836struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2837 struct btrfs_key *start, struct btrfs_key *end);
2838int btrfs_reada_wait(void *handle);
2839void btrfs_reada_detach(void *handle);
2840int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2841 u64 start, int err);
2842
2700#endif 2843#endif
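
The reada_control declarations above expose readahead over a key range: btrfs_reada_add() starts prefetching between key_start and key_end (exclusive), the elems counter tracks outstanding tree blocks, and btrfs_reada_wait() sleeps until it drains. The standalone model below only imitates that counting-and-waiting shape with invented types; the real interface takes btrfs keys and roots and completes the work from worker threads.

#include <stdio.h>

struct reada_model {
	unsigned long key_start;
	unsigned long key_end;	/* exclusive, like reada_control.key_end */
	int elems;		/* outstanding prefetched blocks */
};

static void reada_add(struct reada_model *rc)
{
	/* pretend each key in the range queues one tree block */
	rc->elems = (int)(rc->key_end - rc->key_start);
}

static void reada_block_done(struct reada_model *rc)
{
	if (rc->elems > 0)
		rc->elems--;
}

static int reada_wait(struct reada_model *rc)
{
	/* the kernel sleeps on rc->wait; here the completions run inline */
	while (rc->elems > 0)
		reada_block_done(rc);
	return 0;
}

int main(void)
{
	struct reada_model rc = { .key_start = 100, .key_end = 105, .elems = 0 };

	reada_add(&rc);
	printf("queued %d blocks for readahead\n", rc.elems);
	reada_wait(&rc);
	printf("done, %d outstanding\n", rc.elems);
	return 0;
}
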
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..c7ddf8a01c54 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!src_rsv || (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv)) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
 699 * block_rsv. This keeps things simple for callers that don't normally have things
700 * migrated from their block rsv. If they go to release their
701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
 708 * how block rsvs work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
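
The long comment in btrfs_delayed_inode_reserve_metadata() above leans on one bookkeeping rule: migrating bytes between block reserves moves only the reserved amount, while releasing bytes shrinks the reserve's size as well. The sketch below is a simplified userspace model of those two operations; the struct and numbers are invented, and only the size/reserved behaviour follows the comment.

#include <stdio.h>

struct rsv_model {
	long long size;		/* how much this reserve is sized for */
	long long reserved;	/* how much is actually reserved right now */
};

/* migrate: the reservation moves, neither reserve's size changes */
static int rsv_migrate(struct rsv_model *src, struct rsv_model *dst, long long bytes)
{
	if (src->reserved < bytes)
		return -1;	/* nothing to steal */
	src->reserved -= bytes;
	dst->reserved += bytes;
	return 0;
}

/* release: dropping a reservation also shrinks the reserve's size */
static void rsv_release(struct rsv_model *r, long long bytes)
{
	r->reserved -= bytes;
	r->size -= bytes;
}

int main(void)
{
	struct rsv_model delalloc = { .size = 65536, .reserved = 65536 };
	struct rsv_model delayed  = { .size = 0, .reserved = 0 };

	/* steal already-reserved delalloc bytes for the delayed inode item */
	rsv_migrate(&delalloc, &delayed, 16384);
	printf("after migrate: delalloc %lld/%lld, delayed %lld/%lld\n",
	       delalloc.reserved, delalloc.size, delayed.reserved, delayed.size);

	/* the owner's own release is what shrinks size; migrate deliberately
	 * left size alone so this step cannot push it negative */
	rsv_release(&delalloc, 16384);
	printf("after release: delalloc %lld/%lld\n",
	       delalloc.reserved, delalloc.size);
	return 0;
}
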
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..3f9d5551e582 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
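
Both the end-io path and the new btree_io_failed_hook() added above use the same one-shot pattern: the EXTENT_BUFFER_READAHEAD bit is tested and then cleared, so btree_readahead_hook() fires exactly once per buffer whether the read succeeded or failed. Below is a tiny illustration of that test-and-clear shape, using a made-up flag word in place of the kernel's atomic bit operations on eb->bflags.

#include <stdio.h>

#define FLAG_READAHEAD (1u << 0)	/* stand-in for EXTENT_BUFFER_READAHEAD */

static void maybe_fire_hook(unsigned int *bflags, int err)
{
	/* plain bit ops here; the kernel uses test_bit()/clear_bit() */
	if (*bflags & FLAG_READAHEAD) {
		*bflags &= ~FLAG_READAHEAD;
		printf("readahead hook fired, err=%d\n", err);
	}
}

int main(void)
{
	unsigned int bflags = FLAG_READAHEAD;

	maybe_fire_hook(&bflags, 0);	/* fires once */
	maybe_fire_hook(&bflags, -5);	/* bit already cleared: nothing */
	return 0;
}
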
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the highest array is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
 1720 * just overwrite the last backup if we're at the same generation;
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
 1842 * fixme: the total bytes and num_devices need to match or we
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
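
The helpers added above keep BTRFS_NUM_BACKUP_ROOTS (four) old root pointers in a ring: the newest slot is the one whose generation matches the super block, the next commit writes the following slot, and recovery walks backwards one slot per attempt. The standalone model below keeps only that index arithmetic; the generation numbers are invented for the example.

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* mirrors BTRFS_NUM_BACKUP_ROOTS */

static int find_newest(const unsigned long long *gen, unsigned long long super_gen)
{
	int i, newest = -1;

	for (i = 0; i < NUM_BACKUP_ROOTS; i++)
		if (gen[i] == super_gen)
			newest = i;

	/* wrap-around check, as in find_newest_super_backup() */
	if (newest == NUM_BACKUP_ROOTS - 1 && gen[0] == super_gen)
		newest = 0;
	return newest;
}

int main(void)
{
	/* slot 1 holds the latest commit; the ring has already wrapped once */
	unsigned long long gen[NUM_BACKUP_ROOTS] = { 104, 105, 102, 103 };
	int newest = find_newest(gen, 105);
	int tries;

	if (newest < 0)
		return 1;	/* garbage array: ignore the backups */

	printf("newest backup in slot %d, next commit writes slot %d\n",
	       newest, (newest + 1) % NUM_BACKUP_ROOTS);

	/* recovery steps to progressively older slots, like next_root_backup() */
	for (tries = 1; tries < NUM_BACKUP_ROOTS; tries++) {
		newest = (newest + NUM_BACKUP_ROOTS - 1) % NUM_BACKUP_ROOTS;
		printf("attempt %d falls back to slot %d (generation %llu)\n",
		       tries, newest, gen[newest]);
	}
	return 0;
}
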
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
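
The memcpy change above follows super_copy and super_for_commit becoming separately allocated objects: once they are pointers, the size argument has to be sizeof(*ptr), because sizeof on the pointer itself only yields the pointer's own width. A hypothetical struct stands in for btrfs_super_block in the short demonstration below.

#include <stdio.h>

struct fake_super {		/* hypothetical stand-in, not the real layout */
	char fsid[16];
	unsigned long long bytenr;
	unsigned long long generation;
	char label[256];
};

int main(void)
{
	struct fake_super *super_copy = 0;

	printf("sizeof(super_copy)  = %zu  (just the pointer)\n",
	       sizeof(super_copy));
	printf("sizeof(*super_copy) = %zu  (what memcpy must be told to copy)\n",
	       sizeof(*super_copy));
	return 0;
}
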
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
2088 * run through our array of backup supers and setup
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,19 +2192,29 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 /*
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 * btrfs_start_workers can really only fail because of ENOMEM so just
1886 btrfs_start_workers(&fs_info->submit_workers, 1); 2199 * return -ENOMEM if any of these fail.
1887 btrfs_start_workers(&fs_info->delalloc_workers, 1); 2200 */
1888 btrfs_start_workers(&fs_info->fixup_workers, 1); 2201 ret = btrfs_start_workers(&fs_info->workers);
1889 btrfs_start_workers(&fs_info->endio_workers, 1); 2202 ret |= btrfs_start_workers(&fs_info->generic_worker);
1890 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 2203 ret |= btrfs_start_workers(&fs_info->submit_workers);
1891 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 2204 ret |= btrfs_start_workers(&fs_info->delalloc_workers);
1892 btrfs_start_workers(&fs_info->endio_write_workers, 1); 2205 ret |= btrfs_start_workers(&fs_info->fixup_workers);
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 ret |= btrfs_start_workers(&fs_info->endio_workers);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2209 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2210 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2211 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2212 ret |= btrfs_start_workers(&fs_info->caching_workers);
2213 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2214 if (ret) {
2215 ret = -ENOMEM;
2216 goto fail_sb_buffer;
2217 }
1896 2218
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2219 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2220 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
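
One caveat worth noting about the worker-start block above: OR-ing the return codes together preserves only the fact that something failed, not which errno it was, since two different negative values OR into a third one. That fits the comment's choice to report a flat -ENOMEM. A quick demonstration with stand-in errno values:

#include <stdio.h>

#define EX_ENOMEM 12	/* stand-ins for the usual errno values */
#define EX_EINVAL 22

int main(void)
{
	int ret = 0;

	ret |= -EX_ENOMEM;	/* one start fails */
	ret |= 0;		/* the rest succeed */
	printf("one failure : ret = %d\n", ret);

	ret = 0;
	ret |= -EX_ENOMEM;
	ret |= -EX_EINVAL;	/* two different failures... */
	printf("two failures: ret = %d (neither -12 nor -22)\n", ret);
	return 0;
}
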
@@ -1939,7 +2261,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2261 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2262 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2263 sb->s_id);
1942 goto fail_chunk_root; 2264 goto fail_tree_roots;
1943 } 2265 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2266 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2267 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2276,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2276 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2277 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2278 sb->s_id);
1957 goto fail_chunk_root; 2279 goto fail_tree_roots;
1958 } 2280 }
1959 2281
1960 btrfs_close_extra_devices(fs_devices); 2282 btrfs_close_extra_devices(fs_devices);
1961 2283
2284retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2285 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2286 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2287 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2289,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2289 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2290 btrfs_super_root(disk_super),
1968 blocksize, generation); 2291 blocksize, generation);
1969 if (!tree_root->node) 2292 if (!tree_root->node ||
1970 goto fail_chunk_root; 2293 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2294 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2295 sb->s_id);
1974 goto fail_tree_root; 2296
2297 goto recovery_tree_root;
1975 } 2298 }
2299
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2300 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2301 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2302
1979 ret = find_and_setup_root(tree_root, fs_info, 2303 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2304 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2305 if (ret)
1982 goto fail_tree_root; 2306 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2307 extent_root->track_dirty = 1;
1984 2308
1985 ret = find_and_setup_root(tree_root, fs_info, 2309 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2310 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2311 if (ret)
1988 goto fail_extent_root; 2312 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2313 dev_root->track_dirty = 1;
1990 2314
1991 ret = find_and_setup_root(tree_root, fs_info, 2315 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2316 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2317 if (ret)
1994 goto fail_dev_root; 2318 goto recovery_tree_root;
1995 2319
1996 csum_root->track_dirty = 1; 2320 csum_root->track_dirty = 1;
1997 2321
@@ -2124,22 +2448,13 @@ fail_cleaner:
2124 2448
2125fail_block_groups: 2449fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2450 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2451
2128 free_extent_buffer(csum_root->commit_root); 2452fail_tree_roots:
2129fail_dev_root: 2453 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2454
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2455fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2456 btrfs_stop_workers(&fs_info->generic_worker);
2457 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2458 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2459 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2460 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2467,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2467 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2468 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2469fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2470fail_iput:
2471 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2472
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2473 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2474 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2475fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2476 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2477fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2478 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2479fail:
2167 kfree(extent_root); 2480 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2481 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2482 return ERR_PTR(err);
2483
2484recovery_tree_root:
2485 if (!btrfs_test_opt(tree_root, RECOVERY))
2486 goto fail_tree_roots;
2487
2488 free_root_pointers(fs_info, 0);
2489
2490 /* don't use the log in recovery mode, it won't be valid */
2491 btrfs_set_super_log_root(disk_super, 0);
2492
2493 /* we can't trust the free space cache either */
2494 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2495
2496 ret = next_root_backup(fs_info, fs_info->super_copy,
2497 &num_backups_tried, &backup_index);
2498 if (ret == -1)
2499 goto fail_block_groups;
2500 goto retry_root_backup;
2174} 2501}
2175 2502
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2503static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2254,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2581 int errors = 0;
2255 u32 crc; 2582 u32 crc;
2256 u64 bytenr; 2583 u64 bytenr;
2257 int last_barrier = 0;
2258 2584
2259 if (max_mirrors == 0) 2585 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2586 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2587
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2588 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2589 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2590 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2630 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2631 }
2317 2632
2318 if (i == last_barrier && do_barriers) 2633 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2634 * we fua the first super. The others we allow
2320 else 2635 * to go down lazy.
2321 ret = submit_bh(WRITE_SYNC, bh); 2636 */
2322 2637 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2638 if (ret)
2324 errors++; 2639 errors++;
2325 } 2640 }
2326 return errors < i ? 0 : -1; 2641 return errors < i ? 0 : -1;
2327} 2642}
2328 2643
2644/*
2645 * endio for the write_dev_flush, this will wake anyone waiting
2646 * for the barrier when it is done
2647 */
2648static void btrfs_end_empty_barrier(struct bio *bio, int err)
2649{
2650 if (err) {
2651 if (err == -EOPNOTSUPP)
2652 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2653 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2654 }
2655 if (bio->bi_private)
2656 complete(bio->bi_private);
2657 bio_put(bio);
2658}
2659
2660/*
 2661 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2662 * sent down. With wait == 1, it waits for the previous flush.
2663 *
 2664 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2665 * capable
2666 */
2667static int write_dev_flush(struct btrfs_device *device, int wait)
2668{
2669 struct bio *bio;
2670 int ret = 0;
2671
2672 if (device->nobarriers)
2673 return 0;
2674
2675 if (wait) {
2676 bio = device->flush_bio;
2677 if (!bio)
2678 return 0;
2679
2680 wait_for_completion(&device->flush_wait);
2681
2682 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2683 printk("btrfs: disabling barriers on dev %s\n",
2684 device->name);
2685 device->nobarriers = 1;
2686 }
2687 if (!bio_flagged(bio, BIO_UPTODATE)) {
2688 ret = -EIO;
2689 }
2690
2691 /* drop the reference from the wait == 0 run */
2692 bio_put(bio);
2693 device->flush_bio = NULL;
2694
2695 return ret;
2696 }
2697
2698 /*
2699 * one reference for us, and we leave it for the
2700 * caller
2701 */
 2702 device->flush_bio = NULL;
2703 bio = bio_alloc(GFP_NOFS, 0);
2704 if (!bio)
2705 return -ENOMEM;
2706
2707 bio->bi_end_io = btrfs_end_empty_barrier;
2708 bio->bi_bdev = device->bdev;
2709 init_completion(&device->flush_wait);
2710 bio->bi_private = &device->flush_wait;
2711 device->flush_bio = bio;
2712
2713 bio_get(bio);
2714 submit_bio(WRITE_FLUSH, bio);
2715
2716 return 0;
2717}
2718
2719/*
2720 * send an empty flush down to each device in parallel,
2721 * then wait for them
2722 */
2723static int barrier_all_devices(struct btrfs_fs_info *info)
2724{
2725 struct list_head *head;
2726 struct btrfs_device *dev;
2727 int errors = 0;
2728 int ret;
2729
2730 /* send down all the barriers */
2731 head = &info->fs_devices->devices;
2732 list_for_each_entry_rcu(dev, head, dev_list) {
2733 if (!dev->bdev) {
2734 errors++;
2735 continue;
2736 }
2737 if (!dev->in_fs_metadata || !dev->writeable)
2738 continue;
2739
2740 ret = write_dev_flush(dev, 0);
2741 if (ret)
2742 errors++;
2743 }
2744
2745 /* wait for all the barriers */
2746 list_for_each_entry_rcu(dev, head, dev_list) {
2747 if (!dev->bdev) {
2748 errors++;
2749 continue;
2750 }
2751 if (!dev->in_fs_metadata || !dev->writeable)
2752 continue;
2753
2754 ret = write_dev_flush(dev, 1);
2755 if (ret)
2756 errors++;
2757 }
2758 if (errors)
2759 return -EIO;
2760 return 0;
2761}
2762
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2763int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2764{
2331 struct list_head *head; 2765 struct list_head *head;
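
barrier_all_devices() above issues the empty flush to every usable device first and only then waits, so the flushes run in parallel rather than one after another. The standalone model below keeps only that two-pass shape and the skip/error accounting; the devices, names and error values are invented for the example.

#include <stdio.h>

struct dev_model {
	const char *name;
	int writeable;
	int flush_pending;
	int flush_result;	/* simulated completion status */
};

static int dev_flush(struct dev_model *d, int wait)
{
	if (!wait) {			/* pass 1: just send the flush down */
		d->flush_pending = 1;
		return 0;
	}
	if (!d->flush_pending)		/* pass 2: collect the result */
		return 0;
	d->flush_pending = 0;
	return d->flush_result;
}

int main(void)
{
	struct dev_model devs[] = {
		{ "devA", 1, 0, 0 },
		{ "devB", 1, 0, -5 },	/* pretend this one fails its flush */
		{ "devC", 0, 0, 0 },	/* not writeable: skipped, as above */
	};
	int i, errors = 0;

	for (i = 0; i < 3; i++)		/* send down all the barriers */
		if (devs[i].writeable && dev_flush(&devs[i], 0))
			errors++;

	for (i = 0; i < 3; i++)		/* then wait for every one of them */
		if (devs[i].writeable && dev_flush(&devs[i], 1))
			errors++;

	printf("%d flush error(s)\n", errors);
	return errors ? 1 : 0;
}
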
@@ -2338,14 +2772,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2772 int total_errors = 0;
2339 u64 flags; 2773 u64 flags;
2340 2774
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2775 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2776 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2777 backup_super_roots(root->fs_info);
2343 2778
2344 sb = &root->fs_info->super_for_commit; 2779 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2780 dev_item = &sb->dev_item;
2346 2781
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2782 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2783 head = &root->fs_info->fs_devices->devices;
2784
2785 if (do_barriers)
2786 barrier_all_devices(root->fs_info);
2787
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2788 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2789 if (!dev->bdev) {
2351 total_errors++; 2790 total_errors++;
@@ -2545,8 +2984,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2984 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2985 btrfs_run_defrag_inodes(root->fs_info);
2547 2986
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2987 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2988 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2989 *
@@ -2572,6 +3009,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3009 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3010 }
2574 3011
3012 btrfs_put_block_group_cache(fs_info);
3013
2575 kthread_stop(root->fs_info->transaction_kthread); 3014 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3015 kthread_stop(root->fs_info->cleaner_kthread);
2577 3016
@@ -2603,7 +3042,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3042 del_fs_roots(fs_info);
2604 3043
2605 iput(fs_info->btree_inode); 3044 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3045
2608 btrfs_stop_workers(&fs_info->generic_worker); 3046 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3047 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3055 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3056 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3057 btrfs_stop_workers(&fs_info->caching_workers);
3058 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3059
2621 btrfs_close_devices(fs_info->fs_devices); 3060 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3061 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3063,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3063 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3064 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3065
2627 kfree(fs_info->extent_root); 3066 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3067
2634 return 0; 3068 return 0;
2635} 3069}
@@ -2735,7 +3169,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3169 return ret;
2736} 3170}
2737 3171
2738int btree_lock_page_hook(struct page *page) 3172static int btree_lock_page_hook(struct page *page, void *data,
3173 void (*flush_fn)(void *))
2739{ 3174{
2740 struct inode *inode = page->mapping->host; 3175 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3176 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3187,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3187 if (!eb)
2753 goto out; 3188 goto out;
2754 3189
2755 btrfs_tree_lock(eb); 3190 if (!btrfs_try_tree_write_lock(eb)) {
3191 flush_fn(data);
3192 btrfs_tree_lock(eb);
3193 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3194 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3195
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3196 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3205,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3205 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3206 free_extent_buffer(eb);
2769out: 3207out:
2770 lock_page(page); 3208 if (!trylock_page(page)) {
3209 flush_fn(data);
3210 lock_page(page);
3211 }
2771 return 0; 3212 return 0;
2772} 3213}
2773 3214
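
The hook above now tries the tree lock (and the page lock) without blocking and only runs flush_fn() when the lock is contended, so queued writeback gets pushed out before the caller sleeps. The sketch below reproduces that try-then-flush-then-block shape with a pthread mutex standing in for the btrfs locks; flush_pending_work() is a made-up placeholder for flush_fn().

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_pending_work(void *data)
{
	printf("lock contended: flushing %s before blocking\n", (const char *)data);
}

static void lock_with_flush(void *data)
{
	if (pthread_mutex_trylock(&tree_lock) != 0) {
		flush_pending_work(data);	/* get queued work moving first */
		pthread_mutex_lock(&tree_lock);	/* then it is safe to sleep */
	}
	printf("lock taken\n");
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	/* uncontended: the trylock succeeds and no flush is needed */
	lock_with_flush((void *)"queued bios");

	/* contended: hold the lock so the non-blocking attempt fails */
	pthread_mutex_lock(&tree_lock);
	if (pthread_mutex_trylock(&tree_lock) != 0)
		flush_pending_work((void *)"queued bios");
	pthread_mutex_unlock(&tree_lock);
	return 0;
}
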
@@ -3123,6 +3564,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3564static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3565 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3566 .readpage_end_io_hook = btree_readpage_end_io_hook,
3567 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3568 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3569 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3570 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..8603ee4e3dfd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
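The comment block above introduces three reservation modes; the difference between them is only which space_info counters move when space is handed to an allocator. A rough userspace model of that accounting (the struct below is a stand-in, not the kernel's btrfs_space_info, and it assumes RESERVE_ALLOC converts an earlier bytes_may_use reservation into bytes_reserved, as the later hunk in btrfs_update_reserved_bytes suggests):

#include <stdio.h>
#include <stdint.h>

enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

struct space_info {
	uint64_t bytes_reserved;   /* space handed out to allocators */
	uint64_t bytes_may_use;    /* ENOSPC accounting for pending writes */
};

/* Simplified view of how the three modes touch the counters. */
static void update_reserved(struct space_info *s, uint64_t bytes, int reserve)
{
	if (reserve != RESERVE_FREE) {
		s->bytes_reserved += bytes;
		/* only RESERVE_ALLOC moves the ENOSPC accounting here */
		if (reserve == RESERVE_ALLOC)
			s->bytes_may_use -= bytes;
	} else {
		s->bytes_reserved -= bytes;
	}
}

int main(void)
{
	struct space_info s = { .bytes_reserved = 0, .bytes_may_use = 1 << 20 };

	update_reserved(&s, 64 * 1024, RESERVE_ALLOC);
	update_reserved(&s, 64 * 1024, RESERVE_FREE);
	printf("reserved=%llu may_use=%llu\n",
	       (unsigned long long)s.bytes_reserved,
	       (unsigned long long)s.bytes_may_use);
	return 0;
}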
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
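The new wait loop above follows the usual waitqueue idiom: take a reference on the in-flight caching control, drop the spinlock, sleep until the fast space-cache load finishes, then re-take the lock and re-check the state. A minimal userspace analogue of that shape, using a pthread condition variable in place of the kernel waitqueue (all names here are illustrative only):

#include <pthread.h>
#include <stdio.h>

enum cache_state { CACHE_NO, CACHE_FAST, CACHE_STARTED, CACHE_FINISHED };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static enum cache_state cached = CACHE_FAST;

/* Waiter: equivalent of the prepare_to_wait/schedule/finish_wait loop. */
static void wait_for_fast_load(void)
{
	pthread_mutex_lock(&lock);
	while (cached == CACHE_FAST)
		pthread_cond_wait(&done, &lock);  /* drops the lock while asleep */
	pthread_mutex_unlock(&lock);
}

/* Loader: equivalent of updating cache->cached and wake_up(&ctl->wait). */
static void *fast_loader(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	cached = CACHE_FINISHED;
	pthread_cond_broadcast(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fast_loader, NULL);
	wait_for_fast_load();
	pthread_join(t, NULL);
	printf("cache state: %d\n", cached);
	return 0;
}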
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wakeup any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
2762 /* We've already setup this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret && dcs == BTRFS_DC_SETUP)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
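Taken together, the two hunks above make the chunk allocator count the global reservation as used space and then apply size thresholds before creating a new chunk: a limited-mode threshold of max(64 MiB, roughly 1% of the filesystem), a "don't bother while this space_info is under about 80% allocated" check, and a max(256 MiB, 5% of the filesystem) figure for the later test. A standalone calculation with invented sizes; div_factor and div_factor_fine are re-implemented here under the assumption that they scale by factor/10 and factor/100 respectively:

#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)      { return num * factor / 10; }
static uint64_t div_factor_fine(uint64_t num, int factor) { return num * factor / 100; }
static uint64_t max_u64(uint64_t a, uint64_t b)           { return a > b ? a : b; }

int main(void)
{
	uint64_t fs_bytes    = 1ULL << 40;   /* hypothetical 1 TiB filesystem */
	uint64_t space_total = 8ULL << 30;   /* this space_info's total_bytes */
	uint64_t allocated   = 7ULL << 30;   /* used + reserved + global_rsv->size */
	uint64_t alloc_bytes = 256ULL << 20; /* size of the requested chunk */

	/* CHUNK_ALLOC_LIMITED: max(64 MiB, ~1% of the filesystem) */
	uint64_t limited_thresh = max_u64(64ULL << 20, div_factor_fine(fs_bytes, 1));

	/* normal mode: skip allocation while under ~80% of this space_info */
	int under_80pct = allocated + alloc_bytes < div_factor(space_total, 8);

	/* the 256 MiB / 5%-of-FS threshold used by the later check */
	uint64_t thresh = max_u64(256ULL << 20, div_factor_fine(fs_bytes, 5));

	printf("limited threshold: %llu MiB\n",
	       (unsigned long long)(limited_thresh >> 20));
	printf("under 80%% of space_info: %s\n",
	       under_80pct ? "yes (no new chunk)" : "no");
	printf("normal-mode threshold: %llu MiB\n",
	       (unsigned long long)(thresh >> 20));
	return 0;
}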
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,7 +3409,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
@@ -3343,9 +3419,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3344 3420
3345 spin_lock(&space_info->lock); 3421 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3422 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3423 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3424 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3425 spin_unlock(&space_info->lock);
3350 3426
3351 loops++; 3427 loops++;
@@ -3356,11 +3432,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3432 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3433 return -EAGAIN;
3358 3434
3359 time_left = schedule_timeout_interruptible(1); 3435 if (wait_ordered && !trans) {
3436 btrfs_wait_ordered_extents(root, 0, 0);
3437 } else {
3438 time_left = schedule_timeout_interruptible(1);
3360 3439
3361 /* We were interrupted, exit */ 3440 /* We were interrupted, exit */
3362 if (time_left) 3441 if (time_left)
3363 break; 3442 break;
3443 }
3364 3444
3365 /* we've kicked the IO a few times, if anything has been freed, 3445 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3446 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3455,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3455 }
3376 3456
3377 } 3457 }
3378 if (reclaimed >= to_reclaim && !trans) 3458
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3459 return reclaimed >= to_reclaim;
3381} 3460}
3382 3461
3383/* 3462/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3463 * maybe_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3464 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3465 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3466 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3467 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3468 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3469 * get us somewhere and then commit the transaction if it does. Otherwise it
3470 * will return -ENOSPC.
3393 */ 3471 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3472static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3473 struct btrfs_space_info *space_info,
3474 u64 bytes, int force)
3475{
3476 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3477 struct btrfs_trans_handle *trans;
3478
3479 trans = (struct btrfs_trans_handle *)current->journal_info;
3480 if (trans)
3481 return -EAGAIN;
3482
3483 if (force)
3484 goto commit;
3485
3486 /* See if there is enough pinned space to make this reservation */
3487 spin_lock(&space_info->lock);
3488 if (space_info->bytes_pinned >= bytes) {
3489 spin_unlock(&space_info->lock);
3490 goto commit;
3491 }
3492 spin_unlock(&space_info->lock);
3493
3494 /*
3495 * See if there is some space in the delayed insertion reservation for
3496 * this reservation.
3497 */
3498 if (space_info != delayed_rsv->space_info)
3499 return -ENOSPC;
3500
3501 spin_lock(&delayed_rsv->lock);
3502 if (delayed_rsv->size < bytes) {
3503 spin_unlock(&delayed_rsv->lock);
3504 return -ENOSPC;
3505 }
3506 spin_unlock(&delayed_rsv->lock);
3507
3508commit:
3509 trans = btrfs_join_transaction(root);
3510 if (IS_ERR(trans))
3511 return -ENOSPC;
3512
3513 return btrfs_commit_transaction(trans, root);
3514}
3515
3516/**
3517 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3518 * @root - the root we're allocating for
3519 * @block_rsv - the block_rsv we're allocating for
3520 * @orig_bytes - the number of bytes we want
3521 * @flush - whether or not we can flush to make our reservation
3522 *
3523 * This will reserve orig_bytes number of bytes from the space info associated
3524 * with the block_rsv. If there is not enough space it will make an attempt to
3525 * flush out space to make room. It will do this by flushing delalloc if
3526 * possible or committing the transaction. If flush is 0 then no attempts to
3527 * regain reservations will be made and this will fail if there is not enough
3528 * space already.
3529 */
3530static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3531 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3532 u64 orig_bytes, int flush)
3398{ 3533{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3534 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3535 u64 used;
3401 u64 num_bytes = orig_bytes; 3536 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3537 int retries = 0;
3403 int ret = 0; 3538 int ret = 0;
3404 bool committed = false; 3539 bool committed = false;
3405 bool flushing = false; 3540 bool flushing = false;
3541 bool wait_ordered = false;
3406 3542
3407again: 3543again:
3408 ret = 0; 3544 ret = 0;
@@ -3419,7 +3555,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3555 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3556 * hold the current transaction open.
3421 */ 3557 */
3422 if (trans) 3558 if (current->journal_info)
3423 return -EAGAIN; 3559 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3560 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3561 !space_info->flush);
@@ -3431,9 +3567,9 @@ again:
3431 } 3567 }
3432 3568
3433 ret = -ENOSPC; 3569 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3570 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3571 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3572 space_info->bytes_may_use;
3437 3573
3438 /* 3574 /*
3439 * The idea here is that we've not already over-reserved the block group 3575 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3578,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3578 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3579 * our reservation.
3444 */ 3580 */
3445 if (unused <= space_info->total_bytes) { 3581 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3582 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3583 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3584 ret = 0;
3450 } else { 3585 } else {
3451 /* 3586 /*
@@ -3461,10 +3596,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3596 * amount plus the amount of bytes that we need for this
3462 * reservation. 3597 * reservation.
3463 */ 3598 */
3464 num_bytes = unused - space_info->total_bytes + 3599 wait_ordered = true;
3600 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3601 (orig_bytes * (retries + 1));
3466 } 3602 }
3467 3603
3604 if (ret) {
3605 u64 profile = btrfs_get_alloc_profile(root, 0);
3606 u64 avail;
3607
3608 /*
3609 * If we have a lot of space that's pinned, don't bother doing
3610 * the overcommit dance yet and just commit the transaction.
3611 */
3612 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3613 do_div(avail, 10);
3614 if (space_info->bytes_pinned >= avail && flush && !committed) {
3615 space_info->flush = 1;
3616 flushing = true;
3617 spin_unlock(&space_info->lock);
3618 ret = may_commit_transaction(root, space_info,
3619 orig_bytes, 1);
3620 if (ret)
3621 goto out;
3622 committed = true;
3623 goto again;
3624 }
3625
3626 spin_lock(&root->fs_info->free_chunk_lock);
3627 avail = root->fs_info->free_chunk_space;
3628
3629 /*
3630 * If we have dup, raid1 or raid10 then only half of the free
3631 * space is actually useable.
3632 */
3633 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3634 BTRFS_BLOCK_GROUP_RAID1 |
3635 BTRFS_BLOCK_GROUP_RAID10))
3636 avail >>= 1;
3637
3638 /*
3639 * If we aren't flushing don't let us overcommit too much, say
3640 * 1/8th of the space. If we can flush, let it overcommit up to
3641 * 1/2 of the space.
3642 */
3643 if (flush)
3644 avail >>= 3;
3645 else
3646 avail >>= 1;
3647 spin_unlock(&root->fs_info->free_chunk_lock);
3648
3649 if (used + num_bytes < space_info->total_bytes + avail) {
3650 space_info->bytes_may_use += orig_bytes;
3651 ret = 0;
3652 } else {
3653 wait_ordered = true;
3654 }
3655 }
3656
3468 /* 3657 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3658 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3659 * to reclaim space we can actually use it instead of somebody else
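The tail of this hunk is the ENOSPC fallback path: when the plain reservation fails, it first considers simply committing the transaction if a lot of space is merely pinned (may_commit_transaction above), and otherwise decides whether to overcommit against the still-unallocated chunk space, halved for DUP/RAID1/RAID10 profiles and then scaled down by the flush flag. A standalone mirror of that last check with invented sizes (the profile bits are stand-ins for the BTRFS_BLOCK_GROUP_* flags):

#include <stdio.h>
#include <stdint.h>

#define PROFILE_DUP    (1 << 0)
#define PROFILE_RAID1  (1 << 1)
#define PROFILE_RAID10 (1 << 2)

/* Same shape as the overcommit check added above, with simplified inputs. */
static int can_overcommit(uint64_t used, uint64_t num_bytes, uint64_t total,
			  uint64_t free_chunk_space, uint64_t profile, int flush)
{
	uint64_t avail = free_chunk_space;

	/* dup/raid1/raid10 keep two copies, so only half the space is usable */
	if (profile & (PROFILE_DUP | PROFILE_RAID1 | PROFILE_RAID10))
		avail >>= 1;

	/* same scaling as the hunk: >>3 when flush is set, >>1 otherwise */
	if (flush)
		avail >>= 3;
	else
		avail >>= 1;

	return used + num_bytes < total + avail;
}

int main(void)
{
	uint64_t gib = 1ULL << 30;

	printf("flush=1: %d\n", can_overcommit(7 * gib, 3 * gib / 2, 8 * gib,
					       4 * gib, PROFILE_RAID1, 1));
	printf("flush=0: %d\n", can_overcommit(7 * gib, 3 * gib / 2, 8 * gib,
					       4 * gib, PROFILE_RAID1, 0));
	return 0;
}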
@@ -3484,7 +3673,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3673 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3674 * metadata until after the IO is completed.
3486 */ 3675 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3676 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3677 if (ret < 0)
3489 goto out; 3678 goto out;
3490 3679
@@ -3496,35 +3685,17 @@ again:
3496 * so go back around and try again. 3685 * so go back around and try again.
3497 */ 3686 */
3498 if (retries < 2) { 3687 if (retries < 2) {
3688 wait_ordered = true;
3499 retries++; 3689 retries++;
3500 goto again; 3690 goto again;
3501 } 3691 }
3502 3692
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3693 ret = -ENOSPC;
3519 if (committed) 3694 if (committed)
3520 goto out; 3695 goto out;
3521 3696
3522 trans = btrfs_join_transaction(root); 3697 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3698 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3699 committed = true;
3529 goto again; 3700 goto again;
3530 } 3701 }
@@ -3542,10 +3713,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3713static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3714 struct btrfs_root *root)
3544{ 3715{
3545 struct btrfs_block_rsv *block_rsv; 3716 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3717
3718 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3719 block_rsv = trans->block_rsv;
3548 else 3720
3721 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3722 block_rsv = root->block_rsv;
3550 3723
3551 if (!block_rsv) 3724 if (!block_rsv)
@@ -3616,7 +3789,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3789 }
3617 if (num_bytes) { 3790 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3791 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3792 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3793 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3794 spin_unlock(&space_info->lock);
3622 } 3795 }
@@ -3640,9 +3813,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3813{
3641 memset(rsv, 0, sizeof(*rsv)); 3814 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3815 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3816}
3647 3817
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3818struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3833,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3833void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3834 struct btrfs_block_rsv *rsv)
3665{ 3835{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3836 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3837 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3838}
3686 3839
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3840static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3841 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3842 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3843{
3692 int ret; 3844 int ret;
3693 3845
3694 if (num_bytes == 0) 3846 if (num_bytes == 0)
3695 return 0; 3847 return 0;
3696 3848
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3849 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3850 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3851 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3852 return 0;
@@ -3703,55 +3855,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3855 return ret;
3704} 3856}
3705 3857
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3858int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3859 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3860 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3861{
3862 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3863}
3864
3865int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3866 struct btrfs_block_rsv *block_rsv,
3867 u64 num_bytes)
3868{
3869 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3870}
3871
3872int btrfs_block_rsv_check(struct btrfs_root *root,
3873 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3874{
3711 u64 num_bytes = 0; 3875 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3876 int ret = -ENOSPC;
3714 3877
3715 if (!block_rsv) 3878 if (!block_rsv)
3716 return 0; 3879 return 0;
3717 3880
3718 spin_lock(&block_rsv->lock); 3881 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3882 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3883 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3884 ret = 0;
3722 num_bytes = min_reserved; 3885 spin_unlock(&block_rsv->lock);
3723 3886
3724 if (block_rsv->reserved >= num_bytes) { 3887 return ret;
3888}
3889
3890static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3891 struct btrfs_block_rsv *block_rsv,
3892 u64 min_reserved, int flush)
3893{
3894 u64 num_bytes = 0;
3895 int ret = -ENOSPC;
3896
3897 if (!block_rsv)
3898 return 0;
3899
3900 spin_lock(&block_rsv->lock);
3901 num_bytes = min_reserved;
3902 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3903 ret = 0;
3726 } else { 3904 else
3727 num_bytes -= block_rsv->reserved; 3905 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3906 spin_unlock(&block_rsv->lock);
3907
3733 if (!ret) 3908 if (!ret)
3734 return 0; 3909 return 0;
3735 3910
3736 if (block_rsv->refill_used) { 3911 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3912 if (!ret) {
3738 num_bytes, 0); 3913 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3914 return 0;
3752 } 3915 }
3753 3916
3754 return -ENOSPC; 3917 return ret;
3918}
3919
3920int btrfs_block_rsv_refill(struct btrfs_root *root,
3921 struct btrfs_block_rsv *block_rsv,
3922 u64 min_reserved)
3923{
3924 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3925}
3926
3927int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3928 struct btrfs_block_rsv *block_rsv,
3929 u64 min_reserved)
3930{
3931 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3755} 3932}
3756 3933
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3934int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3960,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3960 u64 num_bytes;
3784 u64 meta_used; 3961 u64 meta_used;
3785 u64 data_used; 3962 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3963 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3964
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3965 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3966 spin_lock(&sinfo->lock);
@@ -3827,12 +4004,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 4004 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 4005 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 4006 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 4007 sinfo->bytes_may_use += num_bytes;
3831 } 4008 }
3832 4009
3833 if (block_rsv->reserved >= block_rsv->size) { 4010 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 4011 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 4012 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4013 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4014 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4015 block_rsv->full = 1;
@@ -3848,16 +4025,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4025
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4026 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4027 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4028
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4029 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4030 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4031 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4032 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4033 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4034 fs_info->delayed_block_rsv.space_info = space_info;
3861 4035
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4036 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4037 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4039,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4039 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4040 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4041
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4042 update_global_block_rsv(fs_info);
3873} 4043}
3874 4044
@@ -3881,37 +4051,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4051 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4052 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4053 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4054 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4055 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4056}
3916 4057
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4058void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4061,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4061 if (!trans->bytes_reserved)
3921 return; 4062 return;
3922 4063
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4064 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4065 trans->bytes_reserved = 0;
3927} 4066}
3928 4067
@@ -3964,33 +4103,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4103 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4104}
3966 4105
4106/**
4107 * drop_outstanding_extent - drop an outstanding extent
4108 * @inode: the inode we're dropping the extent for
4109 *
4110 * This is called when we are freeing up an outstanding extent, either called
4111 * after an error or after an extent is written. This will return the number of
4112 * reserved extents that need to be freed. This must be called with
4113 * BTRFS_I(inode)->lock held.
4114 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4115static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4116{
4117 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4118 unsigned dropped_extents = 0;
3970 4119
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4120 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4121 BTRFS_I(inode)->outstanding_extents--;
3974 4122
4123 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4124 BTRFS_I(inode)->delalloc_meta_reserved) {
4125 drop_inode_space = 1;
4126 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4127 }
4128
3975 /* 4129 /*
3976 * If we have more or the same amount of outstanding extents than we have 4130 * If we have more or the same amount of outstanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4131 * reserved then we need to leave the reserved extents count alone.
3978 */ 4132 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4133 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4134 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4135 return drop_inode_space;
3982 4136
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4137 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4138 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4139 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4140 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4141}
3990 4142
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4143/**
4144 * calc_csum_metadata_size - return the amount of metadata space that must be
4145 * reserved/free'd for the given bytes.
4146 * @inode: the inode we're manipulating
4147 * @num_bytes: the number of bytes in question
4148 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4149 *
4150 * This adjusts the number of csum_bytes in the inode and then returns the
4151 * correct amount of metadata that must either be reserved or freed. We
4152 * calculate how many checksums we can fit into one leaf and then divide the
4153 * number of bytes that will need to be checksummed by this value to figure out
4154 * how many checksums will be required. If we are adding bytes then the number
4155 * may go up and we will return the number of additional bytes that must be
4156 * reserved. If it is going down we will return the number of bytes that must
4157 * be freed.
4158 *
4159 * This must be called with BTRFS_I(inode)->lock held.
4160 */
4161static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4162 int reserve)
3992{ 4163{
3993 return num_bytes >>= 3; 4164 struct btrfs_root *root = BTRFS_I(inode)->root;
4165 u64 csum_size;
4166 int num_csums_per_leaf;
4167 int num_csums;
4168 int old_csums;
4169
4170 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4171 BTRFS_I(inode)->csum_bytes == 0)
4172 return 0;
4173
4174 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4175 if (reserve)
4176 BTRFS_I(inode)->csum_bytes += num_bytes;
4177 else
4178 BTRFS_I(inode)->csum_bytes -= num_bytes;
4179 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4180 num_csums_per_leaf = (int)div64_u64(csum_size,
4181 sizeof(struct btrfs_csum_item) +
4182 sizeof(struct btrfs_disk_key));
4183 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4184 num_csums = num_csums + num_csums_per_leaf - 1;
4185 num_csums = num_csums / num_csums_per_leaf;
4186
4187 old_csums = old_csums + num_csums_per_leaf - 1;
4188 old_csums = old_csums / num_csums_per_leaf;
4189
4190 /* No change, no need to reserve more */
4191 if (old_csums == num_csums)
4192 return 0;
4193
4194 if (reserve)
4195 return btrfs_calc_trans_metadata_size(root,
4196 num_csums - old_csums);
4197
4198 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4199}
3995 4200
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4201int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
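calc_csum_metadata_size() above converts a byte count into the number of checksum leaves it will occupy and charges or refunds only the difference as csum_bytes grows or shrinks. The same arithmetic as a standalone program, with the sector, leaf and per-item sizes hard-coded to assumed values rather than taken from the superblock and on-disk structures:

#include <stdio.h>
#include <stdint.h>

/* Assumed sizes; the kernel derives these from BTRFS_LEAF_DATA_SIZE and the
 * csum item/disk key structures. */
#define SECTORSIZE      4096ULL
#define LEAF_DATA_SIZE  3970ULL   /* usable bytes per 4K leaf, assumed */
#define CSUM_PLUS_KEY   18ULL     /* per-checksum item + key footprint, assumed */

static uint64_t csum_leaves_for(uint64_t csum_bytes)
{
	uint64_t per_leaf = LEAF_DATA_SIZE / CSUM_PLUS_KEY;
	uint64_t csums = csum_bytes / SECTORSIZE;

	return (csums + per_leaf - 1) / per_leaf;   /* round up, as in the hunk */
}

int main(void)
{
	uint64_t before = 1 << 20;              /* 1 MiB of data already tracked */
	uint64_t after  = before + (8 << 20);   /* reserving another 8 MiB */

	printf("csum leaves: %llu -> %llu\n",
	       (unsigned long long)csum_leaves_for(before),
	       (unsigned long long)csum_leaves_for(after));
	/* only the difference is charged, scaled to bytes by
	 * btrfs_calc_trans_metadata_size() in the real code */
	return 0;
}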
@@ -3998,10 +4203,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3998 struct btrfs_root *root = BTRFS_I(inode)->root; 4203 struct btrfs_root *root = BTRFS_I(inode)->root;
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4205 u64 to_reserve = 0;
4206 u64 csum_bytes;
4001 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int extra_reserve = 0;
4209 int flush = 1;
4002 int ret; 4210 int ret;
4003 4211
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4212 /* Need to be holding the i_mutex here if we aren't free space cache */
4213 if (btrfs_is_free_space_inode(root, inode))
4214 flush = 0;
4215 else
4216 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4217
4218 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4219 schedule_timeout(1);
4006 4220
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4221 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4010,33 +4224,74 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4010 BTRFS_I(inode)->outstanding_extents++; 4224 BTRFS_I(inode)->outstanding_extents++;
4011 4225
4012 if (BTRFS_I(inode)->outstanding_extents > 4226 if (BTRFS_I(inode)->outstanding_extents >
4013 BTRFS_I(inode)->reserved_extents) { 4227 BTRFS_I(inode)->reserved_extents)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4228 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4229 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents;
4017 4230
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4231 /*
4232 * Add an item to reserve for updating the inode when we complete the
4233 * delalloc io.
4234 */
4235 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4236 nr_extents++;
4237 extra_reserve = 1;
4019 } 4238 }
4239
4240 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4241 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4242 csum_bytes = BTRFS_I(inode)->csum_bytes;
4020 spin_unlock(&BTRFS_I(inode)->lock); 4243 spin_unlock(&BTRFS_I(inode)->lock);
4021 4244
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4245 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4246 if (ret) {
4247 u64 to_free = 0;
4025 unsigned dropped; 4248 unsigned dropped;
4249
4250 spin_lock(&BTRFS_I(inode)->lock);
4251 dropped = drop_outstanding_extent(inode);
4026 /* 4252 /*
4027 * We don't need the return value since our reservation failed, 4253 * If the inodes csum_bytes is the same as the original
4028 * we just need to clean up our counter. 4254 * csum_bytes then we know we haven't raced with any free()ers
4255 * so we can just reduce our inodes csum bytes and carry on.
4256 * Otherwise we have to do the normal free thing to account for
4257 * the case that the free side didn't free up its reserve
4258 * because of this outstanding reservation.
4029 */ 4259 */
4030 dropped = drop_outstanding_extent(inode); 4260 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4031 WARN_ON(dropped > 1); 4261 calc_csum_metadata_size(inode, num_bytes, 0);
4262 else
4263 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4264 spin_unlock(&BTRFS_I(inode)->lock);
4265 if (dropped)
4266 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4267
4268 if (to_free)
4269 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4270 return ret;
4033 } 4271 }
4034 4272
4273 spin_lock(&BTRFS_I(inode)->lock);
4274 if (extra_reserve) {
4275 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4276 nr_extents--;
4277 }
4278 BTRFS_I(inode)->reserved_extents += nr_extents;
4279 spin_unlock(&BTRFS_I(inode)->lock);
4280
4035 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4281 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4036 4282
4037 return 0; 4283 return 0;
4038} 4284}
4039 4285
4286/**
4287 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4288 * @inode: the inode to release the reservation for
4289 * @num_bytes: the number of bytes we're releasing
4290 *
4291 * This will release the metadata reservation for an inode. This can be called
4292 * once we complete IO for a given set of bytes to release their metadata
4293 * reservations.
4294 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4295void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4296{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4297 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4299,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4299 unsigned dropped;
4045 4300
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4301 num_bytes = ALIGN(num_bytes, root->sectorsize);
4302 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4303 dropped = drop_outstanding_extent(inode);
4048 4304
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4305 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4306 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4307 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4308 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4309
@@ -4054,6 +4311,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4311 to_free);
4055} 4312}
4056 4313
4314/**
4315 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4316 * @inode: inode we're writing to
4317 * @num_bytes: the number of bytes we want to allocate
4318 *
4319 * This will do the following things
4320 *
4321 * o reserve space in the data space info for num_bytes
4322 * o reserve space in the metadata space info based on number of outstanding
4323 * extents and how much csums will be needed
4324 * o add to the inodes ->delalloc_bytes
4325 * o add it to the fs_info's delalloc inodes list.
4326 *
4327 * This will return 0 for success and -ENOSPC if there is no space left.
4328 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4329int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4330{
4059 int ret; 4331 int ret;
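The doc comment above describes btrfs_delalloc_reserve_space() as a composite: reserve the data space, then the extent and csum metadata, and return -ENOSPC if either part fails; the next hunk documents the matching btrfs_delalloc_release_space(). A small model of that compose-and-unwind shape (the function names and the failure injection are invented for illustration, and the unwind step is how the real helper is assumed to behave when the metadata reservation fails):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

static uint64_t data_reserved, meta_reserved;

static int reserve_data(uint64_t bytes)  { data_reserved += bytes; return 0; }
static void release_data(uint64_t bytes) { data_reserved -= bytes; }

static int reserve_meta(uint64_t bytes)
{
	(void)bytes;
	return -ENOSPC;   /* pretend the metadata space_info is exhausted */
}

/* Shape of btrfs_delalloc_reserve_space(): both reservations or neither. */
static int delalloc_reserve(uint64_t bytes)
{
	int ret = reserve_data(bytes);
	if (ret)
		return ret;

	ret = reserve_meta(bytes);
	if (ret) {
		release_data(bytes);   /* unwind the data reservation */
		return ret;
	}
	return 0;
}

int main(void)
{
	int ret = delalloc_reserve(128 << 10);

	printf("ret=%d data_reserved=%llu meta_reserved=%llu\n", ret,
	       (unsigned long long)data_reserved,
	       (unsigned long long)meta_reserved);
	return 0;
}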
@@ -4071,6 +4343,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4343 return 0;
4072} 4344}
4073 4345
4346/**
4347 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4348 * @inode: inode we're releasing space for
4349 * @num_bytes: the number of bytes we want to free up
4350 *
4351 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4352 * called in the case that we don't need the metadata AND data reservations
4353 * anymore. So if there is an error or we insert an inline extent.
4354 *
4355 * This function will release the metadata space that was not used and will
4356 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4357 * list if there are no delalloc bytes left.
4358 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4359void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4360{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4361 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4375,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4375
4091 /* block accounting for super block */ 4376 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4377 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4378 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4379 if (alloc)
4095 old_val += num_bytes; 4380 old_val += num_bytes;
4096 else 4381 else
4097 old_val -= num_bytes; 4382 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4383 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4384 spin_unlock(&info->delalloc_lock);
4100 4385
4101 while (total) { 4386 while (total) {
@@ -4123,7 +4408,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4408 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4409 spin_lock(&cache->lock);
4125 4410
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4411 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4412 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4413 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4414
@@ -4135,7 +4420,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4420 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4421 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4422 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4423 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4424 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4425 spin_unlock(&cache->lock);
@@ -4187,7 +4471,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4471 if (reserved) {
4188 cache->reserved -= num_bytes; 4472 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4473 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4474 }
4192 spin_unlock(&cache->lock); 4475 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4476 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4498,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4498}
4216 4499
4217/* 4500/*
4218 * update size of reserved extents. this function may return -EAGAIN 4501 * this function must be called within a transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4502 */
4503int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4504 struct btrfs_root *root,
4505 u64 bytenr, u64 num_bytes)
4506{
4507 struct btrfs_block_group_cache *cache;
4508
4509 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4510 BUG_ON(!cache);
4511
4512 /*
4513 * pull in the free space cache (if any) so that our pin
4514 * removes the free space from the cache. We have load_only set
4515 * to one because the slow code to read in the free extents does check
4516 * the pinned extents.
4517 */
4518 cache_block_group(cache, trans, root, 1);
4519
4520 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4521
4522 /* remove us from the free space cache (if we're there at all) */
4523 btrfs_remove_free_space(cache, bytenr, num_bytes);
4524 btrfs_put_block_group(cache);
4525 return 0;
4526}
4527
4528/**
4529 * btrfs_update_reserved_bytes - update the block_group and space info counters
4530 * @cache: The cache we are manipulating
4531 * @num_bytes: The number of bytes in question
4532 * @reserve: One of the reservation enums
4533 *
4534 * This is called by the allocator when it reserves space, or by somebody who is
4535 * freeing space that was never actually used on disk. For example if you
4536 * reserve some space for a new leaf in transaction A and before transaction A
4537 * commits you free that leaf, you call this with reserve set to 0 in order to
4538 * clear the reservation.
4539 *
4540 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4541 * ENOSPC accounting. For data we handle the reservation through clearing the
4542 * delalloc bits in the io_tree. We have to do this since we could end up
4543 * allocating less disk space for the amount of data we have reserved in the
4544 * case of compression.
4545 *
4546 * If this is a reservation and the block group has become read only we cannot
4547 * make the reservation and return -EAGAIN, otherwise this function always
4548 * succeeds.
4220 */ 4549 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4550static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4551 u64 num_bytes, int reserve)
4223{ 4552{
4553 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4554 int ret = 0;
4225 if (sinfo) { 4555 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4556 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4557 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4558 if (cache->ro) {
4248 ret = -EAGAIN; 4559 ret = -EAGAIN;
4249 } else { 4560 } else {
4250 if (reserve) 4561 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4562 space_info->bytes_reserved += num_bytes;
4252 else 4563 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4564 BUG_ON(space_info->bytes_may_use < num_bytes);
4565 space_info->bytes_may_use -= num_bytes;
4566 }
4254 } 4567 }
4255 spin_unlock(&cache->lock); 4568 } else {
4569 if (cache->ro)
4570 space_info->bytes_readonly += num_bytes;
4571 cache->reserved -= num_bytes;
4572 space_info->bytes_reserved -= num_bytes;
4573 space_info->reservation_progress++;
4256 } 4574 }
4575 spin_unlock(&cache->lock);
4576 spin_unlock(&space_info->lock);
4257 return ret; 4577 return ret;
4258} 4578}
4259 4579
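The kernel-doc above introduces the reservation enums used by the rewritten helper. A condensed sketch of the intended counter transitions, assuming the RESERVE_FREE/RESERVE_ALLOC/RESERVE_ALLOC_NO_ACCOUNT values referenced elsewhere in this diff (illustration only, not a literal excerpt):

        /* allocator path: space was promised earlier through bytes_may_use */
        ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);
        /* bytes_may_use -= num_bytes, bytes_reserved += num_bytes, -EAGAIN if ro */

        /* log replay and similar paths with no prior ENOSPC accounting */
        ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC_NO_ACCOUNT);
        /* only bytes_reserved += num_bytes */

        /* dropping a reservation that never made it to disk */
        btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
        /* bytes_reserved -= num_bytes (bytes_readonly += num_bytes if the group went ro) */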
@@ -4319,13 +4639,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4639 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4640 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4641 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4642 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4643 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4644 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4645 spin_unlock(&cache->space_info->lock);
4331 } 4646 }
@@ -4340,11 +4655,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4655{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4656 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4657 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4658 u64 start;
4346 u64 end; 4659 u64 end;
4347 int idx;
4348 int ret; 4660 int ret;
4349 4661
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4662 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4679,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4679 cond_resched();
4368 } 4680 }
4369 4681
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4682 return 0;
4395} 4683}
4396 4684
@@ -4668,7 +4956,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4956 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4957 u64 parent, int last_ref)
4670{ 4958{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4959 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4960 int ret;
4674 4961
@@ -4683,64 +4970,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4970 if (!last_ref)
4684 return; 4971 return;
4685 4972
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4973 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4974
4691 if (btrfs_header_generation(buf) == trans->transid) { 4975 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4976 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4977 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4978 if (!ret)
4695 goto pin; 4979 goto out;
4696 } 4980 }
4697 4981
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4982 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4983 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4984 goto out;
4701 } 4985 }
4702 4986
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4987 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4988
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4989 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4990 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4991 }
4745out: 4992out:
4746 /* 4993 /*
@@ -4876,17 +5123,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4876 struct btrfs_root *root = orig_root->fs_info->extent_root; 5123 struct btrfs_root *root = orig_root->fs_info->extent_root;
4877 struct btrfs_free_cluster *last_ptr = NULL; 5124 struct btrfs_free_cluster *last_ptr = NULL;
4878 struct btrfs_block_group_cache *block_group = NULL; 5125 struct btrfs_block_group_cache *block_group = NULL;
5126 struct btrfs_block_group_cache *used_block_group;
4879 int empty_cluster = 2 * 1024 * 1024; 5127 int empty_cluster = 2 * 1024 * 1024;
4880 int allowed_chunk_alloc = 0; 5128 int allowed_chunk_alloc = 0;
4881 int done_chunk_alloc = 0; 5129 int done_chunk_alloc = 0;
4882 struct btrfs_space_info *space_info; 5130 struct btrfs_space_info *space_info;
4883 int last_ptr_loop = 0;
4884 int loop = 0; 5131 int loop = 0;
4885 int index = 0; 5132 int index = 0;
5133 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5134 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5135 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5136 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5137 bool failed_alloc = false;
4889 bool use_cluster = true; 5138 bool use_cluster = true;
5139 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5140 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5141 u64 ideal_cache_offset = 0;
4892 5142
@@ -4939,6 +5189,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4939ideal_cache: 5189ideal_cache:
4940 block_group = btrfs_lookup_block_group(root->fs_info, 5190 block_group = btrfs_lookup_block_group(root->fs_info,
4941 search_start); 5191 search_start);
5192 used_block_group = block_group;
4942 /* 5193 /*
4943 * we don't want to use the block group if it doesn't match our 5194 * we don't want to use the block group if it doesn't match our
4944 * allocation bits, or if its not cached. 5195 * allocation bits, or if its not cached.
@@ -4969,12 +5220,14 @@ ideal_cache:
4969 } 5220 }
4970 } 5221 }
4971search: 5222search:
5223 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5224 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5225 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5226 list) {
4975 u64 offset; 5227 u64 offset;
4976 int cached; 5228 int cached;
4977 5229
5230 used_block_group = block_group;
4978 btrfs_get_block_group(block_group); 5231 btrfs_get_block_group(block_group);
4979 search_start = block_group->key.objectid; 5232 search_start = block_group->key.objectid;
4980 5233
@@ -4998,13 +5251,15 @@ search:
4998 } 5251 }
4999 5252
5000have_block_group: 5253have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5254 cached = block_group_cache_done(block_group);
5255 if (unlikely(!cached)) {
5002 u64 free_percent; 5256 u64 free_percent;
5003 5257
5258 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5259 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5260 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5261 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5262 goto alloc;
5008 5263
5009 free_percent = btrfs_block_group_used(&block_group->item); 5264 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5265 free_percent *= 100;
@@ -5026,7 +5281,6 @@ have_block_group:
5026 orig_root, 0); 5281 orig_root, 0);
5027 BUG_ON(ret); 5282 BUG_ON(ret);
5028 } 5283 }
5029 found_uncached_bg = true;
5030 5284
5031 /* 5285 /*
5032 * If loop is set for cached only, try the next block 5286 * If loop is set for cached only, try the next block
@@ -5036,94 +5290,80 @@ have_block_group:
5036 goto loop; 5290 goto loop;
5037 } 5291 }
5038 5292
5039 cached = block_group_cache_done(block_group); 5293alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5294 if (unlikely(block_group->ro))
5044 goto loop; 5295 goto loop;
5045 5296
5046 spin_lock(&block_group->free_space_ctl->tree_lock); 5297 spin_lock(&block_group->free_space_ctl->tree_lock);
5047 if (cached && 5298 if (cached &&
5048 block_group->free_space_ctl->free_space < 5299 block_group->free_space_ctl->free_space <
5049 num_bytes + empty_size) { 5300 num_bytes + empty_cluster + empty_size) {
5050 spin_unlock(&block_group->free_space_ctl->tree_lock); 5301 spin_unlock(&block_group->free_space_ctl->tree_lock);
5051 goto loop; 5302 goto loop;
5052 } 5303 }
5053 spin_unlock(&block_group->free_space_ctl->tree_lock); 5304 spin_unlock(&block_group->free_space_ctl->tree_lock);
5054 5305
5055 /* 5306 /*
5056 * Ok we want to try and use the cluster allocator, so lets look 5307 * Ok we want to try and use the cluster allocator, so
5057 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5308 * let's look there
5058 * have tried the cluster allocator plenty of times at this
5059 * point and not have found anything, so we are likely way too
5060 * fragmented for the clustering stuff to find anything, so lets
5061 * just skip it and let the allocator find whatever block it can
5062 * find
5063 */ 5309 */
5064 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5310 if (last_ptr) {
5065 /* 5311 /*
5066 * the refill lock keeps out other 5312 * the refill lock keeps out other
5067 * people trying to start a new cluster 5313 * people trying to start a new cluster
5068 */ 5314 */
5069 spin_lock(&last_ptr->refill_lock); 5315 spin_lock(&last_ptr->refill_lock);
5070 if (last_ptr->block_group && 5316 used_block_group = last_ptr->block_group;
5071 (last_ptr->block_group->ro || 5317 if (used_block_group != block_group &&
5072 !block_group_bits(last_ptr->block_group, data))) { 5318 (!used_block_group ||
5073 offset = 0; 5319 used_block_group->ro ||
5320 !block_group_bits(used_block_group, data))) {
5321 used_block_group = block_group;
5074 goto refill_cluster; 5322 goto refill_cluster;
5075 } 5323 }
5076 5324
5077 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5325 if (used_block_group != block_group)
5078 num_bytes, search_start); 5326 btrfs_get_block_group(used_block_group);
5327
5328 offset = btrfs_alloc_from_cluster(used_block_group,
5329 last_ptr, num_bytes, used_block_group->key.objectid);
5079 if (offset) { 5330 if (offset) {
5080 /* we have a block, we're done */ 5331 /* we have a block, we're done */
5081 spin_unlock(&last_ptr->refill_lock); 5332 spin_unlock(&last_ptr->refill_lock);
5082 goto checks; 5333 goto checks;
5083 } 5334 }
5084 5335
5085 spin_lock(&last_ptr->lock); 5336 WARN_ON(last_ptr->block_group != used_block_group);
5086 /* 5337 if (used_block_group != block_group) {
5087 * whoops, this cluster doesn't actually point to 5338 btrfs_put_block_group(used_block_group);
5088 * this block group. Get a ref on the block 5339 used_block_group = block_group;
5089 * group is does point to and try again
5090 */
5091 if (!last_ptr_loop && last_ptr->block_group &&
5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5095
5096 btrfs_put_block_group(block_group);
5097 block_group = last_ptr->block_group;
5098 btrfs_get_block_group(block_group);
5099 spin_unlock(&last_ptr->lock);
5100 spin_unlock(&last_ptr->refill_lock);
5101
5102 last_ptr_loop = 1;
5103 search_start = block_group->key.objectid;
5104 /*
5105 * we know this block group is properly
5106 * in the list because
5107 * btrfs_remove_block_group, drops the
5108 * cluster before it removes the block
5109 * group from the list
5110 */
5111 goto have_block_group;
5112 } 5340 }
5113 spin_unlock(&last_ptr->lock);
5114refill_cluster: 5341refill_cluster:
5342 BUG_ON(used_block_group != block_group);
5343 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5344 * set up a new cluster, so let's just skip it
5345 * and let the allocator find whatever block
5346 * it can find. If we reach this point, we
5347 * will have tried the cluster allocator
5348 * plenty of times and not have found
5349 * anything, so we are likely way too
5350 * fragmented for the clustering stuff to find
5351 * anything. */
5352 if (loop >= LOOP_NO_EMPTY_SIZE) {
5353 spin_unlock(&last_ptr->refill_lock);
5354 goto unclustered_alloc;
5355 }
5356
5115 /* 5357 /*
5116 * this cluster didn't work out, free it and 5358 * this cluster didn't work out, free it and
5117 * start over 5359 * start over
5118 */ 5360 */
5119 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5361 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5120 5362
5121 last_ptr_loop = 0;
5122
5123 /* allocate a cluster in this block group */ 5363 /* allocate a cluster in this block group */
5124 ret = btrfs_find_space_cluster(trans, root, 5364 ret = btrfs_find_space_cluster(trans, root,
5125 block_group, last_ptr, 5365 block_group, last_ptr,
5126 offset, num_bytes, 5366 search_start, num_bytes,
5127 empty_cluster + empty_size); 5367 empty_cluster + empty_size);
5128 if (ret == 0) { 5368 if (ret == 0) {
5129 /* 5369 /*
@@ -5159,6 +5399,7 @@ refill_cluster:
5159 goto loop; 5399 goto loop;
5160 } 5400 }
5161 5401
5402unclustered_alloc:
5162 offset = btrfs_find_space_for_alloc(block_group, search_start, 5403 offset = btrfs_find_space_for_alloc(block_group, search_start,
5163 num_bytes, empty_size); 5404 num_bytes, empty_size);
5164 /* 5405 /*
@@ -5177,20 +5418,22 @@ refill_cluster:
5177 failed_alloc = true; 5418 failed_alloc = true;
5178 goto have_block_group; 5419 goto have_block_group;
5179 } else if (!offset) { 5420 } else if (!offset) {
5421 if (!cached)
5422 have_caching_bg = true;
5180 goto loop; 5423 goto loop;
5181 } 5424 }
5182checks: 5425checks:
5183 search_start = stripe_align(root, offset); 5426 search_start = stripe_align(root, offset);
5184 /* move on to the next group */ 5427 /* move on to the next group */
5185 if (search_start + num_bytes >= search_end) { 5428 if (search_start + num_bytes >= search_end) {
5186 btrfs_add_free_space(block_group, offset, num_bytes); 5429 btrfs_add_free_space(used_block_group, offset, num_bytes);
5187 goto loop; 5430 goto loop;
5188 } 5431 }
5189 5432
5190 /* move on to the next group */ 5433 /* move on to the next group */
5191 if (search_start + num_bytes > 5434 if (search_start + num_bytes >
5192 block_group->key.objectid + block_group->key.offset) { 5435 used_block_group->key.objectid + used_block_group->key.offset) {
5193 btrfs_add_free_space(block_group, offset, num_bytes); 5436 btrfs_add_free_space(used_block_group, offset, num_bytes);
5194 goto loop; 5437 goto loop;
5195 } 5438 }
5196 5439
@@ -5198,14 +5441,14 @@ checks:
5198 ins->offset = num_bytes; 5441 ins->offset = num_bytes;
5199 5442
5200 if (offset < search_start) 5443 if (offset < search_start)
5201 btrfs_add_free_space(block_group, offset, 5444 btrfs_add_free_space(used_block_group, offset,
5202 search_start - offset); 5445 search_start - offset);
5203 BUG_ON(offset > search_start); 5446 BUG_ON(offset > search_start);
5204 5447
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5448 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5449 alloc_type);
5207 if (ret == -EAGAIN) { 5450 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5451 btrfs_add_free_space(used_block_group, offset, num_bytes);
5209 goto loop; 5452 goto loop;
5210 } 5453 }
5211 5454
@@ -5214,19 +5457,26 @@ checks:
5214 ins->offset = num_bytes; 5457 ins->offset = num_bytes;
5215 5458
5216 if (offset < search_start) 5459 if (offset < search_start)
5217 btrfs_add_free_space(block_group, offset, 5460 btrfs_add_free_space(used_block_group, offset,
5218 search_start - offset); 5461 search_start - offset);
5219 BUG_ON(offset > search_start); 5462 BUG_ON(offset > search_start);
5463 if (used_block_group != block_group)
5464 btrfs_put_block_group(used_block_group);
5220 btrfs_put_block_group(block_group); 5465 btrfs_put_block_group(block_group);
5221 break; 5466 break;
5222loop: 5467loop:
5223 failed_cluster_refill = false; 5468 failed_cluster_refill = false;
5224 failed_alloc = false; 5469 failed_alloc = false;
5225 BUG_ON(index != get_block_group_index(block_group)); 5470 BUG_ON(index != get_block_group_index(block_group));
5471 if (used_block_group != block_group)
5472 btrfs_put_block_group(used_block_group);
5226 btrfs_put_block_group(block_group); 5473 btrfs_put_block_group(block_group);
5227 } 5474 }
5228 up_read(&space_info->groups_sem); 5475 up_read(&space_info->groups_sem);
5229 5476
5477 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5478 goto search;
5479
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5480 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5481 goto search;
5232 5482
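Taken together, the find_free_extent() hunks above reduce each block group to a three-stage attempt; a simplified outline of the new order, condensed from the code above rather than copied from it:

        /* 1) try the existing cluster, which may live in another block group */
        offset = btrfs_alloc_from_cluster(used_block_group, last_ptr,
                                          num_bytes, used_block_group->key.objectid);

        /* 2) below LOOP_NO_EMPTY_SIZE, throw the cluster away and refill it here */
        btrfs_return_cluster_to_free_space(NULL, last_ptr);
        ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr,
                                       search_start, num_bytes,
                                       empty_cluster + empty_size);

        /* 3) unclustered_alloc: plain free-space search in this block group */
        offset = btrfs_find_space_for_alloc(block_group, search_start,
                                            num_bytes, empty_size);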
@@ -5325,7 +5575,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5575 int index = 0;
5326 5576
5327 spin_lock(&info->lock); 5577 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5578 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5579 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5580 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5581 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5582 info->bytes_readonly),
@@ -5411,7 +5662,8 @@ again:
5411 return ret; 5662 return ret;
5412} 5663}
5413 5664
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5665static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5666 u64 start, u64 len, int pin)
5415{ 5667{
5416 struct btrfs_block_group_cache *cache; 5668 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5669 int ret = 0;
@@ -5426,8 +5678,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5678 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5679 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5680
5429 btrfs_add_free_space(cache, start, len); 5681 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5682 pin_down_extent(root, cache, start, len, 1);
5683 else {
5684 btrfs_add_free_space(cache, start, len);
5685 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5686 }
5431 btrfs_put_block_group(cache); 5687 btrfs_put_block_group(cache);
5432 5688
5433 trace_btrfs_reserved_extent_free(root, start, len); 5689 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5691,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5691 return ret;
5436} 5692}
5437 5693
5694int btrfs_free_reserved_extent(struct btrfs_root *root,
5695 u64 start, u64 len)
5696{
5697 return __btrfs_free_reserved_extent(root, start, len, 0);
5698}
5699
5700int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5701 u64 start, u64 len)
5702{
5703 return __btrfs_free_reserved_extent(root, start, len, 1);
5704}
5705
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5706static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5707 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5708 u64 parent, u64 root_objectid,
@@ -5630,7 +5898,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5898 put_caching_control(caching_ctl);
5631 } 5899 }
5632 5900
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5901 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5902 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5903 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5904 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5905 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5956,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5956 block_rsv = get_block_rsv(trans, root);
5688 5957
5689 if (block_rsv->size == 0) { 5958 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5959 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5960 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5961 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5962 * the global reserve.
@@ -5708,13 +5976,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5976 if (!ret)
5709 return block_rsv; 5977 return block_rsv;
5710 if (ret) { 5978 if (ret) {
5711 WARN_ON(1); 5979 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5980 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5981 /*DEFAULT_RATELIMIT_BURST*/ 2);
5982 if (__ratelimit(&_rs)) {
5983 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5984 WARN_ON(1);
5985 }
5986 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5987 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5988 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5989 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5990 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6862,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6862 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6863
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6864 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6865 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6866 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6867 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6868 cache->ro = 1;
6602 ret = 0; 6869 ret = 0;
6603 } 6870 }
@@ -6964,7 +7231,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7231 struct btrfs_space_info,
6965 list); 7232 list);
6966 if (space_info->bytes_pinned > 0 || 7233 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7234 space_info->bytes_reserved > 0 ||
7235 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7236 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7237 dump_space_info(space_info, 0, 0);
6970 } 7238 }
@@ -7006,14 +7274,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7274 return -ENOMEM;
7007 path->reada = 1; 7275 path->reada = 1;
7008 7276
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7277 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7278 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7279 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7280 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7281 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7282 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7283
7018 while (1) { 7284 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7285 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7518,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7518 goto out;
7253 } 7519 }
7254 7520
7255 inode = lookup_free_space_inode(root, block_group, path); 7521 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7522 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7523 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7524 BUG_ON(ret);
@@ -7268,7 +7534,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7534 spin_unlock(&block_group->lock);
7269 } 7535 }
7270 /* One for our lookup ref */ 7536 /* One for our lookup ref */
7271 iput(inode); 7537 btrfs_add_delayed_iput(inode);
7272 } 7538 }
7273 7539
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7540 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7605,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7605 int mixed = 0;
7340 int ret; 7606 int ret;
7341 7607
7342 disk_super = &fs_info->super_copy; 7608 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7609 if (!btrfs_super_root(disk_super))
7344 return 1; 7610 return 1;
7345 7611
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,202 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) {
939 err = -ENOMEM;
940 goto out;
941 }
942 err = insert_state(tree, prealloc, start, end, &bits);
943 prealloc = NULL;
944 BUG_ON(err == -EEXIST);
945 goto out;
946 }
947 state = rb_entry(node, struct extent_state, rb_node);
948hit_next:
949 last_start = state->start;
950 last_end = state->end;
951
952 /*
953 * | ---- desired range ---- |
954 * | state |
955 *
956 * Just lock what we found and keep going
957 */
958 if (state->start == start && state->end <= end) {
959 struct rb_node *next_node;
960
961 set_state_bits(tree, state, &bits);
962 clear_state_bit(tree, state, &clear_bits, 0);
963
964 merge_state(tree, state);
965 if (last_end == (u64)-1)
966 goto out;
967
968 start = last_end + 1;
969 next_node = rb_next(&state->rb_node);
970 if (next_node && start < end && prealloc && !need_resched()) {
971 state = rb_entry(next_node, struct extent_state,
972 rb_node);
973 if (state->start == start)
974 goto hit_next;
975 }
976 goto search_again;
977 }
978
979 /*
980 * | ---- desired range ---- |
981 * | state |
982 * or
983 * | ------------- state -------------- |
984 *
985 * We need to split the extent we found, and may flip bits on
986 * second half.
987 *
988 * If the extent we found extends past our
989 * range, we just split and search again. It'll get split
990 * again the next time though.
991 *
992 * If the extent we found is inside our range, we set the
993 * desired bit on it.
994 */
995 if (state->start < start) {
996 prealloc = alloc_extent_state_atomic(prealloc);
997 if (!prealloc) {
998 err = -ENOMEM;
999 goto out;
1000 }
1001 err = split_state(tree, state, prealloc, start);
1002 BUG_ON(err == -EEXIST);
1003 prealloc = NULL;
1004 if (err)
1005 goto out;
1006 if (state->end <= end) {
1007 set_state_bits(tree, state, &bits);
1008 clear_state_bit(tree, state, &clear_bits, 0);
1009 merge_state(tree, state);
1010 if (last_end == (u64)-1)
1011 goto out;
1012 start = last_end + 1;
1013 }
1014 goto search_again;
1015 }
1016 /*
1017 * | ---- desired range ---- |
1018 * | state | or | state |
1019 *
1020 * There's a hole, we need to insert something in it and
1021 * ignore the extent we found.
1022 */
1023 if (state->start > start) {
1024 u64 this_end;
1025 if (end < last_start)
1026 this_end = end;
1027 else
1028 this_end = last_start - 1;
1029
1030 prealloc = alloc_extent_state_atomic(prealloc);
1031 if (!prealloc) {
1032 err = -ENOMEM;
1033 goto out;
1034 }
1035
1036 /*
1037 * Avoid freeing 'prealloc' if it can be merged with
1038 * the later extent.
1039 */
1040 err = insert_state(tree, prealloc, start, this_end,
1041 &bits);
1042 BUG_ON(err == -EEXIST);
1043 if (err) {
1044 free_extent_state(prealloc);
1045 prealloc = NULL;
1046 goto out;
1047 }
1048 prealloc = NULL;
1049 start = this_end + 1;
1050 goto search_again;
1051 }
1052 /*
1053 * | ---- desired range ---- |
1054 * | state |
1055 * We need to split the extent, and set the bit
1056 * on the first half
1057 */
1058 if (state->start <= end && state->end > end) {
1059 prealloc = alloc_extent_state_atomic(prealloc);
1060 if (!prealloc) {
1061 err = -ENOMEM;
1062 goto out;
1063 }
1064
1065 err = split_state(tree, state, prealloc, end + 1);
1066 BUG_ON(err == -EEXIST);
1067
1068 set_state_bits(tree, prealloc, &bits);
1069 clear_state_bit(tree, prealloc, &clear_bits, 0);
1070
1071 merge_state(tree, prealloc);
1072 prealloc = NULL;
1073 goto out;
1074 }
1075
1076 goto search_again;
1077
1078out:
1079 spin_unlock(&tree->lock);
1080 if (prealloc)
1081 free_extent_state(prealloc);
1082
1083 return err;
1084
1085search_again:
1086 if (start > end)
1087 goto out;
1088 spin_unlock(&tree->lock);
1089 if (mask & __GFP_WAIT)
1090 cond_resched();
1091 goto again;
1092}
1093
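As the kernel-doc for convert_extent_bit() notes, it is only safe for mergeable bits. A hedged usage sketch, assuming a caller that wants to flip a byte range of an inode's io_tree from delalloc to dirty in one walk (the range variables are illustrative):

        /* convert [start, end] from EXTENT_DELALLOC to EXTENT_DIRTY in one pass */
        ret = convert_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
                                 EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
        if (ret)        /* -ENOMEM is the only expected failure */
                return ret;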
897/* wrappers around set/clear extent bit */ 1094/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1095int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1096 gfp_t mask)
@@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1116 struct extent_state **cached_state, gfp_t mask)
920{ 1117{
921 return set_extent_bit(tree, start, end, 1118 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1119 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1120 0, NULL, cached_state, mask);
924} 1121}
925 1122
@@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1796 return 0;
1600} 1797}
1601 1798
1799/*
1800 * When IO fails, either with EIO or csum verification fails, we
1801 * try other mirrors that might have a good copy of the data. This
1802 * io_failure_record is used to record state as we go through all the
1803 * mirrors. If another mirror has good data, the page is set up to date
1804 * and things continue. If a good mirror can't be found, the original
1805 * bio end_io callback is called to indicate things have failed.
1806 */
1807struct io_failure_record {
1808 struct page *page;
1809 u64 start;
1810 u64 len;
1811 u64 logical;
1812 unsigned long bio_flags;
1813 int this_mirror;
1814 int failed_mirror;
1815 int in_validation;
1816};
1817
1818static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1819 int did_repair)
1820{
1821 int ret;
1822 int err = 0;
1823 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1824
1825 set_state_private(failure_tree, rec->start, 0);
1826 ret = clear_extent_bits(failure_tree, rec->start,
1827 rec->start + rec->len - 1,
1828 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1829 if (ret)
1830 err = ret;
1831
1832 if (did_repair) {
1833 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1834 rec->start + rec->len - 1,
1835 EXTENT_DAMAGED, GFP_NOFS);
1836 if (ret && !err)
1837 err = ret;
1838 }
1839
1840 kfree(rec);
1841 return err;
1842}
1843
1844static void repair_io_failure_callback(struct bio *bio, int err)
1845{
1846 complete(bio->bi_private);
1847}
1848
1849/*
1850 * this bypasses the standard btrfs submit functions deliberately, as
1851 * the standard behavior is to write all copies in a raid setup. here we only
1852 * want to write the one bad copy. so we do the mapping for ourselves and issue
1853 * submit_bio directly.
1854 * to avoid any synchronization issues, wait for the data after writing, which
1855 * actually prevents the read that triggered the error from finishing.
1856 * currently, there can be no more than two copies of every data bit. thus,
1857 * exactly one rewrite is required.
1858 */
1859int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1860 u64 length, u64 logical, struct page *page,
1861 int mirror_num)
1862{
1863 struct bio *bio;
1864 struct btrfs_device *dev;
1865 DECLARE_COMPLETION_ONSTACK(compl);
1866 u64 map_length = 0;
1867 u64 sector;
1868 struct btrfs_bio *bbio = NULL;
1869 int ret;
1870
1871 BUG_ON(!mirror_num);
1872
1873 bio = bio_alloc(GFP_NOFS, 1);
1874 if (!bio)
1875 return -EIO;
1876 bio->bi_private = &compl;
1877 bio->bi_end_io = repair_io_failure_callback;
1878 bio->bi_size = 0;
1879 map_length = length;
1880
1881 ret = btrfs_map_block(map_tree, WRITE, logical,
1882 &map_length, &bbio, mirror_num);
1883 if (ret) {
1884 bio_put(bio);
1885 return -EIO;
1886 }
1887 BUG_ON(mirror_num != bbio->mirror_num);
1888 sector = bbio->stripes[mirror_num-1].physical >> 9;
1889 bio->bi_sector = sector;
1890 dev = bbio->stripes[mirror_num-1].dev;
1891 kfree(bbio);
1892 if (!dev || !dev->bdev || !dev->writeable) {
1893 bio_put(bio);
1894 return -EIO;
1895 }
1896 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl);
1900
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1902 /* try to remap that extent elsewhere? */
1903 bio_put(bio);
1904 return -EIO;
1905 }
1906
1907 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1908 "sector %llu)\n", page->mapping->host->i_ino, start,
1909 dev->name, sector);
1910
1911 bio_put(bio);
1912 return 0;
1913}
1914
1915/*
1916 * each time an IO finishes, we do a fast check in the IO failure tree
1917 * to see if we need to process or clean up an io_failure_record
1918 */
1919static int clean_io_failure(u64 start, struct page *page)
1920{
1921 u64 private;
1922 u64 private_failure;
1923 struct io_failure_record *failrec;
1924 struct btrfs_mapping_tree *map_tree;
1925 struct extent_state *state;
1926 int num_copies;
1927 int did_repair = 0;
1928 int ret;
1929 struct inode *inode = page->mapping->host;
1930
1931 private = 0;
1932 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1933 (u64)-1, 1, EXTENT_DIRTY, 0);
1934 if (!ret)
1935 return 0;
1936
1937 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1938 &private_failure);
1939 if (ret)
1940 return 0;
1941
1942 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1943 BUG_ON(!failrec->this_mirror);
1944
1945 if (failrec->in_validation) {
1946 /* there was no real error, just free the record */
1947 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1948 failrec->start);
1949 did_repair = 1;
1950 goto out;
1951 }
1952
1953 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1954 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1955 failrec->start,
1956 EXTENT_LOCKED);
1957 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1958
1959 if (state && state->start == failrec->start) {
1960 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1961 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1962 failrec->len);
1963 if (num_copies > 1) {
1964 ret = repair_io_failure(map_tree, start, failrec->len,
1965 failrec->logical, page,
1966 failrec->failed_mirror);
1967 did_repair = !ret;
1968 }
1969 }
1970
1971out:
1972 if (!ret)
1973 ret = free_io_failure(inode, failrec, did_repair);
1974
1975 return ret;
1976}
1977
1978/*
1979 * this is a generic handler for readpage errors (default
1980 * readpage_io_failed_hook). if other copies exist, read those and write back
1981 * good data to the failed position. does not try to remap the failed
1982 * extent elsewhere, hoping the device will be smart enough to do this as
1983 * needed
1984 */
1985
1986static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1987 u64 start, u64 end, int failed_mirror,
1988 struct extent_state *state)
1989{
1990 struct io_failure_record *failrec = NULL;
1991 u64 private;
1992 struct extent_map *em;
1993 struct inode *inode = page->mapping->host;
1994 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1995 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1996 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1997 struct bio *bio;
1998 int num_copies;
1999 int ret;
2000 int read_mode;
2001 u64 logical;
2002
2003 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2004
2005 ret = get_state_private(failure_tree, start, &private);
2006 if (ret) {
2007 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2008 if (!failrec)
2009 return -ENOMEM;
2010 failrec->start = start;
2011 failrec->len = end - start + 1;
2012 failrec->this_mirror = 0;
2013 failrec->bio_flags = 0;
2014 failrec->in_validation = 0;
2015
2016 read_lock(&em_tree->lock);
2017 em = lookup_extent_mapping(em_tree, start, failrec->len);
2018 if (!em) {
2019 read_unlock(&em_tree->lock);
2020 kfree(failrec);
2021 return -EIO;
2022 }
2023
2024 if (em->start > start || em->start + em->len < start) {
2025 free_extent_map(em);
2026 em = NULL;
2027 }
2028 read_unlock(&em_tree->lock);
2029
2030 if (!em || IS_ERR(em)) {
2031 kfree(failrec);
2032 return -EIO;
2033 }
2034 logical = start - em->start;
2035 logical = em->block_start + logical;
2036 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2037 logical = em->block_start;
2038 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2039 extent_set_compress_type(&failrec->bio_flags,
2040 em->compress_type);
2041 }
2042 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2043 "len=%llu\n", logical, start, failrec->len);
2044 failrec->logical = logical;
2045 free_extent_map(em);
2046
2047 /* set the bits in the private failure tree */
2048 ret = set_extent_bits(failure_tree, start, end,
2049 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2050 if (ret >= 0)
2051 ret = set_state_private(failure_tree, start,
2052 (u64)(unsigned long)failrec);
2053 /* set the bits in the inode's tree */
2054 if (ret >= 0)
2055 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2056 GFP_NOFS);
2057 if (ret < 0) {
2058 kfree(failrec);
2059 return ret;
2060 }
2061 } else {
2062 failrec = (struct io_failure_record *)(unsigned long)private;
2063 pr_debug("bio_readpage_error: (found) logical=%llu, "
2064 "start=%llu, len=%llu, validation=%d\n",
2065 failrec->logical, failrec->start, failrec->len,
2066 failrec->in_validation);
2067 /*
2068 * when data can be on disk more than twice, add to failrec here
2069 * (e.g. with a list for failed_mirror) to make
2070 * clean_io_failure() clean all those errors at once.
2071 */
2072 }
2073 num_copies = btrfs_num_copies(
2074 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2075 failrec->logical, failrec->len);
2076 if (num_copies == 1) {
2077 /*
2078 * we only have a single copy of the data, so don't bother with
2079 * all the retry and error correction code that follows. no
2080 * matter what the error is, it is very likely to persist.
2081 */
2082 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2083 "state=%p, num_copies=%d, next_mirror %d, "
2084 "failed_mirror %d\n", state, num_copies,
2085 failrec->this_mirror, failed_mirror);
2086 free_io_failure(inode, failrec, 0);
2087 return -EIO;
2088 }
2089
2090 if (!state) {
2091 spin_lock(&tree->lock);
2092 state = find_first_extent_bit_state(tree, failrec->start,
2093 EXTENT_LOCKED);
2094 if (state && state->start != failrec->start)
2095 state = NULL;
2096 spin_unlock(&tree->lock);
2097 }
2098
2099 /*
2100 * there are two premises:
2101 * a) deliver good data to the caller
2102 * b) correct the bad sectors on disk
2103 */
2104 if (failed_bio->bi_vcnt > 1) {
2105 /*
2106 * to fulfill b), we need to know the exact failing sectors, as
2107 * we don't want to rewrite any more than the failed ones. thus,
2108 * we need separate read requests for the failed bio
2109 *
2110 * if the following BUG_ON triggers, our validation request got
2111 * merged. we need separate requests for our algorithm to work.
2112 */
2113 BUG_ON(failrec->in_validation);
2114 failrec->in_validation = 1;
2115 failrec->this_mirror = failed_mirror;
2116 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2117 } else {
2118 /*
2119 * we're ready to fulfill a) and b) alongside. get a good copy
2120 * of the failed sector and if we succeed, we have setup
2121 * everything for repair_io_failure to do the rest for us.
2122 */
2123 if (failrec->in_validation) {
2124 BUG_ON(failrec->this_mirror != failed_mirror);
2125 failrec->in_validation = 0;
2126 failrec->this_mirror = 0;
2127 }
2128 failrec->failed_mirror = failed_mirror;
2129 failrec->this_mirror++;
2130 if (failrec->this_mirror == failed_mirror)
2131 failrec->this_mirror++;
2132 read_mode = READ_SYNC;
2133 }
2134
2135 if (!state || failrec->this_mirror > num_copies) {
2136 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2137 "next_mirror %d, failed_mirror %d\n", state,
2138 num_copies, failrec->this_mirror, failed_mirror);
2139 free_io_failure(inode, failrec, 0);
2140 return -EIO;
2141 }
2142
2143 bio = bio_alloc(GFP_NOFS, 1);
2144 bio->bi_private = state;
2145 bio->bi_end_io = failed_bio->bi_end_io;
2146 bio->bi_sector = failrec->logical >> 9;
2147 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2148 bio->bi_size = 0;
2149
2150 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2151
2152 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2153 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2154 failrec->this_mirror, num_copies, failrec->in_validation);
2155
2156 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2157 failrec->bio_flags, 0);
2158 return 0;
2159}
2160
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2161/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2162
1604/* 2163/*
@@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2256 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2257 struct extent_state *state;
1699 2258
2259 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2260 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2261 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2262 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2263
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2264 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2289 state);
1728 if (ret) 2290 if (ret)
1729 uptodate = 0; 2291 uptodate = 0;
2292 else
2293 clean_io_failure(start, page);
1730 } 2294 }
1731 if (!uptodate && tree->ops && 2295 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2296 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2298 /*
2299 * The generic bio_readpage_error handles errors the
2300 * following way: If possible, new read requests are
2301 * created and submitted and will end up in
2302 * end_bio_extent_readpage as well (if we're lucky, not
2303 * in the !uptodate case). In that case it returns 0 and
2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
1735 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
1736 uptodate = 2312 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 if (err) 2314 if (err)
@@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1740 uncache_state(&cached); 2316 uncache_state(&cached);
1741 continue; 2317 continue;
1742 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
1743 } 2326 }
1744 2327
1745 if (uptodate) { 2328 if (uptodate) {
@@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2394 mirror_num, bio_flags, start);
1812 else 2395 else
1813 submit_bio(rw, bio); 2396 submit_bio(rw, bio);
2397
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2398 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2399 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2400 bio_put(bio);
@@ -2076,16 +2660,16 @@ out:
2076} 2660}
2077 2661
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2662int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2663 get_extent_t *get_extent, int mirror_num)
2080{ 2664{
2081 struct bio *bio = NULL; 2665 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2666 unsigned long bio_flags = 0;
2083 int ret; 2667 int ret;
2084 2668
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2669 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2670 &bio_flags);
2087 if (bio) 2671 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2672 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2673 return ret;
2090} 2674}
2091 2675
@@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2720 int compressed;
2137 int write_flags; 2721 int write_flags;
2138 unsigned long nr_written = 0; 2722 unsigned long nr_written = 0;
2723 bool fill_delalloc = true;
2139 2724
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2725 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2726 write_flags = WRITE_SYNC;
@@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2730 trace___extent_writepage(page, inode, wbc);
2146 2731
2147 WARN_ON(!PageLocked(page)); 2732 WARN_ON(!PageLocked(page));
2733
2734 ClearPageError(page);
2735
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2736 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2737 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2738 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2754
2167 set_page_extent_mapped(page); 2755 set_page_extent_mapped(page);
2168 2756
2757 if (!tree->ops || !tree->ops->fill_delalloc)
2758 fill_delalloc = false;
2759
2169 delalloc_start = start; 2760 delalloc_start = start;
2170 delalloc_end = 0; 2761 delalloc_end = 0;
2171 page_started = 0; 2762 page_started = 0;
2172 if (!epd->extent_locked) { 2763 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2764 u64 delalloc_to_write = 0;
2174 /* 2765 /*
2175 * make sure the wbc mapping index is at least updated 2766 * make sure the wbc mapping index is at least updated
@@ -2421,10 +3012,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 3012 * swizzled back from swapper_space to tmpfs file
2422 * mapping 3013 * mapping
2423 */ 3014 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3015 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 3016 tree->ops->write_cache_pages_lock_hook) {
2426 else 3017 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 3018 data, flush_fn);
3019 } else {
3020 if (!trylock_page(page)) {
3021 flush_fn(data);
3022 lock_page(page);
3023 }
3024 }
2428 3025
2429 if (unlikely(page->mapping != mapping)) { 3026 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3027 unlock_page(page);
@@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3387 return -ENOMEM;
2791 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
2792 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
2793 /* 3393 /*
2794 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
2839 3439
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3441 get_extent);
2842 if (!em) 3442 if (!em)
2843 goto out; 3443 goto out;
@@ -2926,7 +3526,7 @@ out:
2926 return ret; 3526 return ret;
2927} 3527}
2928 3528
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3529inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3530 unsigned long i)
2931{ 3531{
2932 struct page *p; 3532 struct page *p;
@@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3551 return p;
2952} 3552}
2953 3553
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3554inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3555{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3556 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3557 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3804 PAGECACHE_TAG_DIRTY);
3205 } 3805 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3806 spin_unlock_irq(&page->mapping->tree_lock);
3807 ClearPageError(page);
3207 unlock_page(page); 3808 unlock_page(page);
3208 } 3809 }
3209 return 0; 3810 return 0;
@@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3950}
3350 3951
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3952int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3953 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3954 get_extent_t *get_extent, int mirror_num)
3355{ 3955{
3356 unsigned long i; 3956 unsigned long i;
@@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3986 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3987 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3988 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3989 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3990 if (!trylock_page(page))
3391 goto unlock_exit; 3991 goto unlock_exit;
3392 } else { 3992 } else {
@@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4030 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4031 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4032
3433 if (ret || !wait) 4033 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4034 return ret;
3435 4035
3436 for (i = start_i; i < num_pages; i++) { 4036 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
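
For reference, the three values the int "wait" argument of read_extent_buffer_pages() can now take, read together with the extent_io.c hunks earlier in this patch. The calls below are a hypothetical illustration; tree, eb, btree_get_extent and mirror_num stand in for whatever a real caller already has at hand:

/* WAIT_NONE: trylock each page and give up if any of them is busy */
ret = read_extent_buffer_pages(tree, eb, 0, WAIT_NONE,
			       btree_get_extent, mirror_num);

/* WAIT_PAGE_LOCK: lock the pages and submit the reads, return before IO completes */
ret = read_extent_buffer_pages(tree, eb, 0, WAIT_PAGE_LOCK,
			       btree_get_extent, mirror_num);

/* WAIT_COMPLETE: lock the pages and wait until every page has been read */
ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
			       btree_get_extent, mirror_num);
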
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..cc7492c823f3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1386,7 +1387,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1386 goto out; 1387 goto out;
1387 } 1388 }
1388 1389
1389 file_update_time(file); 1390 err = btrfs_update_time(file);
1391 if (err) {
1392 mutex_unlock(&inode->i_mutex);
1393 goto out;
1394 }
1390 BTRFS_I(inode)->sequence++; 1395 BTRFS_I(inode)->sequence++;
1391 1396
1392 start_pos = round_down(pos, root->sectorsize); 1397 start_pos = round_down(pos, root->sectorsize);
@@ -1615,10 +1620,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1620 goto out;
1616 } 1621 }
1617 1622
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1623 locked_end = alloc_end - 1;
1623 while (1) { 1624 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1625 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1665,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1665 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1666 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1667 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1668
1669 /*
1670 * Make sure we have enough space before we do the
1671 * allocation.
1672 */
1673 ret = btrfs_check_data_free_space(inode, last_byte -
1674 cur_offset);
1675 if (ret) {
1676 free_extent_map(em);
1677 break;
1678 }
1679
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1680 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1681 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1682 1 << inode->i_blkbits,
1670 offset + len, 1683 offset + len,
1671 &alloc_hint); 1684 &alloc_hint);
1685
1686 /* Let go of our reservation. */
1687 btrfs_free_reserved_data_space(inode, last_byte -
1688 cur_offset);
1672 if (ret < 0) { 1689 if (ret < 0) {
1673 free_extent_map(em); 1690 free_extent_map(em);
1674 break; 1691 break;
@@ -1694,8 +1711,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1711 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1712 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1713 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1714out:
1700 mutex_unlock(&inode->i_mutex); 1715 mutex_unlock(&inode->i_mutex);
1701 return ret; 1716 return ret;
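
The btrfs_fallocate() hunks above replace the single up-front btrfs_check_data_free_space() call covering the whole requested range with a reserve/release pair around each hole (or beyond-EOF region) that actually gets preallocated, which avoids reserving space for regions that are already allocated. Condensed sketch of the new flow; is_hole_or_past_eof stands for the extent-map test in the real hunk and error handling is trimmed:

while (cur_offset < alloc_end) {
	u64 chunk = last_byte - cur_offset;	/* one hole / prealloc region */

	if (is_hole_or_past_eof) {
		ret = btrfs_check_data_free_space(inode, chunk);
		if (ret)
			break;			/* ENOSPC for this chunk */
		ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
						chunk, 1 << inode->i_blkbits,
						offset + len, &alloc_hint);
		/* the reservation was only needed while allocating */
		btrfs_free_reserved_data_space(inode, chunk);
		if (ret < 0)
			break;
	}
	cur_offset = last_byte;
}
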
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
 426 offset = sizeof(u32) * io_ctl->num_pages;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
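
Taking the io_ctl helpers added above together: for every cache except the free-inode cache (check_crcs set), the first page of the cache file now begins with an array of one crc32 per page followed by the le64 generation, and io_ctl_set_crc()/io_ctl_check_crc() store and verify each page's checksum in that array; the crc-less case keeps the old layout of a spare u64 slot ahead of the generation. A small sketch of where the first btrfs_free_space_entry lands (the helper name is illustrative only):

static unsigned long io_ctl_header_size(int num_pages, int check_crcs)
{
	if (check_crcs)		/* [u32 crc x num_pages][le64 gen][entries...] */
		return sizeof(u32) * num_pages + sizeof(u64);
	return sizeof(u64) * 2;	/* [unused u64][le64 gen][entries...] */
}

Since the crc array has to fit in the first page (the write-out path below warns otherwise), a 4KiB page size caps the cache file at 1024 pages.
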
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
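
The rewritten load loop above reads the cache in two passes: pass one walks the fixed-size entries (le64 offset, le64 bytes, one type byte) and links them into the rbtree, queueing every bitmap entry on a local list; pass two then reads one full page of bitmap payload per queued entry, in the order the bitmap headers appeared. Very roughly, with error paths and locking from the real code omitted:

while (num_entries--) {
	e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	io_ctl_read_entry(&io_ctl, e, &type);
	link_free_space(ctl, e);
	if (type == BTRFS_FREE_SPACE_BITMAP)
		list_add_tail(&e->list, &bitmaps);	/* payload read later */
}
list_for_each_entry_safe(e, n, &bitmaps, list) {
	list_del_init(&e->list);
	io_ctl_read_bitmap(&io_ctl, e);			/* one PAGE per bitmap */
}
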
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
 822 * on mount. This will return 0 if it was successful in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
 878 printk(KERN_ERR "btrfs: failed to write free space cache " 1064 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1283{ 1470{
1284 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1285 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1286 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1287 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1288 1476
@@ -1662,7 +1850,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1851 1, 0);
1664 if (!info) { 1852 if (!info) {
1665 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1666 goto out_lock; 1860 goto out_lock;
1667 } 1861 }
1668 } 1862 }
@@ -1701,6 +1895,7 @@ again:
1701 ctl->total_bitmaps--; 1895 ctl->total_bitmaps--;
1702 } 1896 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1897 kmem_cache_free(btrfs_free_space_cachep, info);
1898 ret = 0;
1704 goto out_lock; 1899 goto out_lock;
1705 } 1900 }
1706 1901
@@ -1708,7 +1903,8 @@ again:
1708 unlink_free_space(ctl, info); 1903 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1904 info->offset += bytes;
1710 info->bytes -= bytes; 1905 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1906 ret = link_free_space(ctl, info);
1907 WARN_ON(ret);
1712 goto out_lock; 1908 goto out_lock;
1713 } 1909 }
1714 1910
@@ -2124,6 +2320,7 @@ again:
2124 2320
2125 if (!found) { 2321 if (!found) {
2126 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2127 found = true; 2324 found = true;
2128 } 2325 }
2129 2326
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2464{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2469
2273 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2471 return -ENOSPC;
2275 2472
2276 /* 2473 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2475 * is just its start offset.
2279 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2280 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2282 continue; 2486 continue;
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2491 }
2288 2492
2289 /* 2493 /*
2290 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2496 */
2294 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2498}
2323 2499
2324/* 2500/*
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2337{ 2513{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2517 u64 min_bytes;
2342 int ret; 2518 int ret;
2343 2519
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2552 goto out;
2377 } 2553 }
2378 2554
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2556 bytes, min_bytes);
2382 if (ret) 2557 if (ret)
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2647 spin_unlock(&ctl->tree_lock);
2473 2648
2474 if (bytes >= minlen) { 2649 if (bytes >= minlen) {
2475 int update_ret; 2650 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2651 int update = 0;
2477 bytes, 1, 1); 2652
2653 space_info = block_group->space_info;
2654 spin_lock(&space_info->lock);
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2478 2663
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2665 start,
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2667 &actually_trimmed);
2483 2668
2484 btrfs_add_free_space(block_group, start, bytes); 2669 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2670 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2671 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2488 2680
2489 if (ret) 2681 if (ret)
2490 break; 2682 break;
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2835 return 0;
2644 2836
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2837 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2838 if (ret) {
2839 btrfs_delalloc_release_metadata(inode, inode->i_size);
2840#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2841 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2842 "for root %llu\n", root->root_key.objectid);
2843#endif
2844 }
2649 2845
2650 iput(inode); 2846 iput(inode);
2651 return ret; 2847 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if needed
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
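
The btrfs_save_ino_cache() change above follows a save/swap/restore discipline: stash the caller's trans->block_rsv and trans->bytes_reserved, install the transaction reserve sized for the eight worst-case items, and put the originals back on every exit path. The sketch below is a generic, non-kernel model of that discipline under invented type and function names.

#include <stdio.h>

struct block_rsv { unsigned long long size; };

struct trans_handle {
	struct block_rsv *block_rsv;
	unsigned long long bytes_reserved;
};

/*
 * Run 'work' with a temporary reservation installed in the handle and
 * restore the caller's reservation no matter how 'work' exits.
 */
static int with_temporary_rsv(struct trans_handle *trans,
			      struct block_rsv *tmp_rsv,
			      unsigned long long needed,
			      int (*work)(struct trans_handle *))
{
	struct block_rsv *saved_rsv = trans->block_rsv;
	unsigned long long saved_bytes = trans->bytes_reserved;
	int ret;

	trans->block_rsv = tmp_rsv;
	trans->bytes_reserved = needed;

	ret = work(trans);	/* success or failure, restore below */

	trans->block_rsv = saved_rsv;
	trans->bytes_reserved = saved_bytes;
	return ret;
}

static int dummy_work(struct trans_handle *trans)
{
	printf("working with %llu reserved bytes\n", trans->bytes_reserved);
	return 0;
}

int main(void)
{
	struct block_rsv caller_rsv = { 1024 }, tmp_rsv = { 8 * 4096 };
	struct trans_handle trans = { &caller_rsv, 1024 };

	return with_temporary_rsv(&trans, &tmp_rsv, 8 * 4096, dummy_work);
}
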
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..13b0542015ff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -45,10 +46,10 @@
45#include "btrfs_inode.h" 46#include "btrfs_inode.h"
46#include "ioctl.h" 47#include "ioctl.h"
47#include "print-tree.h" 48#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 49#include "ordered-data.h"
50#include "xattr.h" 50#include "xattr.h"
51#include "tree-log.h" 51#include "tree-log.h"
52#include "volumes.h"
52#include "compression.h" 53#include "compression.h"
53#include "locking.h" 54#include "locking.h"
54#include "free-space-cache.h" 55#include "free-space-cache.h"
@@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 94 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 96 unsigned long *nr_written, int unlock);
97static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root, struct inode *inode);
96 99
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 100static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 101 struct inode *inode, struct inode *dir,
@@ -393,7 +396,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 396 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 397 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 398 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 399 if (!pages) {
400 /* just bail out to the uncompressed code */
401 goto cont;
402 }
397 403
398 if (BTRFS_I(inode)->force_compress) 404 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 405 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +430,7 @@ again:
424 will_compress = 1; 430 will_compress = 1;
425 } 431 }
426 } 432 }
433cont:
427 if (start == 0) { 434 if (start == 0) {
428 trans = btrfs_join_transaction(root); 435 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 436 BUG_ON(IS_ERR(trans));
@@ -820,7 +827,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 827 }
821 828
822 BUG_ON(disk_num_bytes > 829 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 830 btrfs_super_total_bytes(root->fs_info->super_copy));
824 831
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 832 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 833 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1744 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1745 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1746 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1747 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1748 BUG_ON(ret);
1742 } 1749 }
1743 goto out; 1750 goto out;
@@ -1787,17 +1794,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1794
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1795 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1796 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1797 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1798 BUG_ON(ret);
1792 } 1799 }
1793 ret = 0; 1800 ret = 0;
1794out: 1801out:
1795 if (nolock) { 1802 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1803 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1804 if (trans) {
1805 if (nolock)
1806 btrfs_end_transaction_nolock(trans, root);
1807 else
1801 btrfs_end_transaction(trans, root); 1808 btrfs_end_transaction(trans, root);
1802 } 1809 }
1803 1810
@@ -1819,153 +1826,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1826}
1820 1827
1821/* 1828/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1829 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1830 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1831 * extent_io.c will try to find good copies for us.
1969 */ 1832 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1833static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1834 struct extent_state *state)
@@ -2011,10 +1874,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1874
2012 kunmap_atomic(kaddr, KM_USER0); 1875 kunmap_atomic(kaddr, KM_USER0);
2013good: 1876good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1877 return 0;
2019 1878
2020zeroit: 1879zeroit:
@@ -2079,89 +1938,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1938 up_read(&root->fs_info->cleanup_work_sem);
2080} 1939}
2081 1940
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1941enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1942 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1943 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2023,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2023 }
2248 spin_unlock(&root->orphan_lock); 2024 spin_unlock(&root->orphan_lock);
2249 2025
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2026 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2027 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2028 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2259,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2259 /* insert an orphan item to track this unlinked/truncated file */ 2032 /* insert an orphan item to track this unlinked/truncated file */
2260 if (insert >= 1) { 2033 if (insert >= 1) {
2261 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2034 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2262 BUG_ON(ret); 2035 BUG_ON(ret && ret != -EEXIST);
2263 } 2036 }
2264 2037
2265 /* insert an orphan item to track subvolume contains orphan files */ 2038 /* insert an orphan item to track subvolume contains orphan files */
@@ -2316,6 +2089,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2089 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2090 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2091 struct inode *inode;
2092 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2093 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2094
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2095 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2141,81 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2141 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2142 * offset of the orphan item.
2369 */ 2143 */
2144
2145 if (found_key.offset == last_objectid) {
2146 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2147 "stopping orphan cleanup\n");
2148 ret = -EINVAL;
2149 goto out;
2150 }
2151
2152 last_objectid = found_key.offset;
2153
2370 found_key.objectid = found_key.offset; 2154 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2155 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2156 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2157 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2158 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2159 if (ret && ret != -ESTALE)
2376 goto out; 2160 goto out;
2377 }
2378 2161
2379 /* 2162 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2380 * add this inode to the orphan list so btrfs_orphan_del does 2163 struct btrfs_root *dead_root;
2381 * the proper thing when we hit it 2164 struct btrfs_fs_info *fs_info = root->fs_info;
2382 */ 2165 int is_dead_root = 0;
2383 spin_lock(&root->orphan_lock);
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2385 spin_unlock(&root->orphan_lock);
2386 2166
2167 /*
2168 * this is an orphan in the tree root. Currently these
2169 * could come from 2 sources:
2170 * a) a snapshot deletion in progress
2171 * b) a free space cache inode
2172 * We need to distinguish those two, as the snapshot
2173 * orphan must not get deleted.
2174 * find_dead_roots already ran before us, so if this
2175 * is a snapshot deletion, we should find the root
2176 * in the dead_roots list
2177 */
2178 spin_lock(&fs_info->trans_lock);
2179 list_for_each_entry(dead_root, &fs_info->dead_roots,
2180 root_list) {
2181 if (dead_root->root_key.objectid ==
2182 found_key.objectid) {
2183 is_dead_root = 1;
2184 break;
2185 }
2186 }
2187 spin_unlock(&fs_info->trans_lock);
2188 if (is_dead_root) {
2189 /* prevent this orphan from being found again */
2190 key.offset = found_key.objectid - 1;
2191 continue;
2192 }
2193 }
2387 /* 2194 /*
2388 * if this is a bad inode, means we actually succeeded in 2195 * Inode is already gone but the orphan item is still there,
2389 * removing the inode, but not the orphan record, which means 2196 * kill the orphan item.
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */ 2197 */
2393 if (is_bad_inode(inode)) { 2198 if (ret == -ESTALE) {
2394 trans = btrfs_start_transaction(root, 0); 2199 trans = btrfs_start_transaction(root, 1);
2395 if (IS_ERR(trans)) { 2200 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2201 ret = PTR_ERR(trans);
2397 goto out; 2202 goto out;
2398 } 2203 }
2399 btrfs_orphan_del(trans, inode); 2204 ret = btrfs_del_orphan_item(trans, root,
2205 found_key.objectid);
2206 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2207 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2208 continue;
2403 } 2209 }
2404 2210
2211 /*
2212 * add this inode to the orphan list so btrfs_orphan_del does
2213 * the proper thing when we hit it
2214 */
2215 spin_lock(&root->orphan_lock);
2216 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2217 spin_unlock(&root->orphan_lock);
2218
2405 /* if we have links, this was a truncate, lets do that */ 2219 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2220 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2221 if (!S_ISREG(inode->i_mode)) {
@@ -2410,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2410 continue; 2224 continue;
2411 } 2225 }
2412 nr_truncate++; 2226 nr_truncate++;
2227 /*
2228 * Need to hold the i_mutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2413 ret = btrfs_truncate(inode); 2233 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2414 } else { 2235 } else {
2415 nr_unlink++; 2236 nr_unlink++;
2416 } 2237 }
@@ -2420,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2241 if (ret)
2421 goto out; 2242 goto out;
2422 } 2243 }
2244 /* release the path since we're done with it */
2245 btrfs_release_path(path);
2246
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2247 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2248
2425 if (root->orphan_block_rsv) 2249 if (root->orphan_block_rsv)
@@ -2647,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2471/*
2648 * copy everything in the in-memory inode into the btree. 2472 * copy everything in the in-memory inode into the btree.
2649 */ 2473 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2474static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2475 struct btrfs_root *root, struct inode *inode)
2652{ 2476{
2653 struct btrfs_inode_item *inode_item; 2477 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2479 struct extent_buffer *leaf;
2656 int ret; 2480 int ret;
2657 2481
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2482 path = btrfs_alloc_path();
2674 if (!path) 2483 if (!path)
2675 return -ENOMEM; 2484 return -ENOMEM;
@@ -2698,6 +2507,43 @@ failed:
2698} 2507}
2699 2508
2700/* 2509/*
2510 * copy everything in the in-memory inode into the btree.
2511 */
2512noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2513 struct btrfs_root *root, struct inode *inode)
2514{
2515 int ret;
2516
2517 /*
2518 * If the inode is a free space inode, we can deadlock during commit
2519 * if we put it into the delayed code.
2520 *
2521 * The data relocation inode should also be directly updated
2522 * without delay
2523 */
2524 if (!btrfs_is_free_space_inode(root, inode)
2525 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2526 ret = btrfs_delayed_update_inode(trans, root, inode);
2527 if (!ret)
2528 btrfs_set_inode_last_trans(trans, inode);
2529 return ret;
2530 }
2531
2532 return btrfs_update_inode_item(trans, root, inode);
2533}
2534
2535static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2536 struct btrfs_root *root, struct inode *inode)
2537{
2538 int ret;
2539
2540 ret = btrfs_update_inode(trans, root, inode);
2541 if (ret == -ENOSPC)
2542 return btrfs_update_inode_item(trans, root, inode);
2543 return ret;
2544}
2545
2546/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2547 * unlink helper that gets used here in inode.c and in the tree logging
2702 * recovery code. It remove a link in a directory with a given name, and 2548 * recovery code. It remove a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2549 * also drops the back refs in the inode to the directory
@@ -2835,7 +2681,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2681 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2682 u64 dir_ino = btrfs_ino(dir);
2837 2683
2838 trans = btrfs_start_transaction(root, 10); 2684 /*
2685 * 1 for the possible orphan item
2686 * 1 for the dir item
2687 * 1 for the dir index
2688 * 1 for the inode ref
2689 * 1 for the inode ref in the tree log
2690 * 2 for the dir entries in the log
2691 * 1 for the inode
2692 */
2693 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2694 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2695 return trans;
2841 2696
@@ -2858,7 +2713,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2713 return ERR_PTR(-ENOMEM);
2859 } 2714 }
2860 2715
2861 trans = btrfs_start_transaction(root, 0); 2716 /* 1 for the orphan item */
2717 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2718 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2719 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2720 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2819,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2819 err = 0;
2964out: 2820out:
2965 btrfs_free_path(path); 2821 btrfs_free_path(path);
2822 /* Migrate the orphan reservation over */
2823 if (!err)
2824 err = btrfs_block_rsv_migrate(trans->block_rsv,
2825 &root->fs_info->global_block_rsv,
2826 trans->bytes_reserved);
2827
2966 if (err) { 2828 if (err) {
2967 btrfs_end_transaction(trans, root); 2829 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2830 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2839,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2839 struct btrfs_root *root)
2978{ 2840{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2841 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2842 btrfs_block_rsv_release(root, trans->block_rsv,
2843 trans->bytes_reserved);
2844 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2845 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2846 root->fs_info->enospc_unlink = 0;
2982 } 2847 }
@@ -3368,6 +3233,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3233 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3234 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3235 struct page *page;
3236 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3237 int ret = 0;
3372 u64 page_start; 3238 u64 page_start;
3373 u64 page_end; 3239 u64 page_end;
@@ -3380,7 +3246,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3246
3381 ret = -ENOMEM; 3247 ret = -ENOMEM;
3382again: 3248again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3249 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3250 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3251 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3252 goto out;
@@ -3501,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3501 u64 hint_byte = 0; 3367 u64 hint_byte = 0;
3502 hole_size = last_byte - cur_offset; 3368 hole_size = last_byte - cur_offset;
3503 3369
3504 trans = btrfs_start_transaction(root, 2); 3370 trans = btrfs_start_transaction(root, 3);
3505 if (IS_ERR(trans)) { 3371 if (IS_ERR(trans)) {
3506 err = PTR_ERR(trans); 3372 err = PTR_ERR(trans);
3507 break; 3373 break;
@@ -3511,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3511 cur_offset + hole_size, 3377 cur_offset + hole_size,
3512 &hint_byte, 1); 3378 &hint_byte, 1);
3513 if (err) { 3379 if (err) {
3380 btrfs_update_inode(trans, root, inode);
3514 btrfs_end_transaction(trans, root); 3381 btrfs_end_transaction(trans, root);
3515 break; 3382 break;
3516 } 3383 }
@@ -3520,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3520 0, hole_size, 0, hole_size, 3387 0, hole_size, 0, hole_size,
3521 0, 0, 0); 3388 0, 0, 0);
3522 if (err) { 3389 if (err) {
3390 btrfs_update_inode(trans, root, inode);
3523 btrfs_end_transaction(trans, root); 3391 btrfs_end_transaction(trans, root);
3524 break; 3392 break;
3525 } 3393 }
@@ -3527,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3527 btrfs_drop_extent_cache(inode, hole_start, 3395 btrfs_drop_extent_cache(inode, hole_start,
3528 last_byte - 1, 0); 3396 last_byte - 1, 0);
3529 3397
3398 btrfs_update_inode(trans, root, inode);
3530 btrfs_end_transaction(trans, root); 3399 btrfs_end_transaction(trans, root);
3531 } 3400 }
3532 free_extent_map(em); 3401 free_extent_map(em);
@@ -3544,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3544 3413
3545static int btrfs_setsize(struct inode *inode, loff_t newsize) 3414static int btrfs_setsize(struct inode *inode, loff_t newsize)
3546{ 3415{
3416 struct btrfs_root *root = BTRFS_I(inode)->root;
3417 struct btrfs_trans_handle *trans;
3547 loff_t oldsize = i_size_read(inode); 3418 loff_t oldsize = i_size_read(inode);
3548 int ret; 3419 int ret;
3549 3420
@@ -3551,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3551 return 0; 3422 return 0;
3552 3423
3553 if (newsize > oldsize) { 3424 if (newsize > oldsize) {
3554 i_size_write(inode, newsize);
3555 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3556 truncate_pagecache(inode, oldsize, newsize); 3425 truncate_pagecache(inode, oldsize, newsize);
3557 ret = btrfs_cont_expand(inode, oldsize, newsize); 3426 ret = btrfs_cont_expand(inode, oldsize, newsize);
3558 if (ret) { 3427 if (ret)
3559 btrfs_setsize(inode, oldsize);
3560 return ret; 3428 return ret;
3561 }
3562 3429
3563 mark_inode_dirty(inode); 3430 trans = btrfs_start_transaction(root, 1);
3431 if (IS_ERR(trans))
3432 return PTR_ERR(trans);
3433
3434 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root);
3564 } else { 3438 } else {
3565 3439
3566 /* 3440 /*
@@ -3600,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3600 3474
3601 if (attr->ia_valid) { 3475 if (attr->ia_valid) {
3602 setattr_copy(inode, attr); 3476 setattr_copy(inode, attr);
3603 mark_inode_dirty(inode); 3477 err = btrfs_dirty_inode(inode);
3604 3478
3605 if (attr->ia_valid & ATTR_MODE) 3479 if (!err && attr->ia_valid & ATTR_MODE)
3606 err = btrfs_acl_chmod(inode); 3480 err = btrfs_acl_chmod(inode);
3607 } 3481 }
3608 3482
@@ -3613,6 +3487,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3487{
3614 struct btrfs_trans_handle *trans; 3488 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3489 struct btrfs_root *root = BTRFS_I(inode)->root;
3490 struct btrfs_block_rsv *rsv, *global_rsv;
3491 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3492 unsigned long nr;
3617 int ret; 3493 int ret;
3618 3494
@@ -3640,22 +3516,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3516 goto no_delete;
3641 } 3517 }
3642 3518
3519 rsv = btrfs_alloc_block_rsv(root);
3520 if (!rsv) {
3521 btrfs_orphan_del(NULL, inode);
3522 goto no_delete;
3523 }
3524 rsv->size = min_size;
3525 global_rsv = &root->fs_info->global_block_rsv;
3526
3643 btrfs_i_size_write(inode, 0); 3527 btrfs_i_size_write(inode, 0);
3644 3528
3529 /*
3530 * This is a bit simpler than btrfs_truncate since
3531 *
3532 * 1) We've already reserved our space for our orphan item in the
3533 * unlink.
3534 * 2) We're going to delete the inode item, so we don't need to update
3535 * it at all.
3536 *
3537 * So we just need to reserve some slack space in case we add bytes when
3538 * doing the truncate.
3539 */
3645 while (1) { 3540 while (1) {
3646 trans = btrfs_join_transaction(root); 3541 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3542
3648 trans->block_rsv = root->orphan_block_rsv; 3543 /*
3544 * Try and steal from the global reserve since we will
3545 * likely not use this space anyway, we want to try as
3546 * hard as possible to get this to work.
3547 */
3548 if (ret)
3549 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3550
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3551 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3552 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3553 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3554 btrfs_orphan_del(NULL, inode);
3656 continue; 3555 btrfs_free_block_rsv(root, rsv);
3556 goto no_delete;
3557 }
3558
3559 trans = btrfs_start_transaction(root, 0);
3560 if (IS_ERR(trans)) {
3561 btrfs_orphan_del(NULL, inode);
3562 btrfs_free_block_rsv(root, rsv);
3563 goto no_delete;
3657 } 3564 }
3658 3565
3566 trans->block_rsv = rsv;
3567
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3568 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3569 if (ret != -EAGAIN)
3661 break; 3570 break;
@@ -3664,14 +3573,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3573 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3574 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3575 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3576 }
3669 3577
3578 btrfs_free_block_rsv(root, rsv);
3579
3670 if (ret == 0) { 3580 if (ret == 0) {
3581 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3582 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3583 BUG_ON(ret);
3673 } 3584 }
3674 3585
3586 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3587 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3588 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3589 btrfs_return_ino(root, btrfs_ino(inode));
@@ -4340,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4340 * FIXME, needs more benchmarking...there are no reasons other than performance 4252 * FIXME, needs more benchmarking...there are no reasons other than performance
4341 * to keep or drop this code. 4253 * to keep or drop this code.
4342 */ 4254 */
4343void btrfs_dirty_inode(struct inode *inode, int flags) 4255int btrfs_dirty_inode(struct inode *inode)
4344{ 4256{
4345 struct btrfs_root *root = BTRFS_I(inode)->root; 4257 struct btrfs_root *root = BTRFS_I(inode)->root;
4346 struct btrfs_trans_handle *trans; 4258 struct btrfs_trans_handle *trans;
4347 int ret; 4259 int ret;
4348 4260
4349 if (BTRFS_I(inode)->dummy_inode) 4261 if (BTRFS_I(inode)->dummy_inode)
4350 return; 4262 return 0;
4351 4263
4352 trans = btrfs_join_transaction(root); 4264 trans = btrfs_join_transaction(root);
4353 BUG_ON(IS_ERR(trans)); 4265 if (IS_ERR(trans))
4266 return PTR_ERR(trans);
4354 4267
4355 ret = btrfs_update_inode(trans, root, inode); 4268 ret = btrfs_update_inode(trans, root, inode);
4356 if (ret && ret == -ENOSPC) { 4269 if (ret && ret == -ENOSPC) {
4357 /* whoops, lets try again with the full transaction */ 4270 /* whoops, lets try again with the full transaction */
4358 btrfs_end_transaction(trans, root); 4271 btrfs_end_transaction(trans, root);
4359 trans = btrfs_start_transaction(root, 1); 4272 trans = btrfs_start_transaction(root, 1);
4360 if (IS_ERR(trans)) { 4273 if (IS_ERR(trans))
4361 printk_ratelimited(KERN_ERR "btrfs: fail to " 4274 return PTR_ERR(trans);
4362 "dirty inode %llu error %ld\n",
4363 (unsigned long long)btrfs_ino(inode),
4364 PTR_ERR(trans));
4365 return;
4366 }
4367 4275
4368 ret = btrfs_update_inode(trans, root, inode); 4276 ret = btrfs_update_inode(trans, root, inode);
4369 if (ret) {
4370 printk_ratelimited(KERN_ERR "btrfs: fail to "
4371 "dirty inode %llu error %d\n",
4372 (unsigned long long)btrfs_ino(inode),
4373 ret);
4374 }
4375 } 4277 }
4376 btrfs_end_transaction(trans, root); 4278 btrfs_end_transaction(trans, root);
4377 if (BTRFS_I(inode)->delayed_node) 4279 if (BTRFS_I(inode)->delayed_node)
4378 btrfs_balance_delayed_items(root); 4280 btrfs_balance_delayed_items(root);
4281
4282 return ret;
4283}
4284
4285/*
4286 * This is a copy of file_update_time. We need it so that we can return an
4287 * error on ENOSPC when updating the inode for file writes and mmap writes.
4288 */
4289int btrfs_update_time(struct file *file)
4290{
4291 struct inode *inode = file->f_path.dentry->d_inode;
4292 struct timespec now;
4293 int ret;
4294 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4295
4296 /* First try to exhaust all avenues to not sync */
4297 if (IS_NOCMTIME(inode))
4298 return 0;
4299
4300 now = current_fs_time(inode->i_sb);
4301 if (!timespec_equal(&inode->i_mtime, &now))
4302 sync_it = S_MTIME;
4303
4304 if (!timespec_equal(&inode->i_ctime, &now))
4305 sync_it |= S_CTIME;
4306
4307 if (IS_I_VERSION(inode))
4308 sync_it |= S_VERSION;
4309
4310 if (!sync_it)
4311 return 0;
4312
4313 /* Finally allowed to write? Takes lock. */
4314 if (mnt_want_write_file(file))
4315 return 0;
4316
4317 /* Only change inode inside the lock region */
4318 if (sync_it & S_VERSION)
4319 inode_inc_iversion(inode);
4320 if (sync_it & S_CTIME)
4321 inode->i_ctime = now;
4322 if (sync_it & S_MTIME)
4323 inode->i_mtime = now;
4324 ret = btrfs_dirty_inode(inode);
4325 if (!ret)
4326 mark_inode_dirty_sync(inode);
4327 mnt_drop_write(file->f_path.mnt);
4328 return ret;
4379} 4329}
4380 4330
4381/* 4331/*
@@ -4640,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4640 int err = btrfs_add_link(trans, dir, inode, 4590 int err = btrfs_add_link(trans, dir, inode,
4641 dentry->d_name.name, dentry->d_name.len, 4591 dentry->d_name.name, dentry->d_name.len,
4642 backref, index); 4592 backref, index);
4643 if (!err) {
4644 d_instantiate(dentry, inode);
4645 return 0;
4646 }
4647 if (err > 0) 4593 if (err > 0)
4648 err = -EEXIST; 4594 err = -EEXIST;
4649 return err; 4595 return err;
@@ -4691,13 +4637,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4691 goto out_unlock; 4637 goto out_unlock;
4692 } 4638 }
4693 4639
4640 /*
4641 * If the active LSM wants to access the inode during
4642 * d_instantiate it needs these. Smack checks to see
4643 * if the filesystem supports xattrs by looking at the
4644 * ops vector.
4645 */
4646
4647 inode->i_op = &btrfs_special_inode_operations;
4694 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4648 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4695 if (err) 4649 if (err)
4696 drop_inode = 1; 4650 drop_inode = 1;
4697 else { 4651 else {
4698 inode->i_op = &btrfs_special_inode_operations;
4699 init_special_inode(inode, inode->i_mode, rdev); 4652 init_special_inode(inode, inode->i_mode, rdev);
4700 btrfs_update_inode(trans, root, inode); 4653 btrfs_update_inode(trans, root, inode);
4654 d_instantiate(dentry, inode);
4701 } 4655 }
4702out_unlock: 4656out_unlock:
4703 nr = trans->blocks_used; 4657 nr = trans->blocks_used;
@@ -4749,15 +4703,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4749 goto out_unlock; 4703 goto out_unlock;
4750 } 4704 }
4751 4705
4706 /*
4707 * If the active LSM wants to access the inode during
4708 * d_instantiate it needs these. Smack checks to see
4709 * if the filesystem supports xattrs by looking at the
4710 * ops vector.
4711 */
4712 inode->i_fop = &btrfs_file_operations;
4713 inode->i_op = &btrfs_file_inode_operations;
4714
4752 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4715 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4753 if (err) 4716 if (err)
4754 drop_inode = 1; 4717 drop_inode = 1;
4755 else { 4718 else {
4756 inode->i_mapping->a_ops = &btrfs_aops; 4719 inode->i_mapping->a_ops = &btrfs_aops;
4757 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4720 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4758 inode->i_fop = &btrfs_file_operations;
4759 inode->i_op = &btrfs_file_inode_operations;
4760 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4721 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4722 d_instantiate(dentry, inode);
4761 } 4723 }
4762out_unlock: 4724out_unlock:
4763 nr = trans->blocks_used; 4725 nr = trans->blocks_used;
@@ -4815,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4815 struct dentry *parent = dentry->d_parent; 4777 struct dentry *parent = dentry->d_parent;
4816 err = btrfs_update_inode(trans, root, inode); 4778 err = btrfs_update_inode(trans, root, inode);
4817 BUG_ON(err); 4779 BUG_ON(err);
4780 d_instantiate(dentry, inode);
4818 btrfs_log_new_name(trans, inode, NULL, parent); 4781 btrfs_log_new_name(trans, inode, NULL, parent);
4819 } 4782 }
4820 4783
@@ -5795,8 +5758,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5758 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5759 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5760 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5761 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5762 goto out;
5801 } 5763 }
5802 5764
@@ -5834,7 +5796,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5796 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5797 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5798 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5799 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5800 ret = 0;
5839out_unlock: 5801out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5802 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6251,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6251{
6290 struct extent_io_tree *tree; 6252 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6253 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6254 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6255}
6294 6256
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6257static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6440,7 +6402,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6440 u64 page_start; 6402 u64 page_start;
6441 u64 page_end; 6403 u64 page_end;
6442 6404
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6443 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file);
6444 if (ret) { 6411 if (ret) {
6445 if (ret == -ENOMEM) 6412 if (ret == -ENOMEM)
6446 ret = VM_FAULT_OOM; 6413 ret = VM_FAULT_OOM;
@@ -6541,6 +6508,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6508 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6509 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6510 u64 mask = root->sectorsize - 1;
6511 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6512
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6513 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6514 if (ret)
@@ -6588,19 +6556,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6556 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6557 if (!rsv)
6590 return -ENOMEM; 6558 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6559 rsv->size = min_size;
6592 6560
6561 /*
6562 * 1 for the truncate slack space
6563 * 1 for the orphan item we're going to add
6564 * 1 for the orphan item deletion
6565 * 1 for updating the inode.
6566 */
6593 trans = btrfs_start_transaction(root, 4); 6567 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6568 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6569 err = PTR_ERR(trans);
6596 goto out; 6570 goto out;
6597 } 6571 }
6598 6572
6599 /* 6573 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6574 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6575 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6576 BUG_ON(ret);
6605 6577
6606 ret = btrfs_orphan_add(trans, inode); 6578 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6581,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6581 goto out;
6610 } 6582 }
6611 6583
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6584 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6585 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6586 * but that is only tested during the last file release. That
@@ -6645,20 +6602,31 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6602 btrfs_add_ordered_operation(trans, root, inode);
6646 6603
6647 while (1) { 6604 while (1) {
6605 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6606 if (ret) {
6607 /*
6608 * This can only happen with the original transaction we
6609 * started above, every other time we shouldn't have a
6610 * transaction started yet.
6611 */
6612 if (ret == -EAGAIN)
6613 goto end_trans;
6614 err = ret;
6615 break;
6616 }
6617
6648 if (!trans) { 6618 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6619 /* Just need the 1 for updating the inode */
6620 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6621 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6622 ret = err = PTR_ERR(trans);
6652 goto out; 6623 trans = NULL;
6624 break;
6653 } 6625 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6626 }
6661 6627
6628 trans->block_rsv = rsv;
6629
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6630 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6631 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6632 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6641,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6641 err = ret;
6674 break; 6642 break;
6675 } 6643 }
6676 6644end_trans:
6677 nr = trans->blocks_used; 6645 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6646 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6647 trans = NULL;
@@ -6693,14 +6661,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6661 ret = btrfs_orphan_del(NULL, inode);
6694 } 6662 }
6695 6663
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6664 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6665 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6666 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6667 if (ret && !err)
6668 err = ret;
6700 6669
6701 nr = trans->blocks_used; 6670 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6671 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6672 btrfs_btree_balance_dirty(root, nr);
6673 }
6704 6674
6705out: 6675out:
6706 btrfs_free_block_rsv(root, rsv); 6676 btrfs_free_block_rsv(root, rsv);
@@ -6755,9 +6725,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6725 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6726 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6727 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6728 ei->disk_i_size = 0;
6760 ei->flags = 0; 6729 ei->flags = 0;
6730 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6731 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6732 ei->last_unlink_trans = 0;
6763 6733
@@ -6769,6 +6739,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6739 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6740 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6741 ei->in_defrag = 0;
6742 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6743 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6744
6774 ei->delayed_node = NULL; 6745 ei->delayed_node = NULL;
@@ -6803,6 +6774,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6774 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6775 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6776 WARN_ON(BTRFS_I(inode)->reserved_extents);
6777 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6778 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6779
6807 /* 6780 /*
6808 * This can happen where we create an inode, but somebody else also 6781 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6899,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6899 struct dentry *dentry, struct kstat *stat)
6927{ 6900{
6928 struct inode *inode = dentry->d_inode; 6901 struct inode *inode = dentry->d_inode;
6902 u32 blocksize = inode->i_sb->s_blocksize;
6903
6929 generic_fillattr(inode, stat); 6904 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6905 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6906 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6907 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6908 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6909 return 0;
6935} 6910}
6936 6911
@@ -7206,14 +7181,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7206 goto out_unlock; 7181 goto out_unlock;
7207 } 7182 }
7208 7183
7184 /*
7185 * If the active LSM wants to access the inode during
7186 * d_instantiate it needs these. Smack checks to see
7187 * if the filesystem supports xattrs by looking at the
7188 * ops vector.
7189 */
7190 inode->i_fop = &btrfs_file_operations;
7191 inode->i_op = &btrfs_file_inode_operations;
7192
7209 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7193 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7210 if (err) 7194 if (err)
7211 drop_inode = 1; 7195 drop_inode = 1;
7212 else { 7196 else {
7213 inode->i_mapping->a_ops = &btrfs_aops; 7197 inode->i_mapping->a_ops = &btrfs_aops;
7214 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 7198 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7215 inode->i_fop = &btrfs_file_operations;
7216 inode->i_op = &btrfs_file_inode_operations;
7217 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7199 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7218 } 7200 }
7219 if (drop_inode) 7201 if (drop_inode)
@@ -7262,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7262 drop_inode = 1; 7244 drop_inode = 1;
7263 7245
7264out_unlock: 7246out_unlock:
7247 if (!err)
7248 d_instantiate(dentry, inode);
7265 nr = trans->blocks_used; 7249 nr = trans->blocks_used;
7266 btrfs_end_transaction_throttle(trans, root); 7250 btrfs_end_transaction_throttle(trans, root);
7267 if (drop_inode) { 7251 if (drop_inode) {
@@ -7420,7 +7404,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7404 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7405 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7406 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7407 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7408 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7409 .merge_extent_hook = btrfs_merge_extent_hook,
@@ -7484,6 +7467,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7484 .follow_link = page_follow_link_light, 7467 .follow_link = page_follow_link_light,
7485 .put_link = page_put_link, 7468 .put_link = page_put_link,
7486 .getattr = btrfs_getattr, 7469 .getattr = btrfs_getattr,
7470 .setattr = btrfs_setattr,
7487 .permission = btrfs_permission, 7471 .permission = btrfs_permission,
7488 .setxattr = btrfs_setxattr, 7472 .setxattr = btrfs_setxattr,
7489 .getxattr = btrfs_getxattr, 7473 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
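With this change only the compression-related flags and NODATACOW are inherited from the parent directory. For example, a file created in a directory that has BTRFS_INODE_NOCOMPRESS set ends up with NOCOMPRESS set and COMPRESS cleared, a file created under a COMPRESS directory gets COMPRESS with NOCOMPRESS cleared, and a parent NODATACOW flag is copied as well; other directory flags such as NOATIME, NODUMP or DIRSYNC are no longer copied wholesale onto new inodes as they were by the old assignment.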
@@ -246,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
246 trans = btrfs_join_transaction(root); 252 trans = btrfs_join_transaction(root);
247 BUG_ON(IS_ERR(trans)); 253 BUG_ON(IS_ERR(trans));
248 254
255 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME;
249 ret = btrfs_update_inode(trans, root, inode); 257 ret = btrfs_update_inode(trans, root, inode);
250 BUG_ON(ret); 258 BUG_ON(ret);
251 259
252 btrfs_update_iflags(inode);
253 inode->i_ctime = CURRENT_TIME;
254 btrfs_end_transaction(trans, root); 260 btrfs_end_transaction(trans, root);
255 261
256 mnt_drop_write(file->f_path.mnt); 262 mnt_drop_write(file->f_path.mnt);
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
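The added checks clamp the requested trim range to the size recorded in the superblock. For example (illustrative numbers only): on a filesystem whose total_bytes is 100 GiB, a request starting at 200 GiB now fails with -EINVAL instead of being forwarded to btrfs_trim_fs, and a request with start 90 GiB and len 50 GiB is quietly reduced to a 10 GiB trim, since range.len becomes min(50 GiB, 100 GiB - 90 GiB).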
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
849 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
850 860
861 mutex_lock(&inode->i_mutex);
851 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
852 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
853 if (ret) 865 if (ret)
854 return ret; 866 return ret;
855again: 867again:
@@ -860,7 +872,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 872 for (i = 0; i < num_pages; i++) {
861 struct page *page; 873 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 874 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 875 start_index + i, mask);
864 if (!page) 876 if (!page)
865 break; 877 break;
866 878
@@ -972,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 984 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 985 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 986 unsigned long last_index;
987 u64 isize = i_size_read(inode);
975 u64 features; 988 u64 features;
976 u64 last_len = 0; 989 u64 last_len = 0;
977 u64 skip = 0; 990 u64 skip = 0;
978 u64 defrag_end = 0; 991 u64 defrag_end = 0;
979 u64 newer_off = range->start; 992 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 993 unsigned long i;
994 unsigned long ra_index = 0;
982 int ret; 995 int ret;
983 int defrag_count = 0; 996 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 997 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 998 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 999 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1000 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 1001 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1002 struct page **pages = NULL;
989 1003
@@ -997,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1011 compress_type = range->compress_type;
998 } 1012 }
999 1013
1000 if (inode->i_size == 0) 1014 if (isize == 0)
1001 return 0; 1015 return 0;
1002 1016
1003 /* 1017 /*
@@ -1013,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1027 ra = &file->f_ra;
1014 } 1028 }
1015 1029
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1030 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1031 GFP_NOFS);
1018 if (!pages) { 1032 if (!pages) {
1019 ret = -ENOMEM; 1033 ret = -ENOMEM;
@@ -1022,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1036
1023 /* find the last page to defrag */ 1037 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1038 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1039 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1040 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1041 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1042 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1043 }
1030 1044
1031 if (newer_than) { 1045 if (newer_than) {
@@ -1038,14 +1052,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1052 * the extents in the file evenly spaced
1039 */ 1053 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1054 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1055 } else
1043 goto out_ra; 1056 goto out_ra;
1044 } else { 1057 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1058 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1059 }
1047 if (!max_to_defrag) 1060 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1061 max_to_defrag = last_index;
1049 1062
1050 /* 1063 /*
1051 * make writeback starts from i, so the defrag range can be 1064 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1092 i = max(i + 1, next);
1080 continue; 1093 continue;
1081 } 1094 }
1095
1096 if (!newer_than) {
1097 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1098 PAGE_CACHE_SHIFT) - i;
1099 cluster = min(cluster, max_cluster);
1100 } else {
1101 cluster = max_cluster;
1102 }
1103
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1104 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1105 BTRFS_I(inode)->force_compress = compress_type;
1084 1106
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1107 if (i + cluster > ra_index) {
1108 ra_index = max(i, ra_index);
1109 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1110 cluster);
1111 ra_index += max_cluster;
1112 }
1086 1113
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1114 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1115 if (ret < 0)
1089 goto out_ra; 1116 goto out_ra;
1090 1117
1091 defrag_count += ret; 1118 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1119 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1120
1095 if (newer_than) { 1121 if (newer_than) {
1096 if (newer_off == (u64)-1) 1122 if (newer_off == (u64)-1)
@@ -1105,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1131 if (!ret) {
1106 range->start = newer_off; 1132 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1133 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1134 } else {
1110 break; 1135 break;
1111 } 1136 }
1112 } else { 1137 } else {
1113 i++; 1138 if (ret > 0) {
1139 i += ret;
1140 last_len += ret << PAGE_CACHE_SHIFT;
1141 } else {
1142 i++;
1143 last_len = 0;
1144 }
1114 } 1145 }
1115 } 1146 }
1116 1147
@@ -1136,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1167 mutex_unlock(&inode->i_mutex);
1137 } 1168 }
1138 1169
1139 disk_super = &root->fs_info->super_copy; 1170 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1171 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1172 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1173 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1174 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1175 }
1145 1176
1146 if (!file) 1177 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1178
1150out_ra: 1179out_ra:
1151 if (!file) 1180 if (!file)
@@ -1189,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1218 *devstr = '\0';
1190 devstr = vol_args->name; 1219 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1220 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1221 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1222 (unsigned long long)devid);
1194 } 1223 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1224 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1225 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1227 (unsigned long long)devid);
1199 ret = -EINVAL; 1228 ret = -EINVAL;
1200 goto out_unlock; 1229 goto out_unlock;
@@ -1240,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1269 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1270 new_size *= root->sectorsize;
1242 1271
1243 printk(KERN_INFO "new size for %s is %llu\n", 1272 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1273 device->name, (unsigned long long)new_size);
1245 1274
1246 if (new_size > old_size) { 1275 if (new_size > old_size) {
@@ -1251,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1251 } 1280 }
1252 ret = btrfs_grow_device(trans, device, new_size); 1281 ret = btrfs_grow_device(trans, device, new_size);
1253 btrfs_commit_transaction(trans, root); 1282 btrfs_commit_transaction(trans, root);
1254 } else { 1283 } else if (new_size < old_size) {
1255 ret = btrfs_shrink_device(device, new_size); 1284 ret = btrfs_shrink_device(device, new_size);
1256 } 1285 }
1257 1286
@@ -2587,7 +2616,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2616 return PTR_ERR(trans);
2588 } 2617 }
2589 2618
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2619 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2620 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2621 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2622 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2632,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2632 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2633 btrfs_free_path(path);
2605 2634
2606 disk_super = &root->fs_info->super_copy; 2635 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2636 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2637 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2638 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2893,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2893 return ret;
2865} 2894}
2866 2895
2896static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2897{
2898 int ret = 0;
2899 int i;
2900 u64 rel_ptr;
2901 int size;
2902 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2903 struct inode_fs_paths *ipath = NULL;
2904 struct btrfs_path *path;
2905
2906 if (!capable(CAP_SYS_ADMIN))
2907 return -EPERM;
2908
2909 path = btrfs_alloc_path();
2910 if (!path) {
2911 ret = -ENOMEM;
2912 goto out;
2913 }
2914
2915 ipa = memdup_user(arg, sizeof(*ipa));
2916 if (IS_ERR(ipa)) {
2917 ret = PTR_ERR(ipa);
2918 ipa = NULL;
2919 goto out;
2920 }
2921
2922 size = min_t(u32, ipa->size, 4096);
2923 ipath = init_ipath(size, root, path);
2924 if (IS_ERR(ipath)) {
2925 ret = PTR_ERR(ipath);
2926 ipath = NULL;
2927 goto out;
2928 }
2929
2930 ret = paths_from_inode(ipa->inum, ipath);
2931 if (ret < 0)
2932 goto out;
2933
2934 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2935 rel_ptr = ipath->fspath->val[i] -
2936 (u64)(unsigned long)ipath->fspath->val;
2937 ipath->fspath->val[i] = rel_ptr;
2938 }
2939
2940 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2941 (void *)(unsigned long)ipath->fspath, size);
2942 if (ret) {
2943 ret = -EFAULT;
2944 goto out;
2945 }
2946
2947out:
2948 btrfs_free_path(path);
2949 free_ipath(ipath);
2950 kfree(ipa);
2951
2952 return ret;
2953}
2954
2955static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2956{
2957 struct btrfs_data_container *inodes = ctx;
2958 const size_t c = 3 * sizeof(u64);
2959
2960 if (inodes->bytes_left >= c) {
2961 inodes->bytes_left -= c;
2962 inodes->val[inodes->elem_cnt] = inum;
2963 inodes->val[inodes->elem_cnt + 1] = offset;
2964 inodes->val[inodes->elem_cnt + 2] = root;
2965 inodes->elem_cnt += 3;
2966 } else {
2967 inodes->bytes_missing += c - inodes->bytes_left;
2968 inodes->bytes_left = 0;
2969 inodes->elem_missed += 3;
2970 }
2971
2972 return 0;
2973}
2974
2975static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2976 void __user *arg)
2977{
2978 int ret = 0;
2979 int size;
2980 u64 extent_offset;
2981 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL;
2984 struct btrfs_key key;
2985
2986 if (!capable(CAP_SYS_ADMIN))
2987 return -EPERM;
2988
2989 loi = memdup_user(arg, sizeof(*loi));
2990 if (IS_ERR(loi)) {
2991 ret = PTR_ERR(loi);
2992 loi = NULL;
2993 goto out;
2994 }
2995
2996 path = btrfs_alloc_path();
2997 if (!path) {
2998 ret = -ENOMEM;
2999 goto out;
3000 }
3001
3002 size = min_t(u32, loi->size, 4096);
3003 inodes = init_data_container(size);
3004 if (IS_ERR(inodes)) {
3005 ret = PTR_ERR(inodes);
3006 inodes = NULL;
3007 goto out;
3008 }
3009
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3011
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT;
3014 if (ret < 0)
3015 goto out;
3016
3017 extent_offset = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes);
3020
3021 if (ret < 0)
3022 goto out;
3023
3024 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3025 (void *)(unsigned long)inodes, size);
3026 if (ret)
3027 ret = -EFAULT;
3028
3029out:
3030 btrfs_free_path(path);
3031 kfree(inodes);
3032 kfree(loi);
3033
3034 return ret;
3035}
3036
2867long btrfs_ioctl(struct file *file, unsigned int 3037long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3038 cmd, unsigned long arg)
2869{ 3039{
@@ -2921,6 +3091,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3091 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3092 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3093 return btrfs_ioctl_ino_lookup(file, argp);
3094 case BTRFS_IOC_INO_PATHS:
3095 return btrfs_ioctl_ino_to_path(root, argp);
3096 case BTRFS_IOC_LOGICAL_INO:
3097 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3098 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3099 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3100 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
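A rough userspace sketch of how the new BTRFS_IOC_INO_PATHS ioctl might be driven against the structures above (illustrative only, not part of this commit; includes and error handling are simplified, fs_fd is an open descriptor for any file or directory on the filesystem, and the buffer size mirrors the 4096-byte cap applied in btrfs_ioctl_ino_to_path):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include "ioctl.h"                 /* the btrfs ioctl ABI shown above */

    static int print_paths_of_inode(int fs_fd, __u64 inum)
    {
            struct btrfs_ioctl_ino_path_args ipa;
            struct btrfs_data_container *fspath;
            __u32 i;

            fspath = calloc(1, 4096);
            if (!fspath)
                    return -1;

            memset(&ipa, 0, sizeof(ipa));
            ipa.inum = inum;                           /* inode number to resolve */
            ipa.size = 4096;                           /* kernel caps this at 4096 */
            ipa.fspath = (__u64)(unsigned long)fspath; /* user buffer for the result */

            if (ioctl(fs_fd, BTRFS_IOC_INO_PATHS, &ipa) < 0) {
                    perror("BTRFS_IOC_INO_PATHS");
                    free(fspath);
                    return -1;
            }

            /*
             * After the rel_ptr conversion in btrfs_ioctl_ino_to_path, val[i]
             * holds the offset of path string i from the start of val[].
             */
            for (i = 0; i < fspath->elem_cnt; i++)
                    printf("%s\n", (char *)fspath->val + fspath->val[i]);

            if (fspath->bytes_missing)
                    fprintf(stderr, "result truncated, %u more bytes needed\n",
                            fspath->bytes_missing);

            free(fspath);
            return 0;
    }

BTRFS_IOC_LOGICAL_INO fills the same btrfs_data_container, but via build_ino_list each hit occupies three consecutive val[] entries (inode number, file offset, root), so a caller would walk elem_cnt in steps of three.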
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its own read pointer and all disks will be utilized in parallel.
49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coalesce them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
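A minimal sketch of how a kernel-side caller could use the interface above to prefetch an entire tree and wait for the readahead to complete (illustrative only; the key range and error handling are simplified):

    struct reada_control *rc;
    struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
    struct btrfs_key key_end = {
            .objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
    };

    rc = btrfs_reada_add(root, &key_start, &key_end);
    if (!IS_ERR(rc))
            btrfs_reada_wait(rc);  /* or btrfs_reada_detach(rc) to let it finish in the background */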
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2946 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2947 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2948 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2949 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2950 if (ret) 2953 if (ret)
2951 goto out; 2954 goto out;
2952 2955
@@ -2956,7 +2959,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2959 ra, NULL, index,
2957 last_index + 1 - index); 2960 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2961 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2962 mask);
2960 if (!page) { 2963 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2964 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2965 PAGE_CACHE_SIZE);
@@ -3323,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3326 }
3324 3327
3325 key.objectid = ref_objectid; 3328 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3329 key.type = BTRFS_EXTENT_DATA_KEY;
3330 if (ref_offset > ((u64)-1 << 32))
3331 key.offset = 0;
3332 else
3333 key.offset = ref_offset;
3328 3334
3329 path->search_commit_root = 1; 3335 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3336 path->skip_locking = 1;
@@ -3645,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3651 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3652 * is no reservation in transaction handle.
3647 */ 3653 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3654 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3655 rc->extent_root->nodesize * 256);
3650 if (ret) 3656 if (ret)
3651 return ret; 3657 return ret;
3652 3658
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3659 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3660 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3661 rc->extents_found = 0;
@@ -3777,8 +3780,7 @@ restart:
3777 } 3780 }
3778 } 3781 }
3779 3782
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3783 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3784 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3785 if (ret != -EAGAIN) {
3784 err = ret; 3786 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..ddf2c90d3fc0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
264 ret = paths_from_inode(inum, ipath);
265
266 if (ret < 0)
267 goto err;
268
269 /*
 270	 * we deliberately ignore the fact that ipath might have been too small to
271 * hold all of the paths here
272 */
273 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
274 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
275 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
276 "length %llu, links %u (path: %s)\n", swarn->errstr,
277 swarn->logical, swarn->dev->name,
278 (unsigned long long)swarn->sector, root, inum, offset,
279 min(isize - offset, (u64)PAGE_SIZE), nlink,
280 (char *)(unsigned long)ipath->fspath->val[i]);
281
282 free_ipath(ipath);
283 return 0;
284
285err:
286 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
287 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
288 "resolving failed with ret=%d\n", swarn->errstr,
289 swarn->logical, swarn->dev->name,
290 (unsigned long long)swarn->sector, root, inum, offset, ret);
291
292 free_ipath(ipath);
293 return 0;
294}
295
296static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
297 int ix)
298{
299 struct btrfs_device *dev = sbio->sdev->dev;
300 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
301 struct btrfs_path *path;
302 struct btrfs_key found_key;
303 struct extent_buffer *eb;
304 struct btrfs_extent_item *ei;
305 struct scrub_warning swarn;
306 u32 item_size;
307 int ret;
308 u64 ref_root;
309 u8 ref_level;
310 unsigned long ptr = 0;
311 const int bufsize = 4096;
312 u64 extent_offset;
313
314 path = btrfs_alloc_path();
315
316 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
317 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
319 swarn.logical = sbio->logical + ix * PAGE_SIZE;
320 swarn.errstr = errstr;
321 swarn.dev = dev;
322 swarn.msg_bufsize = bufsize;
323 swarn.scratch_bufsize = bufsize;
324
325 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
326 goto out;
327
328 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
329 if (ret < 0)
330 goto out;
331
332 extent_offset = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset;
334
335 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]);
338
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do {
341 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
342 &ref_root, &ref_level);
343 printk(KERN_WARNING "%s at logical %llu on dev %s, "
344 "sector %llu: metadata %s (level %d) in tree "
345 "%llu\n", errstr, swarn.logical, dev->name,
346 (unsigned long long)swarn.sector,
347 ref_level ? "node" : "leaf",
348 ret < 0 ? -1 : ref_level,
349 ret < 0 ? -1 : ref_root);
350 } while (ret != 1);
351 } else {
352 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset,
355 scrub_print_warning_inode, &swarn);
356 }
357
358out:
359 btrfs_free_path(path);
360 kfree(swarn.scratch_buf);
361 kfree(swarn.msg_buf);
362}
363
364static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
365{
366 struct page *page = NULL;
367 unsigned long index;
368 struct scrub_fixup_nodatasum *fixup = ctx;
369 int ret;
370 int corrected = 0;
371 struct btrfs_key key;
372 struct inode *inode = NULL;
373 u64 end = offset + PAGE_SIZE - 1;
374 struct btrfs_root *local_root;
375
376 key.objectid = root;
377 key.type = BTRFS_ROOT_ITEM_KEY;
378 key.offset = (u64)-1;
379 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
380 if (IS_ERR(local_root))
381 return PTR_ERR(local_root);
382
383 key.type = BTRFS_INODE_ITEM_KEY;
384 key.objectid = inum;
385 key.offset = 0;
386 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
387 if (IS_ERR(inode))
388 return PTR_ERR(inode);
389
390 index = offset >> PAGE_CACHE_SHIFT;
391
392 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
393 if (!page) {
394 ret = -ENOMEM;
395 goto out;
396 }
397
398 if (PageUptodate(page)) {
399 struct btrfs_mapping_tree *map_tree;
400 if (PageDirty(page)) {
401 /*
 402	 * we need to write the data to the defective sector. the
403 * data that was in that sector is not in memory,
404 * because the page was modified. we must not write the
405 * modified page to that sector.
406 *
407 * TODO: what could be done here: wait for the delalloc
408 * runner to write out that page (might involve
409 * COW) and see whether the sector is still
410 * referenced afterwards.
411 *
 412	 * For the time being, we'll treat this error as
 413	 * uncorrectable, although there is a chance that a
 414	 * later scrub will find the bad sector again and that
 415	 * there will be no dirty page in memory by then.
416 */
417 ret = -EIO;
418 goto out;
419 }
420 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
421 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
422 fixup->logical, page,
423 fixup->mirror_num);
424 unlock_page(page);
425 corrected = !ret;
426 } else {
427 /*
428 * we need to get good data first. the general readpage path
429 * will call repair_io_failure for us, we just have to make
430 * sure we read the bad mirror.
431 */
432 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
433 EXTENT_DAMAGED, GFP_NOFS);
434 if (ret) {
435 /* set_extent_bits should give proper error */
436 WARN_ON(ret > 0);
437 if (ret > 0)
438 ret = -EFAULT;
439 goto out;
440 }
441
442 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
443 btrfs_get_extent,
444 fixup->mirror_num);
445 wait_on_page_locked(page);
446
447 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
448 end, EXTENT_DAMAGED, 0, NULL);
449 if (!corrected)
450 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
451 EXTENT_DAMAGED, GFP_NOFS);
452 }
453
454out:
455 if (page)
456 put_page(page);
457 if (inode)
458 iput(inode);
459
460 if (ret < 0)
461 return ret;
462
463 if (ret == 0 && corrected) {
464 /*
465 * we only need to call readpage for one of the inodes belonging
466 * to this extent. so make iterate_extent_inodes stop
467 */
468 return 1;
469 }
470
471 return -EIO;
472}
473
474static void scrub_fixup_nodatasum(struct btrfs_work *work)
475{
476 int ret;
477 struct scrub_fixup_nodatasum *fixup;
478 struct scrub_dev *sdev;
479 struct btrfs_trans_handle *trans = NULL;
480 struct btrfs_fs_info *fs_info;
481 struct btrfs_path *path;
482 int uncorrectable = 0;
483
484 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
485 sdev = fixup->sdev;
486 fs_info = fixup->root->fs_info;
487
488 path = btrfs_alloc_path();
489 if (!path) {
490 spin_lock(&sdev->stat_lock);
491 ++sdev->stat.malloc_errors;
492 spin_unlock(&sdev->stat_lock);
493 uncorrectable = 1;
494 goto out;
495 }
496
497 trans = btrfs_join_transaction(fixup->root);
498 if (IS_ERR(trans)) {
499 uncorrectable = 1;
500 goto out;
501 }
502
503 /*
504 * the idea is to trigger a regular read through the standard path. we
505 * read a page from the (failed) logical address by specifying the
506 * corresponding copynum of the failed sector. thus, that readpage is
507 * expected to fail.
508 * that is the point where on-the-fly error correction will kick in
509 * (once it's finished) and rewrite the failed sector if a good copy
510 * can be found.
511 */
512 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
513 path, scrub_fixup_readpage,
514 fixup);
515 if (ret < 0) {
516 uncorrectable = 1;
517 goto out;
518 }
519 WARN_ON(ret != 1);
520
521 spin_lock(&sdev->stat_lock);
522 ++sdev->stat.corrected_errors;
523 spin_unlock(&sdev->stat_lock);
524
525out:
526 if (trans && !IS_ERR(trans))
527 btrfs_end_transaction(trans, fixup->root);
528 if (uncorrectable) {
529 spin_lock(&sdev->stat_lock);
530 ++sdev->stat.uncorrectable_errors;
531 spin_unlock(&sdev->stat_lock);
532 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
533 "(nodatasum) error at logical %llu\n",
534 fixup->logical);
535 }
536
537 btrfs_free_path(path);
538 kfree(fixup);
539
 540	/* see caller for why we're pretending to be paused in the scrub counters */
541 mutex_lock(&fs_info->scrub_lock);
542 atomic_dec(&fs_info->scrubs_running);
543 atomic_dec(&fs_info->scrubs_paused);
544 mutex_unlock(&fs_info->scrub_lock);
545 atomic_dec(&sdev->fixup_cnt);
546 wake_up(&fs_info->scrub_pause_wait);
547 wake_up(&sdev->list_wait);
548}
549
198/* 550/*
199 * scrub_recheck_error gets called when either verification of the page 551 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 552 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 553 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 554 * one may be bad
203 */ 555 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 556static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 557{
558 struct scrub_dev *sdev = sbio->sdev;
559 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
560 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
561 DEFAULT_RATELIMIT_BURST);
562
206 if (sbio->err) { 563 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 564 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 565 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 566 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 567 return 0;
212 } 568 }
569 if (__ratelimit(&_rs))
570 scrub_print_warning("i/o error", sbio, ix);
571 } else {
572 if (__ratelimit(&_rs))
573 scrub_print_warning("checksum error", sbio, ix);
213 } 574 }
214 575
576 spin_lock(&sdev->stat_lock);
577 ++sdev->stat.read_errors;
578 spin_unlock(&sdev->stat_lock);
579
215 scrub_fixup(sbio, ix); 580 scrub_fixup(sbio, ix);
581 return 1;
216} 582}
217 583
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 584static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 616 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 617 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 618 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 619 struct btrfs_bio *bbio = NULL;
620 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 621 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 622 u64 length;
256 int i; 623 int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 626
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 627 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 628 (sbio->spag[ix].have_csum == 0)) {
629 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
630 if (!fixup)
631 goto uncorrectable;
632 fixup->sdev = sdev;
633 fixup->logical = logical;
634 fixup->root = fs_info->extent_root;
635 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 636 /*
263 * nodatasum, don't try to fix anything 637 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 638 * completing as long as a fixup worker is running. we must also
265 * writeback 639 * increment scrubs_paused to prevent deadlocking on pause
640 * requests used for transactions commits (as the worker uses a
641 * transaction context). it is safe to regard the fixup worker
642 * as paused for all matters practical. effectively, we only
643 * avoid cancellation requests from completing.
266 */ 644 */
267 goto uncorrectable; 645 mutex_lock(&fs_info->scrub_lock);
646 atomic_inc(&fs_info->scrubs_running);
647 atomic_inc(&fs_info->scrubs_paused);
648 mutex_unlock(&fs_info->scrub_lock);
649 atomic_inc(&sdev->fixup_cnt);
650 fixup->work.func = scrub_fixup_nodatasum;
651 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
652 return;
268 } 653 }
269 654
270 length = PAGE_SIZE; 655 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 656 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 657 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 658 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 659 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 660 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 661 (unsigned long long)logical);
277 WARN_ON(1); 662 WARN_ON(1);
663 kfree(bbio);
278 return; 664 return;
279 } 665 }
280 666
281 if (multi->num_stripes == 1) 667 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 668 /* there aren't any replicas */
283 goto uncorrectable; 669 goto uncorrectable;
284 670
285 /* 671 /*
286 * first find a good copy 672 * first find a good copy
287 */ 673 */
288 for (i = 0; i < multi->num_stripes; ++i) { 674 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 675 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 676 continue;
291 677
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 678 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 679 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 680 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 681 /* I/O-error, this is not a good copy */
296 continue; 682 continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 685 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 686 break;
301 } 687 }
302 if (i == multi->num_stripes) 688 if (i == bbio->num_stripes)
303 goto uncorrectable; 689 goto uncorrectable;
304 690
305 if (!sdev->readonly) { 691 if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 700 }
315 } 701 }
316 702
317 kfree(multi); 703 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 704 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 705 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
321 707
322 if (printk_ratelimit()) 708 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 709 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 710 return;
326 711
327uncorrectable: 712uncorrectable:
328 kfree(multi); 713 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 714 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 715 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 716 spin_unlock(&sdev->stat_lock);
332 717
333 if (printk_ratelimit()) 718 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 719 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 720}
337 721
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 722static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 766 int ret;
383 767
384 if (sbio->err) { 768 if (sbio->err) {
769 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 770 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 771 ret |= scrub_recheck_error(sbio, i);
772 if (!ret) {
773 spin_lock(&sdev->stat_lock);
774 ++sdev->stat.unverified_errors;
775 spin_unlock(&sdev->stat_lock);
776 }
387 777
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 778 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 779 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 786 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 787 bi->bv_len = PAGE_SIZE;
398 } 788 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 789 goto out;
404 } 790 }
405 for (i = 0; i < sbio->count; ++i) { 791 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 806 WARN_ON(1);
421 } 807 }
422 kunmap_atomic(buffer, KM_USER0); 808 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 809 if (ret) {
424 scrub_recheck_error(sbio, i); 810 ret = scrub_recheck_error(sbio, i);
811 if (!ret) {
812 spin_lock(&sdev->stat_lock);
813 ++sdev->stat.unverified_errors;
814 spin_unlock(&sdev->stat_lock);
815 }
816 }
425 } 817 }
426 818
427out: 819out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
558{ 950{
559 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 952
563 if (sdev->curr == -1) 953 if (sdev->curr == -1)
564 return 0; 954 return 0;
565 955
566 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 957 sbio->err = 0;
593 sdev->curr = -1; 958 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
595 960
596 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
597 962
598 return 0; 963 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 964}
605 965
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 967 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 968 u8 *csum, int force)
609{ 969{
610 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
611 973
612again: 974again:
613 /* 975 /*
@@ -628,12 +990,22 @@ again:
628 } 990 }
629 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
631 sbio->physical = physical; 995 sbio->physical = physical;
632 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
638 if (ret) 1010 if (ret)
639 return ret; 1011 return ret;
@@ -643,6 +1015,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
646 if (csum) { 1032 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1087
702/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1091{
706 int ret; 1092 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1127 int slot;
742 int i; 1128 int i;
743 u64 nstripes; 1129 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1130 struct extent_buffer *l;
746 struct btrfs_key key; 1131 struct btrfs_key key;
747 u64 physical; 1132 u64 physical;
748 u64 logical; 1133 u64 logical;
749 u64 generation; 1134 u64 generation;
750 u64 mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
751 1140
752 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
753 u64 offset; 1142 u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
779 if (!path) 1168 if (!path)
780 return -ENOMEM; 1169 return -ENOMEM;
781 1170
782 path->reada = 2;
783 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
784 path->skip_locking = 1; 1172 path->skip_locking = 1;
785 1173
786 /* 1174 /*
 787	 * find all extents for each stripe and just read them to get 1175	 * trigger the readahead for the extent tree and csum tree and wait for
788 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
790 */ 1178 */
791 logical = base + offset; 1179 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802 1180
803 /* 1181 wait_event(sdev->list_wait,
804 * we might miss half an extent here, but that doesn't matter, 1182 atomic_read(&sdev->in_flight) == 0);
805 * as it's only the prefetch 1183 atomic_inc(&fs_info->scrubs_paused);
806 */ 1184 wake_up(&fs_info->scrub_pause_wait);
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816
817 break;
818 }
819 btrfs_item_key_to_cpu(l, &key, slot);
820 1185
821 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
822 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
823 1207
824 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
825 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
830 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
831 1218
832 /* 1219 /*
833 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1221 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1222 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1224
847 logical += increment;
848 cond_resched();
849 }
850 /* 1225 /*
851 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
852 */ 1227 */
853 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
855 ret = 0; 1230 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
857 /* 1232 /*
858 * canceled? 1233 * canceled?
859 */ 1234 */
@@ -882,11 +1257,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1260 }
889 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
890 key.objectid = logical; 1268 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
982 1360
983out: 1361out:
984 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1363 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
988} 1365}
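
Throughout the scrub changes above, mirror_num switches from a 0-based u64 to a 1-based int, with 0 left to mean "any mirror"; that is why scrub_fixup() now compares i + 1 against the stored mirror number. A small userspace sketch of the mapping used in scrub_stripe() follows; the enum and function names are illustrative only and merely mirror the arithmetic in the hunks.

#include <stdio.h>

enum raid { RAID0, RAID1, RAID10, DUP, SINGLE };

/* 1-based mirror number for the stripe copy handled by device index
 * 'num', following the arithmetic in scrub_stripe(); 0 would mean
 * "pick any mirror" and is no longer used by the scrub path. */
static int mirror_num_for(enum raid type, int num, int num_stripes,
			  int sub_stripes)
{
	switch (type) {
	case RAID10:
		return num % sub_stripes + 1;
	case RAID1:
	case DUP:
		return num % num_stripes + 1;
	default: /* RAID0, SINGLE: only one copy */
		return 1;
	}
}

int main(void)
{
	int mirror = mirror_num_for(RAID1, 1, 2, 0);
	int copy;

	/* when fixing up, skip the copy we already know is bad */
	for (copy = 0; copy < 2; copy++) {
		if (copy + 1 == mirror)
			continue;
		printf("try copy %d (mirror %d is bad)\n", copy, mirror);
	}
	return 0;
}
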
@@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1158static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 1535static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1159{ 1536{
1160 struct btrfs_fs_info *fs_info = root->fs_info; 1537 struct btrfs_fs_info *fs_info = root->fs_info;
1538 int ret = 0;
1161 1539
1162 mutex_lock(&fs_info->scrub_lock); 1540 mutex_lock(&fs_info->scrub_lock);
1163 if (fs_info->scrub_workers_refcnt == 0) { 1541 if (fs_info->scrub_workers_refcnt == 0) {
1164 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1542 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1165 fs_info->thread_pool_size, &fs_info->generic_worker); 1543 fs_info->thread_pool_size, &fs_info->generic_worker);
1166 fs_info->scrub_workers.idle_thresh = 4; 1544 fs_info->scrub_workers.idle_thresh = 4;
1167 btrfs_start_workers(&fs_info->scrub_workers, 1); 1545 ret = btrfs_start_workers(&fs_info->scrub_workers);
1546 if (ret)
1547 goto out;
1168 } 1548 }
1169 ++fs_info->scrub_workers_refcnt; 1549 ++fs_info->scrub_workers_refcnt;
1550out:
1170 mutex_unlock(&fs_info->scrub_lock); 1551 mutex_unlock(&fs_info->scrub_lock);
1171 1552
1172 return 0; 1553 return ret;
1173} 1554}
1174 1555
1175static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 1556static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
@@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1634 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1635
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1636 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1637 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1638 wake_up(&fs_info->scrub_pause_wait);
1259 1639
1640 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1641
1260 if (progress) 1642 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1643 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1644
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..34a8b6112ea4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,8 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h>
43#include "compat.h" 45#include "compat.h"
44#include "delayed-inode.h" 46#include "delayed-inode.h"
45#include "ctree.h" 47#include "ctree.h"
@@ -58,6 +60,7 @@
58#include <trace/events/btrfs.h> 60#include <trace/events/btrfs.h>
59 61
60static const struct super_operations btrfs_super_ops; 62static const struct super_operations btrfs_super_ops;
63static struct file_system_type btrfs_fs_type;
61 64
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 65static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 66 char nbuf[16])
@@ -162,7 +165,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 165 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 166 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 167 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 168 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 169};
167 170
168static match_table_t tokens = { 171static match_table_t tokens = {
@@ -195,6 +198,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 198 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 199 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 200 {Opt_inode_cache, "inode_cache"},
201 {Opt_no_space_cache, "nospace_cache"},
202 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 203 {Opt_err, NULL},
199}; 204};
200 205
@@ -206,14 +211,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 211{
207 struct btrfs_fs_info *info = root->fs_info; 212 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 213 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 214 char *p, *num, *orig = NULL;
215 u64 cache_gen;
210 int intarg; 216 int intarg;
211 int ret = 0; 217 int ret = 0;
212 char *compress_type; 218 char *compress_type;
213 bool compress_force = false; 219 bool compress_force = false;
214 220
221 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
222 if (cache_gen)
223 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
224
215 if (!options) 225 if (!options)
216 return 0; 226 goto out;
217 227
218 /* 228 /*
219 * strsep changes the string, duplicate it because parse_options 229 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +370,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 370 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 371 break;
362 case Opt_space_cache: 372 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 373 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 374 break;
375 case Opt_no_space_cache:
376 printk(KERN_INFO "btrfs: disabling disk space caching\n");
377 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
378 break;
366 case Opt_inode_cache: 379 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 380 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 381 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +394,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 394 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 395 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 396 break;
397 case Opt_recovery:
398 printk(KERN_INFO "btrfs: enabling auto recovery");
399 btrfs_set_opt(info->mount_opt, RECOVERY);
400 break;
384 case Opt_err: 401 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 402 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 403 "'%s'\n", p);
@@ -391,6 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 408 }
392 } 409 }
393out: 410out:
411 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
412 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 413 kfree(orig);
395 return ret; 414 return ret;
396} 415}
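
The option-parsing hunks above make SPACE_CACHE default to on whenever the superblock records a cache generation; "space_cache" simply keeps it set, the new "nospace_cache" clears it, and a single summary printk at the end replaces the old per-option message. A rough userspace sketch of that default-then-override flow follows, with made-up flag and helper names standing in for the kernel's mount_opt bitmask and the btrfs_set_opt/btrfs_clear_opt macros.

#include <stdio.h>
#include <string.h>

#define OPT_SPACE_CACHE (1 << 0)

/* mimic btrfs_parse_options(): seed the default from the on-disk cache
 * generation, then let an explicit option override it */
static unsigned parse(unsigned long cache_gen, const char *opt)
{
	unsigned mount_opt = 0;

	if (cache_gen)
		mount_opt |= OPT_SPACE_CACHE;

	if (opt && strcmp(opt, "space_cache") == 0)
		mount_opt |= OPT_SPACE_CACHE;
	else if (opt && strcmp(opt, "nospace_cache") == 0)
		mount_opt &= ~OPT_SPACE_CACHE;

	if (mount_opt & OPT_SPACE_CACHE)
		printf("btrfs: disk space caching is enabled\n");
	return mount_opt;
}

int main(void)
{
	parse(1234, NULL);            /* default on: cache_gen present */
	parse(1234, "nospace_cache"); /* explicitly disabled */
	parse(0, "space_cache");      /* enabled by option */
	return 0;
}
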
@@ -406,12 +425,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 425 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 426{
408 substring_t args[MAX_OPT_ARGS]; 427 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 428 char *device_name, *opts, *orig, *p;
410 int error = 0; 429 int error = 0;
411 int intarg; 430 int intarg;
412 431
413 if (!options) 432 if (!options)
414 goto out; 433 return 0;
415 434
416 /* 435 /*
417 * strsep changes the string, duplicate it because parse_options 436 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 449 token = match_token(p, tokens, args);
431 switch (token) { 450 switch (token) {
432 case Opt_subvol: 451 case Opt_subvol:
452 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 453 *subvol_name = match_strdup(&args[0]);
434 break; 454 break;
435 case Opt_subvolid: 455 case Opt_subvolid:
@@ -457,29 +477,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 477 }
458 break; 478 break;
459 case Opt_device: 479 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 480 device_name = match_strdup(&args[0]);
481 if (!device_name) {
482 error = -ENOMEM;
483 goto out;
484 }
485 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 486 flags, holder, fs_devices);
487 kfree(device_name);
462 if (error) 488 if (error)
463 goto out_free_opts; 489 goto out;
464 break; 490 break;
465 default: 491 default:
466 break; 492 break;
467 } 493 }
468 } 494 }
469 495
470 out_free_opts: 496out:
471 kfree(orig); 497 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 498 return error;
484} 499}
485 500
@@ -492,7 +507,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 507 struct btrfs_path *path;
493 struct btrfs_key location; 508 struct btrfs_key location;
494 struct inode *inode; 509 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 510 u64 dir_id;
497 int new = 0; 511 int new = 0;
498 512
@@ -517,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 531 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 532 * to mount.
519 */ 533 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 534 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 535 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 536 if (IS_ERR(di)) {
523 btrfs_free_path(path); 537 btrfs_free_path(path);
@@ -566,29 +580,7 @@ setup_root:
566 return dget(sb->s_root); 580 return dget(sb->s_root);
567 } 581 }
568 582
569 if (new) { 583 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 584}
593 585
594static int btrfs_fill_super(struct super_block *sb, 586static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +711,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 711 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 712 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 713 seq_puts(seq, ",space_cache");
714 else
715 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 716 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 717 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 718 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +747,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 747 return set_anon_super(s, data);
754} 748}
755 749
750/*
751 * subvolumes are identified by ino 256
752 */
753static inline int is_subvolume_inode(struct inode *inode)
754{
755 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
756 return 1;
757 return 0;
758}
759
760/*
761 * This will strip out the subvol=%s argument for an argument string and add
762 * subvolid=0 to make sure we get the actual tree root for path walking to the
763 * subvol we want.
764 */
765static char *setup_root_args(char *args)
766{
767 unsigned copied = 0;
768 unsigned len = strlen(args) + 2;
769 char *pos;
770 char *ret;
771
772 /*
773 * We need the same args as before, but minus
774 *
775 * subvol=a
776 *
777 * and add
778 *
779 * subvolid=0
780 *
781 * which is a difference of 2 characters, so we allocate strlen(args) +
782 * 2 characters.
783 */
784 ret = kzalloc(len * sizeof(char), GFP_NOFS);
785 if (!ret)
786 return NULL;
787 pos = strstr(args, "subvol=");
788
789 /* This shouldn't happen, but just in case.. */
790 if (!pos) {
791 kfree(ret);
792 return NULL;
793 }
794
795 /*
796 * The subvol=<> arg is not at the front of the string, copy everybody
797 * up to that into ret.
798 */
799 if (pos != args) {
800 *pos = '\0';
801 strcpy(ret, args);
802 copied += strlen(args);
803 pos++;
804 }
805
806 strncpy(ret + copied, "subvolid=0", len - copied);
807
808 /* Length of subvolid=0 */
809 copied += 10;
810
811 /*
 812	 * If there is no ',' after the subvol= option then we know there are no
 813	 * other options and we can just return.
814 */
815 pos = strchr(pos, ',');
816 if (!pos)
817 return ret;
818
819 /* Copy the rest of the arguments into our buffer */
820 strncpy(ret + copied, pos, len - copied);
821 copied += strlen(pos);
822
823 return ret;
824}
825
826static struct dentry *mount_subvol(const char *subvol_name, int flags,
827 const char *device_name, char *data)
828{
829 struct super_block *s;
830 struct dentry *root;
831 struct vfsmount *mnt;
832 struct mnt_namespace *ns_private;
833 char *newargs;
834 struct path path;
835 int error;
836
837 newargs = setup_root_args(data);
838 if (!newargs)
839 return ERR_PTR(-ENOMEM);
840 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
841 newargs);
842 kfree(newargs);
843 if (IS_ERR(mnt))
844 return ERR_CAST(mnt);
845
846 ns_private = create_mnt_ns(mnt);
847 if (IS_ERR(ns_private)) {
848 mntput(mnt);
849 return ERR_CAST(ns_private);
850 }
851
852 /*
853 * This will trigger the automount of the subvol so we can just
854 * drop the mnt we have here and return the dentry that we
855 * found.
856 */
857 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
858 LOOKUP_FOLLOW, &path);
859 put_mnt_ns(ns_private);
860 if (error)
861 return ERR_PTR(error);
862
863 if (!is_subvolume_inode(path.dentry->d_inode)) {
864 path_put(&path);
865 mntput(mnt);
866 error = -EINVAL;
867 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
868 subvol_name);
869 return ERR_PTR(-EINVAL);
870 }
871
872 /* Get a ref to the sb and the dentry we found and return it */
873 s = path.mnt->mnt_sb;
874 atomic_inc(&s->s_active);
875 root = dget(path.dentry);
876 path_put(&path);
877 down_write(&s->s_umount);
878
879 return root;
880}
756 881
757/* 882/*
758 * Find a superblock for the given device / mount point. 883 * Find a superblock for the given device / mount point.
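
mount_subvol() above remounts the filesystem with a rewritten option string: setup_root_args() drops "subvol=<name>" and substitutes "subvolid=0" so the initial lookup starts from the real tree root, and the requested subvolume is then resolved with vfs_path_lookup(). A standalone sketch of the string rewrite follows; it assumes the same single-occurrence, comma-separated option format the kernel helper expects, and strip_subvol() is an illustrative name, not the function from the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Rewrite "a,subvol=foo,b" into "a,subvolid=0,b", roughly following
 * setup_root_args() above.  Returns a malloc'd string or NULL. */
static char *strip_subvol(const char *args)
{
	size_t len = strlen(args) + sizeof("subvolid=0"); /* generous bound */
	const char *pos = strstr(args, "subvol=");
	const char *rest;
	char *ret;

	if (!pos)
		return NULL;

	ret = calloc(1, len);
	if (!ret)
		return NULL;

	memcpy(ret, args, pos - args);		/* options before subvol= */
	strcat(ret, "subvolid=0");
	rest = strchr(pos, ',');		/* options after subvol=<name> */
	if (rest)
		strcat(ret, rest);
	return ret;
}

int main(void)
{
	char *out = strip_subvol("noatime,subvol=home,compress=lzo");

	/* prints: noatime,subvolid=0,compress=lzo */
	printf("%s\n", out ? out : "(failed)");
	free(out);
	return 0;
}
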
@@ -767,7 +892,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 892 struct super_block *s;
768 struct dentry *root; 893 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 894 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 895 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 896 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 897 char *subvol_name = NULL;
@@ -781,21 +905,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 905 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 906 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 907 &subvol_rootid, &fs_devices);
784 if (error) 908 if (error) {
909 kfree(subvol_name);
785 return ERR_PTR(error); 910 return ERR_PTR(error);
911 }
786 912
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 913 if (subvol_name) {
788 if (error) 914 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 915 kfree(subvol_name);
916 return root;
917 }
790 918
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 919 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 920 if (error)
793 goto error_free_subvol_name; 921 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 922
800 /* 923 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 924 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +927,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 927 * then open_ctree will properly initialize everything later.
805 */ 928 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 929 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 930 if (!fs_info)
808 if (!fs_info || !tree_root) { 931 return ERR_PTR(-ENOMEM);
932
933 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
934 if (!fs_info->tree_root) {
809 error = -ENOMEM; 935 error = -ENOMEM;
810 goto error_close_devices; 936 goto error_fs_info;
811 } 937 }
812 fs_info->tree_root = tree_root; 938 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 939 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 940
941 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
942 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
943 if (!fs_info->super_copy || !fs_info->super_for_commit) {
944 error = -ENOMEM;
945 goto error_fs_info;
946 }
947
948 error = btrfs_open_devices(fs_devices, mode, fs_type);
949 if (error)
950 goto error_fs_info;
951
952 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
953 error = -EACCES;
954 goto error_close_devices;
955 }
815 956
816 bdev = fs_devices->latest_bdev; 957 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 958 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 959 fs_info->tree_root);
819 goto error_s; 960 if (IS_ERR(s)) {
961 error = PTR_ERR(s);
962 goto error_close_devices;
963 }
820 964
821 if (s->s_root) { 965 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 966 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +970,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 970 }
827 971
828 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 973 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 974 } else {
832 char b[BDEVNAME_SIZE]; 975 char b[BDEVNAME_SIZE];
833 976
834 s->s_flags = flags | MS_NOSEC; 977 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 978 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
979 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 980 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 981 flags & MS_SILENT ? 1 : 0);
838 if (error) { 982 if (error) {
839 deactivate_locked_super(s); 983 deactivate_locked_super(s);
840 goto error_free_subvol_name; 984 return ERR_PTR(error);
841 } 985 }
842 986
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 987 s->s_flags |= MS_ACTIVE;
845 } 988 }
846 989
847 /* if they gave us a subvolume name bind mount into that */ 990 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 991 if (IS_ERR(root)) {
849 struct dentry *new_root; 992 deactivate_locked_super(s);
850 993 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 994 }
886 995
887 kfree(subvol_name);
888 return root; 996 return root;
889 997
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 998error_close_devices:
893 btrfs_close_devices(fs_devices); 999 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 1000error_fs_info:
895 kfree(tree_root); 1001 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 1002 return ERR_PTR(error);
899} 1003}
900 1004
@@ -919,7 +1023,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1023 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1024 return -EACCES;
921 1025
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1026 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1027 return -EINVAL;
924 1028
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1029 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -976,11 +1080,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
976 u64 avail_space; 1080 u64 avail_space;
977 u64 used_space; 1081 u64 used_space;
978 u64 min_stripe_size; 1082 u64 min_stripe_size;
979 int min_stripes = 1; 1083 int min_stripes = 1, num_stripes = 1;
980 int i = 0, nr_devices; 1084 int i = 0, nr_devices;
981 int ret; 1085 int ret;
982 1086
983 nr_devices = fs_info->fs_devices->rw_devices; 1087 nr_devices = fs_info->fs_devices->open_devices;
984 BUG_ON(!nr_devices); 1088 BUG_ON(!nr_devices);
985 1089
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1090 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -990,20 +1094,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
990 1094
 991	/* calc min stripe number for data space allocation */ 1095
992 type = btrfs_get_alloc_profile(root, 1); 1096 type = btrfs_get_alloc_profile(root, 1);
993 if (type & BTRFS_BLOCK_GROUP_RAID0) 1097 if (type & BTRFS_BLOCK_GROUP_RAID0) {
994 min_stripes = 2; 1098 min_stripes = 2;
995 else if (type & BTRFS_BLOCK_GROUP_RAID1) 1099 num_stripes = nr_devices;
1100 } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
996 min_stripes = 2; 1101 min_stripes = 2;
997 else if (type & BTRFS_BLOCK_GROUP_RAID10) 1102 num_stripes = 2;
1103 } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
998 min_stripes = 4; 1104 min_stripes = 4;
1105 num_stripes = 4;
1106 }
999 1107
1000 if (type & BTRFS_BLOCK_GROUP_DUP) 1108 if (type & BTRFS_BLOCK_GROUP_DUP)
1001 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1109 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1002 else 1110 else
1003 min_stripe_size = BTRFS_STRIPE_LEN; 1111 min_stripe_size = BTRFS_STRIPE_LEN;
1004 1112
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1113 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1006 if (!device->in_fs_metadata) 1114 if (!device->in_fs_metadata || !device->bdev)
1007 continue; 1115 continue;
1008 1116
1009 avail_space = device->total_bytes - device->bytes_used; 1117 avail_space = device->total_bytes - device->bytes_used;
@@ -1064,13 +1172,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1064 i = nr_devices - 1; 1172 i = nr_devices - 1;
1065 avail_space = 0; 1173 avail_space = 0;
1066 while (nr_devices >= min_stripes) { 1174 while (nr_devices >= min_stripes) {
1175 if (num_stripes > nr_devices)
1176 num_stripes = nr_devices;
1177
1067 if (devices_info[i].max_avail >= min_stripe_size) { 1178 if (devices_info[i].max_avail >= min_stripe_size) {
1068 int j; 1179 int j;
1069 u64 alloc_size; 1180 u64 alloc_size;
1070 1181
1071 avail_space += devices_info[i].max_avail * min_stripes; 1182 avail_space += devices_info[i].max_avail * num_stripes;
1072 alloc_size = devices_info[i].max_avail; 1183 alloc_size = devices_info[i].max_avail;
1073 for (j = i + 1 - min_stripes; j <= i; j++) 1184 for (j = i + 1 - num_stripes; j <= i; j++)
1074 devices_info[j].max_avail -= alloc_size; 1185 devices_info[j].max_avail -= alloc_size;
1075 } 1186 }
1076 i--; 1187 i--;
@@ -1085,7 +1196,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1196static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1197{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1198 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1199 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1200 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1201 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1202 u64 total_used = 0;
@@ -1187,6 +1298,16 @@ static int btrfs_unfreeze(struct super_block *sb)
1187 return 0; 1298 return 0;
1188} 1299}
1189 1300
1301static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
1302{
1303 int ret;
1304
1305 ret = btrfs_dirty_inode(inode);
1306 if (ret)
1307 printk_ratelimited(KERN_ERR "btrfs: failed to dirty inode %Lu "
1308 "error %d\n", btrfs_ino(inode), ret);
1309}
1310
1190static const struct super_operations btrfs_super_ops = { 1311static const struct super_operations btrfs_super_ops = {
1191 .drop_inode = btrfs_drop_inode, 1312 .drop_inode = btrfs_drop_inode,
1192 .evict_inode = btrfs_evict_inode, 1313 .evict_inode = btrfs_evict_inode,
@@ -1194,7 +1315,7 @@ static const struct super_operations btrfs_super_ops = {
1194 .sync_fs = btrfs_sync_fs, 1315 .sync_fs = btrfs_sync_fs,
1195 .show_options = btrfs_show_options, 1316 .show_options = btrfs_show_options,
1196 .write_inode = btrfs_write_inode, 1317 .write_inode = btrfs_write_inode,
1197 .dirty_inode = btrfs_dirty_inode, 1318 .dirty_inode = btrfs_fs_dirty_inode,
1198 .alloc_inode = btrfs_alloc_inode, 1319 .alloc_inode = btrfs_alloc_inode,
1199 .destroy_inode = btrfs_destroy_inode, 1320 .destroy_inode = btrfs_destroy_inode,
1200 .statfs = btrfs_statfs, 1321 .statfs = btrfs_statfs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
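
The join_transaction() hunk above still allocates the new btrfs_transaction with trans_lock dropped, but when another writer wins the race it now frees the allocation and jumps back to the loop label so the trans_no_join and running_transaction checks are redone under the lock, instead of open-coding a second attach path. A self-contained userspace analogue of that recheck-and-retry shape is sketched below; the mutex, the txn struct and every name in it are stand-ins, not btrfs API.

/* build: cc -pthread join_demo.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct txn {
    int users;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn *running;     /* current shared transaction, if any */
static int no_join;             /* nonzero while joining is forbidden */

static struct txn *join_txn(void)
{
    struct txn *new_txn;

    pthread_mutex_lock(&lock);
loop:
    if (no_join) {              /* re-checked on every retry */
        pthread_mutex_unlock(&lock);
        return NULL;
    }
    if (running) {
        running->users++;
        pthread_mutex_unlock(&lock);
        return running;
    }

    /* nothing running yet: allocate with the lock dropped */
    pthread_mutex_unlock(&lock);
    new_txn = calloc(1, sizeof(*new_txn));
    if (!new_txn)
        return NULL;

    pthread_mutex_lock(&lock);
    if (running) {
        /* someone raced in while we allocated; redo the checks */
        free(new_txn);
        goto loop;
    }
    new_txn->users = 1;
    running = new_txn;
    pthread_mutex_unlock(&lock);
    return new_txn;
}

int main(void)
{
    struct txn *t = join_txn();

    printf("joined transaction with %d user(s)\n", t ? t->users : 0);
    free(t);
    return 0;
}

The value of the goto is that anything checked at the top of the retry loop (here no_join, in the kernel trans_no_join) is re-evaluated every time the lock has been dropped and retaken, so the race path can never skip a check.
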
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv gets used instead of the csum block rsv.

443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any runnings procs may add more while we are here 1150 * any runnings procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..f4b839fd3c9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -295,6 +295,12 @@ loop_lock:
295 btrfs_requeue_work(&device->work); 295 btrfs_requeue_work(&device->work);
296 goto done; 296 goto done;
297 } 297 }
298 /* unplug every 64 requests just for good measure */
299 if (batch_run % 64 == 0) {
300 blk_finish_plug(&plug);
301 blk_start_plug(&plug);
302 sync_pending = 0;
303 }
298 } 304 }
299 305
300 cond_resched(); 306 cond_resched();
@@ -366,6 +372,14 @@ static noinline int device_list_add(const char *path,
366 } 372 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 373 INIT_LIST_HEAD(&device->dev_alloc_list);
368 374
375 /* init readahead state */
376 spin_lock_init(&device->reada_lock);
377 device->reada_curr_zone = NULL;
378 atomic_set(&device->reada_in_flight, 0);
379 device->reada_next = 0;
380 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
381 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
382
369 mutex_lock(&fs_devices->device_list_mutex); 383 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 384 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 385 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +611,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 611 set_blocksize(bdev, 4096);
598 612
599 bh = btrfs_read_dev_super(bdev); 613 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 614 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 615 goto error_close;
603 }
604 616
605 disk_super = (struct btrfs_super_block *)bh->b_data; 617 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 618 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +667,7 @@ error:
655 continue; 667 continue;
656 } 668 }
657 if (fs_devices->open_devices == 0) { 669 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 670 ret = -EINVAL;
659 goto out; 671 goto out;
660 } 672 }
661 fs_devices->seeding = seeding; 673 fs_devices->seeding = seeding;
@@ -993,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 1005 key.objectid = device->devid;
994 key.offset = start; 1006 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1007 key.type = BTRFS_DEV_EXTENT_KEY;
996 1008again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1009 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1010 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1011 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1018 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1019 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1020 btrfs_dev_extent_length(leaf, extent) < start);
1021 key = found_key;
1022 btrfs_release_path(path);
1023 goto again;
1009 } else if (ret == 0) { 1024 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1025 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1026 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1028,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1028 }
1014 BUG_ON(ret); 1029 BUG_ON(ret);
1015 1030
1016 if (device->bytes_used > 0) 1031 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1032 u64 len = btrfs_dev_extent_length(leaf, extent);
1033 device->bytes_used -= len;
1034 spin_lock(&root->fs_info->free_chunk_lock);
1035 root->fs_info->free_chunk_space += len;
1036 spin_unlock(&root->fs_info->free_chunk_lock);
1037 }
1018 ret = btrfs_del_item(trans, root, path); 1038 ret = btrfs_del_item(trans, root, path);
1019 1039
1020out: 1040out:
@@ -1356,6 +1376,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1376 if (ret)
1357 goto error_undo; 1377 goto error_undo;
1358 1378
1379 spin_lock(&root->fs_info->free_chunk_lock);
1380 root->fs_info->free_chunk_space = device->total_bytes -
1381 device->bytes_used;
1382 spin_unlock(&root->fs_info->free_chunk_lock);
1383
1359 device->in_fs_metadata = 0; 1384 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1385 btrfs_scrub_cancel_dev(root, device);
1361 1386
@@ -1387,8 +1412,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1412 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1413 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1414
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1415 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1416 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1417
1393 if (cur_devices->open_devices == 0) { 1418 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1419 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1475,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1476 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1477 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1478 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1479 struct btrfs_device *device;
1455 u64 super_flags; 1480 u64 super_flags;
1456 1481
@@ -1592,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1592 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1617 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1593 return -EINVAL; 1618 return -EINVAL;
1594 1619
1595 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1620 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1596 root->fs_info->bdev_holder); 1621 root->fs_info->bdev_holder);
1597 if (IS_ERR(bdev)) 1622 if (IS_ERR(bdev))
1598 return PTR_ERR(bdev); 1623 return PTR_ERR(bdev);
@@ -1691,15 +1716,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1716 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1717 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1718
1719 spin_lock(&root->fs_info->free_chunk_lock);
1720 root->fs_info->free_chunk_space += device->total_bytes;
1721 spin_unlock(&root->fs_info->free_chunk_lock);
1722
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1723 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1724 root->fs_info->fs_devices->rotating = 1;
1696 1725
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1726 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1727 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1728 total_bytes + device->total_bytes);
1700 1729
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1730 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1731 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1732 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1733 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1734
@@ -1790,7 +1819,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1819 struct btrfs_device *device, u64 new_size)
1791{ 1820{
1792 struct btrfs_super_block *super_copy = 1821 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1822 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1823 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1824 u64 diff = new_size - device->total_bytes;
1796 1825
@@ -1849,7 +1878,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1878static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1879 chunk_offset)
1851{ 1880{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1881 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1882 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1883 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1884 u8 *ptr;
@@ -2175,7 +2204,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2204 bool retried = false;
2176 struct extent_buffer *l; 2205 struct extent_buffer *l;
2177 struct btrfs_key key; 2206 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2207 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2208 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2209 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2210 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2221,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2221 lock_chunks(root);
2193 2222
2194 device->total_bytes = new_size; 2223 device->total_bytes = new_size;
2195 if (device->writeable) 2224 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2225 device->fs_devices->total_rw_bytes -= diff;
2226 spin_lock(&root->fs_info->free_chunk_lock);
2227 root->fs_info->free_chunk_space -= diff;
2228 spin_unlock(&root->fs_info->free_chunk_lock);
2229 }
2197 unlock_chunks(root); 2230 unlock_chunks(root);
2198 2231
2199again: 2232again:
@@ -2257,6 +2290,9 @@ again:
2257 device->total_bytes = old_size; 2290 device->total_bytes = old_size;
2258 if (device->writeable) 2291 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2292 device->fs_devices->total_rw_bytes += diff;
2293 spin_lock(&root->fs_info->free_chunk_lock);
2294 root->fs_info->free_chunk_space += diff;
2295 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2296 unlock_chunks(root);
2261 goto done; 2297 goto done;
2262 } 2298 }
@@ -2292,7 +2328,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2328 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2329 struct btrfs_chunk *chunk, int item_size)
2294{ 2330{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2331 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2332 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2333 u32 array_size;
2298 u8 *ptr; 2334 u8 *ptr;
@@ -2615,6 +2651,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2651 index++;
2616 } 2652 }
2617 2653
2654 spin_lock(&extent_root->fs_info->free_chunk_lock);
2655 extent_root->fs_info->free_chunk_space -= (stripe_size *
2656 map->num_stripes);
2657 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2658
2618 index = 0; 2659 index = 0;
2619 stripe = &chunk->stripe; 2660 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2661 while (index < map->num_stripes) {
@@ -2848,7 +2889,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2889
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2890static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2891 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2892 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2893 int mirror_num)
2853{ 2894{
2854 struct extent_map *em; 2895 struct extent_map *em;
@@ -2866,18 +2907,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2907 int i;
2867 int num_stripes; 2908 int num_stripes;
2868 int max_errors = 0; 2909 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2910 struct btrfs_bio *bbio = NULL;
2870 2911
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2913 stripes_allocated = 1;
2873again: 2914again:
2874 if (multi_ret) { 2915 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2917 GFP_NOFS);
2877 if (!multi) 2918 if (!bbio)
2878 return -ENOMEM; 2919 return -ENOMEM;
2879 2920
2880 atomic_set(&multi->error, 0); 2921 atomic_set(&bbio->error, 0);
2881 } 2922 }
2882 2923
2883 read_lock(&em_tree->lock); 2924 read_lock(&em_tree->lock);
@@ -2898,7 +2939,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2939 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2940 mirror_num = 0;
2900 2941
2901 /* if our multi bio struct is too small, back off and try again */ 2942 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2943 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2945 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2958,11 @@ again:
2917 stripes_required = map->num_stripes; 2958 stripes_required = map->num_stripes;
2918 } 2959 }
2919 } 2960 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2962 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2963 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2964 free_extent_map(em);
2924 kfree(multi); 2965 kfree(bbio);
2925 goto again; 2966 goto again;
2926 } 2967 }
2927 stripe_nr = offset; 2968 stripe_nr = offset;
@@ -2950,7 +2991,7 @@ again:
2950 *length = em->len - offset; 2991 *length = em->len - offset;
2951 } 2992 }
2952 2993
2953 if (!multi_ret) 2994 if (!bbio_ret)
2954 goto out; 2995 goto out;
2955 2996
2956 num_stripes = 1; 2997 num_stripes = 1;
@@ -2975,13 +3016,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3016 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3017 map->num_stripes,
2977 current->pid % map->num_stripes); 3018 current->pid % map->num_stripes);
3019 mirror_num = stripe_index + 1;
2978 } 3020 }
2979 3021
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3022 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3023 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3024 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3025 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3026 stripe_index = mirror_num - 1;
3027 } else {
3028 mirror_num = 1;
3029 }
2985 3030
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3031 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3032 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3046,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3046 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3047 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3048 current->pid % map->sub_stripes);
3049 mirror_num = stripe_index + 1;
3004 } 3050 }
3005 } else { 3051 } else {
3006 /* 3052 /*
@@ -3009,15 +3055,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3055 * stripe_index is the number of our device in the stripe array
3010 */ 3056 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3057 stripe_index = do_div(stripe_nr, map->num_stripes);
3058 mirror_num = stripe_index + 1;
3012 } 3059 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3060 BUG_ON(stripe_index >= map->num_stripes);
3014 3061
3015 if (rw & REQ_DISCARD) { 3062 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3063 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3064 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3065 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3066 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3068
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3070 u64 stripes;
@@ -3038,16 +3085,16 @@ again:
3038 } 3085 }
3039 stripes = stripe_nr_end - 1 - j; 3086 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3087 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3088 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3089 (stripes - stripe_nr + 1);
3043 3090
3044 if (i == 0) { 3091 if (i == 0) {
3045 multi->stripes[i].length -= 3092 bbio->stripes[i].length -=
3046 stripe_offset; 3093 stripe_offset;
3047 stripe_offset = 0; 3094 stripe_offset = 0;
3048 } 3095 }
3049 if (stripe_index == last_stripe) 3096 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3097 bbio->stripes[i].length -=
3051 stripe_end_offset; 3098 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3100 u64 stripes;
@@ -3072,11 +3119,11 @@ again:
3072 } 3119 }
3073 stripes = stripe_nr_end - 1 - j; 3120 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3121 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3122 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3123 (stripes - stripe_nr + 1);
3077 3124
3078 if (i < map->sub_stripes) { 3125 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3126 bbio->stripes[i].length -=
3080 stripe_offset; 3127 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3128 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3129 stripe_offset = 0;
@@ -3084,11 +3131,11 @@ again:
3084 if (stripe_index >= last_stripe && 3131 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3132 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3133 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3134 bbio->stripes[i].length -=
3088 stripe_end_offset; 3135 stripe_end_offset;
3089 } 3136 }
3090 } else 3137 } else
3091 multi->stripes[i].length = *length; 3138 bbio->stripes[i].length = *length;
3092 3139
3093 stripe_index++; 3140 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3141 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3146,20 @@ again:
3099 } 3146 }
3100 } else { 3147 } else {
3101 for (i = 0; i < num_stripes; i++) { 3148 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3149 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3150 map->stripes[stripe_index].physical +
3104 stripe_offset + 3151 stripe_offset +
3105 stripe_nr * map->stripe_len; 3152 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3153 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3154 map->stripes[stripe_index].dev;
3108 stripe_index++; 3155 stripe_index++;
3109 } 3156 }
3110 } 3157 }
3111 if (multi_ret) { 3158 if (bbio_ret) {
3112 *multi_ret = multi; 3159 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3160 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3161 bbio->max_errors = max_errors;
3162 bbio->mirror_num = mirror_num;
3115 } 3163 }
3116out: 3164out:
3117 free_extent_map(em); 3165 free_extent_map(em);
@@ -3120,9 +3168,9 @@ out:
3120 3168
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3170 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3171 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3172{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3173 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3174 mirror_num);
3127} 3175}
3128 3176
@@ -3191,30 +3239,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3239 return 0;
3192} 3240}
3193 3241
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3242static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3243{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3244 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3245 int is_orig_bio = 0;
3198 3246
3199 if (err) 3247 if (err)
3200 atomic_inc(&multi->error); 3248 atomic_inc(&bbio->error);
3201 3249
3202 if (bio == multi->orig_bio) 3250 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3251 is_orig_bio = 1;
3204 3252
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3253 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3254 if (!is_orig_bio) {
3207 bio_put(bio); 3255 bio_put(bio);
3208 bio = multi->orig_bio; 3256 bio = bbio->orig_bio;
3209 } 3257 }
3210 bio->bi_private = multi->private; 3258 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3259 bio->bi_end_io = bbio->end_io;
3260 bio->bi_bdev = (struct block_device *)
3261 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3262 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3263 * beyond the tolerance of the multi-bio
3214 */ 3264 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3265 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3266 err = -EIO;
3217 } else if (err) { 3267 } else {
3218 /* 3268 /*
3219 * this bio is actually up to date, we didn't 3269 * this bio is actually up to date, we didn't
3220 * go over the max number of errors 3270 * go over the max number of errors
@@ -3222,7 +3272,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3272 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3273 err = 0;
3224 } 3274 }
3225 kfree(multi); 3275 kfree(bbio);
3226 3276
3227 bio_endio(bio, err); 3277 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3278 } else if (!is_orig_bio) {
@@ -3302,20 +3352,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3352 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3353 u64 length = 0;
3304 u64 map_length; 3354 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3355 int ret;
3307 int dev_nr = 0; 3356 int dev_nr = 0;
3308 int total_devs = 1; 3357 int total_devs = 1;
3358 struct btrfs_bio *bbio = NULL;
3309 3359
3310 length = bio->bi_size; 3360 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3361 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3362 map_length = length;
3313 3363
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3364 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3365 mirror_num);
3316 BUG_ON(ret); 3366 BUG_ON(ret);
3317 3367
3318 total_devs = multi->num_stripes; 3368 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3369 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3370 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3371 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3373,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3373 (unsigned long long)map_length);
3324 BUG(); 3374 BUG();
3325 } 3375 }
3326 multi->end_io = first_bio->bi_end_io; 3376
3327 multi->private = first_bio->bi_private; 3377 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3378 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3379 bbio->end_io = first_bio->bi_end_io;
3380 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3381
3331 while (dev_nr < total_devs) { 3382 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3383 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3384 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3385 BUG_ON(!bio);
3335 BUG_ON(!bio); 3386 } else {
3336 } else { 3387 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3388 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3389 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3390 bio->bi_end_io = btrfs_end_bio;
3391 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3392 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3393 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3394 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3395 "(%s id %llu), size=%u\n", rw,
3396 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3397 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3398 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3399 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3400 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3407,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3407 }
3355 dev_nr++; 3408 dev_nr++;
3356 } 3409 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3410 return 0;
3360} 3411}
3361 3412
@@ -3616,15 +3667,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3667 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3668 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3669 device->in_fs_metadata = 1;
3619 if (device->writeable) 3670 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3671 device->fs_devices->total_rw_bytes += device->total_bytes;
3672 spin_lock(&root->fs_info->free_chunk_lock);
3673 root->fs_info->free_chunk_space += device->total_bytes -
3674 device->bytes_used;
3675 spin_unlock(&root->fs_info->free_chunk_lock);
3676 }
3621 ret = 0; 3677 ret = 0;
3622 return ret; 3678 return ret;
3623} 3679}
3624 3680
3625int btrfs_read_sys_array(struct btrfs_root *root) 3681int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3682{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3683 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3684 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3685 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3686 struct btrfs_chunk *chunk;
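
The volumes.c hunks above fold btrfs_multi_bio into the renamed btrfs_bio and make __btrfs_map_block() report a 1-based mirror_num for every profile (stripe_index + 1 in the striped cases), which btrfs_end_bio then stashes in bi_bdev for the completion path. For the plain striped layout the mapping itself is modular arithmetic on the stripe length; a small userspace model of it is sketched below, with an invented two-device layout and example offsets, and with BTRFS_STRIPE_LEN assumed to be 64K as in this series.

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN  (64 * 1024)     /* assumed BTRFS_STRIPE_LEN */

struct stripe {
    uint64_t physical;              /* where this device's copy starts */
};

static void map_block(const struct stripe *stripes, int num_stripes,
                      uint64_t offset, uint64_t *physical, int *mirror_num)
{
    uint64_t stripe_nr = offset / STRIPE_LEN;       /* which stripe */
    uint64_t stripe_offset = offset - stripe_nr * STRIPE_LEN;
    int stripe_index = stripe_nr % num_stripes;     /* which device */

    stripe_nr /= num_stripes;                       /* row on that device */
    *physical = stripes[stripe_index].physical +
                stripe_offset + stripe_nr * STRIPE_LEN;
    *mirror_num = stripe_index + 1;                 /* 1-based, as above */
}

int main(void)
{
    /* two devices whose chunk stripes start at these physical offsets */
    struct stripe stripes[] = { { 1 << 20 }, { 8 << 20 } };
    uint64_t offsets[] = { 0, 70 * 1024, 200 * 1024 };
    unsigned int i;

    for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
        uint64_t phys;
        int mirror;

        map_block(stripes, 2, offsets[i], &phys, &mirror);
        printf("logical %7llu -> stripe_index %d (mirror_num %d), physical %llu\n",
               (unsigned long long)offsets[i], mirror - 1, mirror,
               (unsigned long long)phys);
    }
    return 0;
}

Offset 0 maps to the first device, 70K falls 6K into the second device's first stripe, and 200K wraps around to the second device's next row; mirror_num simply echoes stripe_index + 1, which is what the completion path now records in bi_bdev for retries on another copy.
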
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;