aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c351
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h131
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c709
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h29
-rw-r--r--fs/btrfs/ctree.c3953
-rw-r--r--fs/btrfs/ctree.h2129
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2343
-rw-r--r--fs/btrfs/disk-io.h102
-rw-r--r--fs/btrfs/export.c203
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5986
-rw-r--r--fs/btrfs/extent_io.c3717
-rw-r--r--fs/btrfs/extent_io.h269
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c831
-rw-r--r--fs/btrfs/file.c1288
-rw-r--r--fs/btrfs/free-space-cache.c495
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c144
-rw-r--r--fs/btrfs/inode.c5035
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/ioctl.h67
-rw-r--r--fs/btrfs/locking.c88
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c730
-rw-r--r--fs/btrfs/ordered-data.h158
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c216
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c366
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c720
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1097
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c147
-rw-r--r--fs/btrfs/tree-log.c2898
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3218
-rw-r--r--fs/btrfs/volumes.h162
-rw-r--r--fs/btrfs/xattr.c322
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c632
55 files changed, 42383 insertions, 0 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
ifneq ($(KERNELRELEASE),)
# kbuild part of makefile: this branch is taken when KERNELRELEASE is set
# and only describes which objects make up btrfs.o.

obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   file-item.o inode-item.o inode-map.o disk-io.o \
	   transaction.o inode.o file.o tree-defrag.o \
	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
	   compression.o
else

# Normal Makefile: taken when make is run directly in this directory.
# Every target re-invokes make in KERNELDIR with M pointing back here.

KERNELDIR := /lib/modules/`uname -r`/build

# None of these targets creates a file of the same name.  Declaring them
# phony keeps a stray file called "all" or "clean" from masking the rule.
.PHONY: all modules_install clean

all:
	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules

modules_install:
	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
clean:
	$(MAKE) -C $(KERNELDIR) M=`pwd` clean

endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
/*
 * Stubs used when CONFIG_FS_POSIX_ACL is not set.
 */

/* No ACL can exist, so there is nothing to adjust after a chmod. */
int btrfs_acl_chmod(struct inode *inode)
{
	return 0;
}

/* No ACL can be inherited from the parent directory. */
int btrfs_init_acl(struct inode *inode, struct inode *dir)
{
	return 0;
}

/*
 * There is never an ACL to consult.  Return -EAGAIN, the same value the
 * CONFIG_FS_POSIX_ACL version returns for an inode without an access ACL.
 * Returning 0 here would report that an ACL granted the requested access.
 */
int btrfs_check_acl(struct inode *inode, int mask)
{
	return -EAGAIN;
}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23# include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
/*
 * container for the kthread task pointer and the list of pending work
 * One of these is allocated per thread (see btrfs_start_workers) and
 * freed again in btrfs_stop_workers.
 */
struct btrfs_worker_thread {
	/* pool we belong to */
	struct btrfs_workers *workers;

	/* list of struct btrfs_work that are waiting for service */
	struct list_head pending;

	/*
	 * link into either workers->worker_list or workers->idle_list,
	 * depending on the idle flag below
	 */
	struct list_head worker_list;

	/* kthread running worker_loop() for this worker */
	struct task_struct *task;

	/* number of things on the pending list */
	atomic_t num_pending;

	/*
	 * bumped by next_worker() each time this thread is picked from the
	 * busy list; the thread is rotated to the list tail whenever it
	 * reaches a multiple of workers->idle_thresh
	 */
	unsigned long sequence;

	/* protects the pending list. */
	spinlock_t lock;

	/*
	 * set to non-zero when this thread is already awake and kicking;
	 * btrfs_queue_worker() uses it to skip wake_up_process()
	 */
	int working;

	/* are we currently idle, i.e. linked on workers->idle_list */
	int idle;
};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while (!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return null if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any workers, just
324 * return the force one
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
/*
 * This is similar to a workqueue, but it is meant to spread the operations
 * across all available cpus instead of just the CPU that was used to
 * queue the work.  There is also some batching introduced to try and
 * cut down on context switches.
 *
 * By default threads are added on demand up to 2 * the number of cpus.
 * Changing struct btrfs_workers->max_workers is one way to prevent
 * demand creation of kthreads.
 *
 * the basic model of these worker threads is to embed a btrfs_work
 * structure in your own data struct, and use container_of in a
 * work function to get back to your data struct.
 */
struct btrfs_work {
	/*
	 * func should be set to the function you want called
	 * your work struct is passed as the only arg
	 *
	 * ordered_func must be set for work sent to an ordered work queue,
	 * and it is called to complete a given work item in the same
	 * order they were sent to the queue.
	 *
	 * ordered_free must be set for an ordered work queue as well; it is
	 * called after ordered_func has returned and the item has been
	 * taken off the queue's order list.
	 */
	void (*func)(struct btrfs_work *work);
	void (*ordered_func)(struct btrfs_work *work);
	void (*ordered_free)(struct btrfs_work *work);

	/*
	 * flags should be set to zero.  It is used to make sure the
	 * struct is only inserted once into the list.
	 */
	unsigned long flags;

	/* don't touch these */
	/* worker thread that last picked this item up */
	struct btrfs_worker_thread *worker;
	/* link on the worker thread's pending list */
	struct list_head list;
	/* link on the pool's order_list (ordered queues only) */
	struct list_head order_list;
};
62
/* a pool of worker threads; set up with btrfs_init_workers() */
struct btrfs_workers {
	/* current number of running workers */
	int num_workers;

	/*
	 * max number of workers allowed, set by btrfs_init_workers.
	 * btrfs_start_workers itself does not enforce it.
	 */
	int max_workers;

	/*
	 * a worker leaves the idle list once it has this many pending
	 * requests, and returns to it when it drops below half of this
	 */
	int idle_thresh;

	/* force completions in the order they were queued */
	int ordered;

	/* list with all the work threads.  The workers on the idle thread
	 * may be actively servicing jobs, but they haven't yet hit the
	 * idle thresh limit above.
	 */
	struct list_head worker_list;
	struct list_head idle_list;

	/*
	 * when operating in ordered mode, this maintains the list
	 * of work items waiting for completion
	 */
	struct list_head order_list;

	/* lock for finding the next worker thread to queue on */
	spinlock_t lock;

	/* extra name for this pool, used to build the kthread names */
	char *name;
};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
/* in memory btrfs inode */
struct btrfs_inode {
	/* which subvolume this inode belongs to */
	struct btrfs_root *root;

	/* key used to find this inode on disk.  This is used by the code
	 * to read in roots of subvolumes
	 */
	struct btrfs_key location;

	/* the extent_tree has caches of all the extent mappings to disk */
	struct extent_map_tree extent_tree;

	/* the io_tree does range state (DIRTY, LOCKED etc) */
	struct extent_io_tree io_tree;

	/* special utility tree used to record which mirrors have already been
	 * tried when checksums fail for a given block
	 */
	struct extent_io_tree io_failure_tree;

	/* held while inserting or deleting extents from files */
	struct mutex extent_mutex;

	/* held while logging the inode in tree-log.c */
	struct mutex log_mutex;

	/* used to order data wrt metadata */
	struct btrfs_ordered_inode_tree ordered_tree;

	/* standard acl pointers */
	struct posix_acl *i_acl;
	struct posix_acl *i_default_acl;

	/* for keeping track of orphaned inodes */
	struct list_head i_orphan;

	/* list of all the delalloc inodes in the FS.  There are times we need
	 * to write all the delalloc pages to disk, and this list is used
	 * to walk them all.
	 */
	struct list_head delalloc_inodes;

	/* full 64 bit generation number, struct inode doesn't have a big
	 * enough field for this.
	 */
	u64 generation;

	/* sequence number for NFS changes */
	u64 sequence;

	/*
	 * transid of the trans_handle that last modified this inode
	 */
	u64 last_trans;
	/*
	 * transid that last logged this inode
	 */
	u64 logged_trans;

	/*
	 * trans that last made a change that should be fully fsync'd.  This
	 * gets reset to zero each time the inode is logged
	 */
	u64 log_dirty_trans;

	/* total number of bytes pending delalloc, used by stat to calc the
	 * real block usage of the file
	 */
	u64 delalloc_bytes;

	/*
	 * the size of the file stored in the metadata on disk.  data=ordered
	 * means the in-memory i_size might be larger than the size on disk
	 * because not all the blocks are written yet.
	 */
	u64 disk_i_size;

	/* flags field from the on disk inode */
	u32 flags;

	/*
	 * if this is a directory then index_cnt is the counter for the index
	 * number for new files that are created
	 */
	u64 index_cnt;

	/* the start of block group preferred for allocations. */
	u64 block_group;

	/* embedded VFS inode; BTRFS_I() maps it back to this structure */
	struct inode vfs_inode;
};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
/* btrfs-prefixed aliases for the VFS link count helpers */
#define btrfs_drop_nlink(inode) drop_nlink(inode)
#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
/*
 * Bookkeeping for one compressed extent that is being read or written.
 * A single compressed_bio is installed as bi_private on every bio that is
 * submitted for the extent; the end_io handler that drops pending_bios to
 * zero does the final cleanup and frees it.
 */
struct compressed_bio {
	/* number of bios pending for this compressed extent */
	atomic_t pending_bios;

	/* the pages with the compressed data on them */
	struct page **compressed_pages;

	/* inode that owns this data */
	struct inode *inode;

	/* starting offset in the inode for our pages */
	u64 start;

	/* number of bytes in the inode we're working on */
	unsigned long len;

	/* number of bytes on disk */
	unsigned long compressed_len;

	/* number of compressed pages in the array */
	unsigned long nr_pages;

	/* IO errors: set to 1 by the end_io handlers, never cleared */
	int errors;

	/* mirror passed to btrfs_map_bio for reads; writes store 0 here */
	int mirror_num;

	/* for reads, this is the bio we are copying the data into */
	struct bio *orig_bio;

	/*
	 * the start of a variable length array of checksums only
	 * used by reads.  The extra room is allocated past the end of
	 * the struct (see compressed_bio_size), so this must stay the
	 * last member.
	 */
	u32 sums;
};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, lets start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
270/*
271 * do the cleanup once all the compressed pages hit the disk.
272 * This will clear writeback on the file pages and free the compressed
273 * pages.
274 *
275 * This also calls the writeback end hooks for the file pages so that
276 * metadata and checksums can be updated in the file.
277 */
278static void end_compressed_bio_write(struct bio *bio, int err)
279{
280 struct extent_io_tree *tree;
281 struct compressed_bio *cb = bio->bi_private;
282 struct inode *inode;
283 struct page *page;
284 unsigned long index;
285
286 if (err)
287 cb->errors = 1;
288
289 /* if there are more bios still pending for this compressed
290 * extent, just exit
291 */
292 if (!atomic_dec_and_test(&cb->pending_bios))
293 goto out;
294
295 /* ok, we're the last bio for this extent, step one is to
296 * call back into the FS and do all the end_io operations
297 */
298 inode = cb->inode;
299 tree = &BTRFS_I(inode)->io_tree;
300 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
301 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
302 cb->start,
303 cb->start + cb->len - 1,
304 NULL, 1);
305 cb->compressed_pages[0]->mapping = NULL;
306
307 end_compressed_writeback(inode, cb->start, cb->len);
308 /* note, our inode could be gone now */
309
310 /*
311 * release the compressed pages, these came from alloc_page and
312 * are not attached to the inode at all
313 */
314 index = 0;
315 for (index = 0; index < cb->nr_pages; index++) {
316 page = cb->compressed_pages[index];
317 page->mapping = NULL;
318 page_cache_release(page);
319 }
320
321 /* finally free the cb struct */
322 kfree(cb->compressed_pages);
323 kfree(cb);
324out:
325 bio_put(bio);
326}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
/*
 * Opportunistically append more file pages to cb->orig_bio.
 *
 * Starting at the byte just past the last page already in the bio, new
 * pages are allocated, inserted into the page cache and added to the bio
 * until compressed_end or the last page of the file is reached, or until
 * one of the checks below gives up.  Failures here are not reported:
 * the function always returns 0.
 */
static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb)
{
	unsigned long end_index;
	unsigned long page_index;
	u64 last_offset;
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	unsigned long nr_pages = 0;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct pagevec pvec;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	u64 end;
	int misses = 0;

	/* file offset of the first byte after the last page in the bio */
	page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
	last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/* index of the page holding the last byte of the file */
	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

	pagevec_init(&pvec, 0);
	while (last_offset < compressed_end) {
		page_index = last_offset >> PAGE_CACHE_SHIFT;

		if (page_index > end_index)
			break;

		/*
		 * skip indexes that already have a page in the mapping;
		 * give up once more than four of them have been seen
		 */
		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_index);
		rcu_read_unlock();
		if (page) {
			misses++;
			if (misses > 4)
				break;
			goto next;
		}

		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
		if (!page)
			break;

		page->index = page_index;
		/*
		 * what we want to do here is call add_to_page_cache_lru,
		 * but that isn't exported, so we reproduce it here
		 */
		if (add_to_page_cache(page, mapping,
				      page->index, GFP_NOFS)) {
			page_cache_release(page);
			goto next;
		}

		/* open coding of lru_cache_add, also not exported */
		page_cache_get(page);
		if (!pagevec_add(&pvec, page))
			__pagevec_lru_add_file(&pvec);

		end = last_offset + PAGE_CACHE_SIZE - 1;
		/*
		 * at this point, we have a locked page in the page cache
		 * for these bytes in the file.  But, we have to make
		 * sure they map to this compressed extent on disk.
		 */
		set_page_extent_mapped(page);
		lock_extent(tree, last_offset, end, GFP_NOFS);
		spin_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, last_offset,
					   PAGE_CACHE_SIZE);
		spin_unlock(&em_tree->lock);

		/*
		 * stop if there is no mapping, the page is not fully
		 * inside it, or it does not start at the disk sector
		 * the original bio points to
		 */
		if (!em || last_offset < em->start ||
		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
		    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, last_offset, end, GFP_NOFS);
			unlock_page(page);
			page_cache_release(page);
			break;
		}
		free_extent_map(em);

		/* zero the part of the last file page that is past i_size */
		if (page->index == end_index) {
			char *userpage;
			size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);

			if (zero_offset) {
				int zeros;
				zeros = PAGE_CACHE_SIZE - zero_offset;
				userpage = kmap_atomic(page, KM_USER0);
				memset(userpage + zero_offset, 0, zeros);
				flush_dcache_page(page);
				kunmap_atomic(userpage, KM_USER0);
			}
		}

		ret = bio_add_page(cb->orig_bio, page,
				   PAGE_CACHE_SIZE, 0);

		if (ret == PAGE_CACHE_SIZE) {
			/* page and extent stay locked while in the bio */
			nr_pages++;
			page_cache_release(page);
		} else {
			/* the bio is full: undo the locking and stop */
			unlock_extent(tree, last_offset, end, GFP_NOFS);
			unlock_page(page);
			page_cache_release(page);
			break;
		}
next:
		last_offset += PAGE_CACHE_SIZE;
	}
	/* push any pages still batched in the pagevec onto the LRU */
	if (pagevec_count(&pvec))
		__pagevec_lru_add_file(&pvec);
	return 0;
}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra-bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
/*
 * this file used to do more for selecting the HW version of crc32c,
 * perhaps it will one day again soon.
 *
 * For now btrfs_crc32c() forwards its three arguments unchanged to
 * crc32c() from <linux/crc32c.h>.
 */
#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * buf is the node to copy; its contents are duplicated into a freshly
 * allocated block whose header is rewritten for new_root_objectid and the
 * current transaction.  *cow_ret is only written on success.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
	struct btrfs_root *new_root;

	/*
	 * temporary copy of root with the new objectid; it is only handed
	 * to the allocation and ref counting calls below and is freed
	 * before this function returns
	 */
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	/* nritems is read here but not used again in this function */
	nritems = btrfs_header_nritems(buf);

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
	if (IS_ERR(cow)) {
		kfree(new_root);
		return PTR_ERR(cow);
	}

	/* duplicate the block, then fix up the header fields of the copy */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
	kfree(new_root);

	/*
	 * NOTE(review): on failure cow is neither handed to the caller nor
	 * released here - confirm whether the new block is leaked.
	 */
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
408/*
409 * helper function for defrag to decide if two blocks pointed to by a
410 * node are actually close by
411 */
412static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413{
414 if (blocknr < other && other - (blocknr + blocksize) < 32768)
415 return 1;
416 if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 return 1;
418 return 0;
419}
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
566 btrfs_tree_lock(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592 * this returns the address of the start of the last item,
593 * which is the stop of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
603
604/*
605 * extra debugging checks to make sure all the items in a key are
606 * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 return 0;
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 return -1;
819}
820
821/* given a node and slot number, this reads the blocks it points to. The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844 * that a deletion won't leave an node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 if (level < BTRFS_MAX_LEVEL - 1)
871 parent = path->nodes[level + 1];
872 pslot = path->slots[level + 1];
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 if (level < BTRFS_MAX_LEVEL - 1)
1099 parent = path->nodes[level + 1];
1100 pslot = path->slots[level + 1];
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203 * to the block in 'slot', and triggering ra on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
1329
1330/*
1331 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 * blocks owened by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691 * making sure the right key of each node is points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723 * This function isn't completely safe. It's the caller's responsibility
1724 * that the new key won't break the order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 /* trying to split the root, lets make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
2141 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2334 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be smaller enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 break;
3115 nr = i;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks but it has to be done, if we are inserting at
3205 * the end of the leaf only insert 1 of the items, since we
3206 * have no way of knowing whats on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
/* types that are only referenced by pointer in this header */
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_ordered_sum;

/* slab caches, defined outside this header */
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;

#define BTRFS_MAGIC "_BHRfS_M"

#define BTRFS_ACL_NOT_CACHED    ((void *)-1)

/* size of the per-level arrays in struct btrfs_path */
#ifdef CONFIG_LOCKDEP
# define BTRFS_MAX_LEVEL 7
#else
# define BTRFS_MAX_LEVEL 8
#endif
51
/* the tree of tree roots: holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL

/* records which extents are in use, together with reference counts */
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL

/*
 * the chunk tree translates logical block numbers into physical ones;
 * the super block points to the chunk tree
 */
#define BTRFS_CHUNK_TREE_OBJECTID 3ULL

/*
 * records which areas of a given device are in use, one per device.
 * The tree of tree roots points to the device tree
 */
#define BTRFS_DEV_TREE_OBJECTID 4ULL

/* one per subvolume, storing files and directories */
#define BTRFS_FS_TREE_OBJECTID 5ULL

/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL

/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL

/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL

/* does write ahead logging to speed up fsyncs */
#define BTRFS_TREE_LOG_OBJECTID -6ULL
#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL

/* for space balancing */
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL

/*
 * extent checksums all have this objectid
 * this allows them to share the logging tree
 * for fsyncs
 */
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL

/* dummy objectid that stands for multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL

/*
 * All files have objectids in this range.
 */
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
#define BTRFS_LAST_FREE_OBJECTID -256ULL
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL


/*
 * the device items go into the chunk tree.  The key is in the form
 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
 */
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL

/*
 * much bigger names could be stored, but the limit is kept at what the
 * rest of linux expects
 */
#define BTRFS_NAME_LEN 255
119
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32

/* csum types */
#define BTRFS_CSUM_TYPE_CRC32	0

/*
 * Number of checksum bytes actually used, indexed by csum type: four
 * bytes for CRC32.  The table is read-only, so it is const; every file
 * including this header gets its own copy and none may modify it.
 */
static const int btrfs_csum_sizes[] = { 4, 0 };

#define BTRFS_EMPTY_DIR_SIZE 0
130
/* BTRFS_FT_* file type codes; BTRFS_FT_MAX is one past the last code */
#define BTRFS_FT_UNKNOWN	0
#define BTRFS_FT_REG_FILE	1
#define BTRFS_FT_DIR		2
#define BTRFS_FT_CHRDEV		3
#define BTRFS_FT_BLKDEV		4
#define BTRFS_FT_FIFO		5
#define BTRFS_FT_SOCK		6
#define BTRFS_FT_SYMLINK	7
#define BTRFS_FT_XATTR		8
#define BTRFS_FT_MAX		9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresonds to the inode number. The flags
145 * tells us things about the object, and is a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allowr for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot, a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
/*
 * Block geometry helpers.  'r' is a pointer to a root (anything with
 * nodesize/leafsize members), 'bs' is a block size in bytes.  Macro
 * arguments are parenthesized so that expressions such as 'p + 1' or
 * 'cond ? a : b' can be passed safely.
 */

/* number of key pointers that fit in a node after the header */
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
					sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
/* bytes left in a block of size bs once the header is taken out */
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE((r)->leafsize))
/* leaf data size minus one item header and one file extent item */
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here own down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
/*
 * Feature flags that we support.  If any incompat flags are set other
 * than the ones specified below then we will fail to mount.  No flag
 * bits are defined as supported yet.
 */
#define BTRFS_FEATURE_COMPAT_SUPP	0x0
#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record free space on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
/* compression methods; BTRFS_COMPRESS_LAST is one past the last method */
typedef enum {
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LAST = 2,
} btrfs_compression_type;
462
/* no encryption method is understood right now */
typedef enum {
	BTRFS_ENCRYPTION_NONE = 0,
	BTRFS_ENCRYPTION_LAST = 1,
} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
/* file extent kinds */
#define BTRFS_FILE_EXTENT_INLINE	0
#define BTRFS_FILE_EXTENT_REG		1
#define BTRFS_FILE_EXTENT_PREALLOC	2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
567 * disk space consumed by the extent, checksum blocks are included
568 * in these numbers
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
573 * the logical offset in file blocks (no csums)
574 * this extent record is for. This allows a file extent to point
575 * into the middle of an existing extent on disk, sharing it
576 * between two snapshots (useful if some bytes in the middle of the
577 * extent have changed
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
/* flag bits for the different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
754 * two
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
765 * the cow mechanism and make them safe to write. It happens
766 * for the sys_munmap function call path
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
773 /* tree relocation relocated fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
811 * in ram representation of the tree. extent_root is used for all allocations
812 * and for the extent tree extent_root root.
813 */
814 struct btrfs_dirty_root;
/*
 * struct btrfs_root: in-memory state kept for one tree.
 */
815 struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
/* btrfs_test_opt() reads fs_info->mount_opt through this pointer */
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
/* kmalloc'ed, NUL terminated; replaced by btrfs_set_root_name() */
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877 };
878
879/*
880
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
911 * root items point to tree roots. There are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
937 * block groups give us hints into the extent allocation trees. Which
938 * blocks are free etc etc
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
951
/* mount option bits, one bit per option */
952 #define BTRFS_MOUNT_NODATASUM (1 << 0)
953 #define BTRFS_MOUNT_NODATACOW (1 << 1)
954 #define BTRFS_MOUNT_NOBARRIER (1 << 2)
955 #define BTRFS_MOUNT_SSD (1 << 3)
956 #define BTRFS_MOUNT_DEGRADED (1 << 4)
957 #define BTRFS_MOUNT_COMPRESS (1 << 5)
958
/*
 * 'opt' is the option name without the BTRFS_MOUNT_ prefix (e.g. SSD).
 * btrfs_clear_opt and btrfs_set_opt modify the option word 'o' in place;
 * btrfs_test_opt takes a root and is non-zero when the bit is set in
 * root->fs_info->mount_opt.
 */
959 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961 #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977 /* some macros to generate set/get funcs for the struct fields. This
978 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
979 * one for u8: both conversions are the identity and __le8 is plain u8
980 */
981 #define le8_to_cpu(v) (v)
982 #define cpu_to_le8(v) (v)
983 #define __le8 u8
984
/*
 * Call read_extent_buffer() on eb with 'result' as the memory buffer, the
 * position ptr + offsetof(type, member) and the size of that member as the
 * length. 'ptr' is only used as an integer here; it is never dereferenced.
 */
985 #define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
/*
 * Call write_extent_buffer() on eb with 'result' as the memory buffer, the
 * position ptr + offsetof(type, member) and the size of that member as the
 * length. 'ptr' is only used as an integer here; it is never dereferenced.
 */
991 #define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
/*
 * Unless BTRFS_SETGET_FUNCS was already defined by the includer, each use
 * only declares a getter btrfs_<name>(eb, s) returning u<bits> and a setter
 * btrfs_set_<name>(eb, s, val). 'member' is not used by this default
 * definition.
 */
997 #ifndef BTRFS_SETGET_FUNCS
998 #define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999 u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000 void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001 #endif
1002
/*
 * Define inline btrfs_<name>(eb) and btrfs_set_<name>(eb, val). Both map
 * eb->first_page with kmap_atomic(KM_USER0), treat the mapping as a 'type',
 * read or write p->member with the le<bits> <-> cpu conversion, and unmap
 * before returning.
 */
1003 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004 static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005 { \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010 } \
1011 static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013 { \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017 }
1018
/*
 * Define inline btrfs_<name>(s) and btrfs_set_<name>(s, val) that access
 * s->member directly through the struct pointer, converting between the
 * little-endian stored value and cpu byte order.
 */
1019 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020 static inline u##bits btrfs_##name(type *s) \
1021 { \
1022 return le##bits##_to_cpu(s->member); \
1023 } \
1024 static inline void btrfs_set_##name(type *s, u##bits val) \
1025 { \
1026 s->member = cpu_to_le##bits(val); \
1027 }
1028
/* struct btrfs_dev_item: extent-buffer based get/set accessors */
1029 BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030 BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031 BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032 BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034 BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039 BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040 BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041 BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
/*
 * struct btrfs_dev_item: accessors that work on a struct pointer directly
 * (no extent buffer argument). There is no stack variant for start_offset.
 */
1043 BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044 BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046 BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048 BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050 BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052 BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054 BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055 BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057 BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059 BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061 BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
/*
 * Return d advanced by the offset of the uuid field of struct
 * btrfs_dev_item. d itself is not dereferenced.
 */
1064 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065 {
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067 }
1068
/*
 * Return d advanced by the offset of the fsid field of struct
 * btrfs_dev_item. d itself is not dereferenced.
 */
1069 static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070 {
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072 }
1073
/* struct btrfs_chunk and struct btrfs_stripe: extent-buffer accessors */
1074 BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077 BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078 BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079 BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080 BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081 BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082 BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083 BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084 BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
/*
 * Return s advanced by the offset of the dev_uuid field of struct
 * btrfs_stripe. s itself is not dereferenced.
 */
1086 static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087 {
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089 }
1090
/*
 * struct btrfs_chunk and struct btrfs_stripe: accessors that work on a
 * struct pointer directly (no extent buffer argument)
 */
1091 BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092 BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093 BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095 BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097 BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099 BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101 BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102 BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104 BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106 BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107 BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
/*
 * Locate stripe number nr of chunk c: the value of c, plus the offset of
 * the stripe member, plus nr whole struct btrfs_stripe entries. c is not
 * dereferenced and nr is not checked against the chunk's stripe count.
 */
1109 static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111 {
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116 }
1117
/* location of the dev_uuid field of stripe number nr in chunk c */
1118 static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
1119 {
1120 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
1121 }
1122
/* read the offset field of stripe number nr of chunk c from eb */
1123 static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125 {
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127 }
1128
/* store val in the offset field of stripe number nr of chunk c in eb */
1129 static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132 {
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134 }
1135
/* read the devid field of stripe number nr of chunk c from eb */
1136 static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138 {
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140 }
1141
/* store val in the devid field of stripe number nr of chunk c in eb */
1142 static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145 {
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147 }
1148
1149 /*
 * struct btrfs_block_group_item: the plain names are the struct-pointer
 * (STACK) variants, the disk_ prefixed names take an extent buffer
 */
1150 BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154 BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157 BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159 BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161 BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164 /* struct btrfs_inode_ref: extent-buffer accessors */
1165 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168 /* struct btrfs_inode_item: extent-buffer accessors for the scalar fields */
1169 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170 BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173 BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174 BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175 BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177 BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179 BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
/*
 * Return inode_item advanced by the offset of its atime field, typed as a
 * btrfs_timespec pointer. inode_item is not dereferenced.
 */
1182 static inline struct btrfs_timespec *
1183 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184 {
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188 }
1189
/*
 * Return inode_item advanced by the offset of its mtime field, typed as a
 * btrfs_timespec pointer. inode_item is not dereferenced.
 */
1190 static inline struct btrfs_timespec *
1191 btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192 {
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196 }
1197
/*
 * Return inode_item advanced by the offset of its ctime field, typed as a
 * btrfs_timespec pointer. inode_item is not dereferenced.
 */
1198 static inline struct btrfs_timespec *
1199 btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200 {
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204 }
1205
/*
 * Return inode_item advanced by the offset of its otime field, typed as a
 * btrfs_timespec pointer. inode_item is not dereferenced.
 */
1206 static inline struct btrfs_timespec *
1207 btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208 {
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212 }
1213
/* struct btrfs_timespec: 64 bit sec and 32 bit nsec accessors */
1214 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217 /* struct btrfs_dev_extent: extent-buffer accessors */
1218 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220 BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222 BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224 BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
/*
 * Return dev advanced by the offset of the chunk_tree_uuid field of struct
 * btrfs_dev_extent. dev is not dereferenced.
 */
1226 static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227 {
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230 }
1231
1232 /*
 * struct btrfs_extent_ref: extent-buffer accessors followed by the
 * struct-pointer (stack_) variants of the same four fields
 */
1233 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236 BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238 BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239 BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241 BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243 BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246 /* struct btrfs_extent_item: 32 bit refs, extent-buffer and stack variants */
1247 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251 /* struct btrfs_node: accessors for one struct btrfs_key_ptr entry */
1252 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253 BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
/*
 * Read the blockptr of key pointer number nr in node eb. ptr is the byte
 * offset of that entry inside struct btrfs_node; it is cast to a pointer
 * only to be passed to btrfs_key_blockptr() and is not an address.
 */
1255 static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256 {
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261 }
1262
/*
 * Store val as the blockptr of key pointer number nr in node eb. ptr is a
 * byte offset inside struct btrfs_node cast to a pointer, not an address.
 */
1263 static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265 {
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270 }
1271
/*
 * Read the generation of key pointer number nr in node eb. ptr is a byte
 * offset inside struct btrfs_node cast to a pointer, not an address.
 */
1272 static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273 {
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278 }
1279
/*
 * Store val as the generation of key pointer number nr in node eb. ptr is
 * a byte offset inside struct btrfs_node cast to a pointer, not an address.
 */
1280 static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282 {
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287 }
1288
/* byte offset of key pointer number nr from the start of struct btrfs_node */
1289 static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290 {
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293 }
1294
/*
 * Declared here, defined elsewhere. Used by btrfs_node_key_to_cpu() to fill
 * *disk_key for slot nr of eb.
 */
1295 void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
/*
 * Write *disk_key into the key member of key pointer number nr of node eb,
 * using write_eb_member() at the offset from btrfs_node_key_ptr_offset().
 */
1298 static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300 {
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305 }
1306
1307 /* struct btrfs_item: 32 bit offset and size accessors */
1308 BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309 BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
/* byte offset of item header number nr from the start of struct btrfs_leaf */
1311 static inline unsigned long btrfs_item_nr_offset(int nr)
1312 {
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315 }
1316
/*
 * Return btrfs_item_nr_offset(nr) cast to an item pointer. The value is a
 * byte offset, not an address, and eb is not used.
 */
1317 static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1318 int nr)
1319 {
1320 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1321 }
1322
/* offset + size of the given item, i.e. the offset just past its data */
1323 static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325 {
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327 }
1328
/* btrfs_item_end() for item number nr of eb */
1329 static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330 {
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332 }
1333
/* btrfs_item_offset() for item number nr of eb */
1334 static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335 {
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337 }
1338
/* btrfs_item_size() for item number nr of eb */
1339 static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340 {
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342 }
1343
/* read the key member of item number nr of eb into *disk_key */
1344 static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346 {
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349 }
1350
/* write *disk_key into the key member of item number nr of eb */
1351 static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353 {
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356 }
1357
/* struct btrfs_dir_log_item: 64 bit end accessor */
1358 BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/*
1361 * struct btrfs_root_ref
1362 */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367 /* struct btrfs_dir_item: extent-buffer accessors for the scalar fields */
1368 BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371 BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
/* read the location member of the dir item from eb into *key */
1373 static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376 {
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378 }
1379
/* write *key into the location member of the dir item in eb */
1380 static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383 {
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385 }
1386
1387 /* struct btrfs_disk_key: struct-pointer accessors with endian conversion */
1388 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390 BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391 BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
/*
 * Fill *cpu from *disk: objectid and offset are converted from little
 * endian, the one byte type is copied as is.
 */
1393 static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395 {
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399 }
1400
/*
 * Fill *disk from *cpu: objectid and offset are converted to little
 * endian, the one byte type is copied as is.
 */
1401 static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403 {
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407 }
1408
/* fetch the key of slot nr of node eb and convert it to cpu order in *key */
1409 static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411 {
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415 }
1416
/* fetch the key of item nr of leaf eb and convert it to cpu order in *key */
1417 static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419 {
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423 }
1424
/* fetch the location key of a dir item and convert it to cpu order in *key */
1425 static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428 {
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432 }
1433
1434
/* return the type field of a cpu-order key */
1435 static inline u8 btrfs_key_type(struct btrfs_key *key)
1436 {
1437 return key->type;
1438 }
1439
/* set the type field of a cpu-order key */
1440 static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441 {
1442 key->type = val;
1443 }
1444
1445 /*
 * struct btrfs_header: accessors generated by BTRFS_SETGET_HEADER_FUNCS,
 * which read and write the header through eb->first_page
 */
1446 BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447 BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449 BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450 BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451 BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452 BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
/* return 1 when every bit of 'flag' is set in the header flags, else 0 */
1454 static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455 {
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457 }
1458
/*
 * OR 'flag' into the header flags. Returns 1 when all bits of 'flag' were
 * already set before the call, else 0. The read and the write are two
 * separate accesses.
 */
1459 static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460 {
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464 }
1465
/*
 * Clear the bits of 'flag' in the header flags. Returns 1 when all bits of
 * 'flag' were set before the call, else 0. The read and the write are two
 * separate accesses.
 */
1466 static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467 {
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471 }
1472
/*
 * Return the byte offset of fsid inside struct btrfs_header, cast to a
 * pointer. The value is an offset, not an address; eb is not used.
 */
1473 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474 {
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477 }
1478
/*
 * Return the byte offset of chunk_tree_uuid inside struct btrfs_header,
 * cast to a pointer. The value is an offset, not an address; eb is not used.
 */
1479 static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480 {
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483 }
1484
/*
 * Return the byte offset of fsid inside struct btrfs_super_block, cast to a
 * pointer. The value is an offset, not an address; eb is not used.
 */
1485 static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486 {
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489 }
1490
/*
 * Return the byte offset of csum inside struct btrfs_header, cast to a
 * pointer. The value is an offset, not an address; eb is not used.
 */
1491 static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492 {
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495 }
1496
/* always returns NULL; eb is not used */
1497 static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1498 {
1499 return NULL;
1500 }
1501
/* always returns NULL; eb is not used */
1502 static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1503 {
1504 return NULL;
1505 }
1506
/* always returns NULL; eb is not used */
1507 static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1508 {
1509 return NULL;
1510 }
1511
/* a buffer is a leaf when its header level is 0 */
1512 static inline int btrfs_is_leaf(struct extent_buffer *eb)
1513 {
1514 return btrfs_header_level(eb) == 0;
1515 }
1516
1517 /*
 * struct btrfs_root_item: the disk_ prefixed names take an extent buffer,
 * the plain names are the struct-pointer (STACK) variants
 */
1518 BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520 BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521 BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522 BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524 BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526 BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527 BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528 BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529 BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530 BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531 BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
1536 /*
 * struct btrfs_super_block: struct-pointer (STACK) accessors only. Some
 * accessor names are shorter than the member they wrap (sys_array_size ->
 * sys_chunk_array_size, root_dir -> root_dir_objectid).
 */
1537
1538 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543 BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545 BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549 BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551 BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553 BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555 BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563 BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565 BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567 BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569 BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571 BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573 BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
/*
 * Accessors for the read-only-compatible feature flags of the super block.
 * They must wrap the compat_ro_flags member; wrapping compat_flags would
 * make them alias btrfs_super_compat_flags()/btrfs_set_super_compat_flags().
 */
1577 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_ro_flags, 64);
/*
 * struct btrfs_super_block: incompat feature flags and the 16 bit checksum
 * type (the latter is used by btrfs_super_csum_size())
 */
1579 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
/*
 * Look up the checksum size for the super block's csum_type in
 * btrfs_csum_sizes[]. A type beyond the end of that table trips BUG_ON.
 */
1584 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585 {
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589 }
1590
/*
 * Byte offset of the items member inside struct btrfs_leaf. The argument
 * is not used. btrfs_item_ptr() adds item offsets to this value.
 */
1591 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592 {
1593 return offsetof(struct btrfs_leaf, items);
1594 }
1595
1596 /* struct btrfs_file_extent_item: 8 bit type accessor */
1597 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
/*
 * Return e plus the offset of the disk_bytenr member, as an integer.
 * Presumably inline file data is stored from that position on (the same
 * offset is the header size used by btrfs_file_extent_calc_inline_size());
 * confirm against the callers.
 */
1599 static inline unsigned long
1600 btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601 {
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605 }
1606
/*
 * Item size needed for datasize bytes of inline data: the part of struct
 * btrfs_file_extent_item that precedes disk_bytenr, plus datasize.
 */
1607 static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608 {
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610 }
1611
/* struct btrfs_file_extent_item: remaining extent-buffer accessors */
1612 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616 BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618 BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622 BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624 BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626 BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628 BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631 /* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size. The value is
 * the item's 64 bit ram_bytes field narrowed to the u32 return type.
1633 */
1634 static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636 {
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638 }
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
1652
/* return the btrfs_root stored in the super block's s_fs_info pointer */
1653 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654 {
1655 return sb->s_fs_info;
1656 }
1657
/*
 * Replace root->name with a freshly kmalloc'ed, NUL terminated copy of the
 * first len bytes of name. Returns 0 on success or -ENOMEM. The previous
 * name is freed before the allocation, so after a failure root->name is
 * NULL and the old name is gone. Allocates with GFP_KERNEL.
 */
1658 static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660 {
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672 }
1673
/* block size for a tree level: leafsize for level 0, nodesize otherwise */
1674 static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675 {
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679 }
1680
1681 /* helper function to cast into the data area of the leaf. The result is
 * btrfs_leaf_data() plus the item's offset field, cast to 'type *'; it is a
 * byte offset for the extent buffer accessors, not an address. */
1682 #define btrfs_item_ptr(leaf, slot, type) \
1683 ((type *)(btrfs_leaf_data(leaf) + \
1684 btrfs_item_offset_nr(leaf, slot)))
1685
/* same computation as btrfs_item_ptr(), returned as an unsigned long */
1686 #define btrfs_item_ptr_offset(leaf, slot) \
1687 ((unsigned long)(btrfs_leaf_data(leaf) + \
1688 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-tree.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
/*
 * RHEL and EL kernels have a patch that renames PG_checked to FsMisc.
 * When only the FsMisc page flag helpers are defined, alias the Checked
 * names onto them so the Checked spelling can still be used.
 */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
#define ClearPageChecked ClearPageFsMisc
#define SetPageChecked SetPageFsMisc
#define PageChecked PageFsMisc
#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98 * FIXME: at some point we should handle xattr's that are larger than
99 * what we can fit in our leaf. We set location to NULL b/c we arent
100 * pointing at anything else, that will change if we store the xattr
101 * data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* MARKER */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return 0;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio; /* the bio being tracked (set by btrfs_bio_wq_end_io) */
53 bio_end_io_t *end_io; /* bio->bi_end_io as it was before we hooked the bio */
54 void *private; /* bio->bi_private as it was before we hooked the bio */
55 struct btrfs_fs_info *info; /* filesystem whose endio worker pools are used */
56 int error; /* err value handed to end_workqueue_bio() */
57 int metadata; /* non-zero: queue on the endio_meta* worker pools */
58 struct list_head list; /* not touched by the code visible in this chunk */
59 struct btrfs_work work; /* work item; func is set to end_workqueue_fn */
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode; /* inode passed through to both hooks */
69 struct bio *bio; /* bio passed through to both hooks */
70 struct list_head list; /* not touched by the code visible in this chunk */
71 extent_submit_bio_hook_t *submit_bio_start; /* called from work.func (run_one_async_start) */
72 extent_submit_bio_hook_t *submit_bio_done; /* called from work.ordered_func (run_one_async_done) */
73 int rw; /* rw argument forwarded to the hooks */
74 int mirror_num; /* mirror number forwarded to the hooks */
75 unsigned long bio_flags; /* bio flags forwarded to the hooks */
76 struct btrfs_work work; /* work item queued on fs_info->workers */
77};
78
79/*
80 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state)
353{
354 struct extent_io_tree *tree;
355 u64 found_start;
356 int found_level;
357 unsigned long len;
358 struct extent_buffer *eb;
359 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
360 int ret = 0;
361
362 tree = &BTRFS_I(page->mapping->host)->io_tree;
363 if (page->private == EXTENT_PAGE_PRIVATE)
364 goto out;
365 if (!page->private)
366 goto out;
367
368 len = page->private >> 2;
369 WARN_ON(len == 0);
370
371 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
372
373 found_start = btrfs_header_bytenr(eb);
374 if (found_start != start) {
375 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
376 (unsigned long long)found_start,
377 (unsigned long long)eb->start);
378 ret = -EIO;
379 goto err;
380 }
381 if (eb->first_page != page) {
382 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
383 eb->first_page->index, page->index);
384 WARN_ON(1);
385 ret = -EIO;
386 goto err;
387 }
388 if (check_tree_block_fsid(root, eb)) {
389 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
390 (unsigned long long)eb->start);
391 ret = -EIO;
392 goto err;
393 }
394 found_level = btrfs_header_level(eb);
395
396 ret = csum_tree_block(root, eb, 1);
397 if (ret)
398 ret = -EIO;
399
400 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
401 end = eb->start + end - 1;
402err:
403 free_extent_buffer(eb);
404out:
405 return ret;
406}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
510int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
511 int rw, struct bio *bio, int mirror_num,
512 unsigned long bio_flags,
513 extent_submit_bio_hook_t *submit_bio_start,
514 extent_submit_bio_hook_t *submit_bio_done)
515{
516 struct async_submit_bio *async;
517
518 async = kmalloc(sizeof(*async), GFP_NOFS);
519 if (!async)
520 return -ENOMEM;
521
522 async->inode = inode;
523 async->rw = rw;
524 async->bio = bio;
525 async->mirror_num = mirror_num;
526 async->submit_bio_start = submit_bio_start;
527 async->submit_bio_done = submit_bio_done;
528
529 async->work.func = run_one_async_start;
530 async->work.ordered_func = run_one_async_done;
531 async->work.ordered_free = run_one_async_free;
532
533 async->work.flags = 0;
534 async->bio_flags = bio_flags;
535
536 atomic_inc(&fs_info->nr_async_submits);
537 btrfs_queue_worker(&fs_info->workers, &async->work);
538#if 0
539 int limit = btrfs_async_submit_limit(fs_info);
540 if (atomic_read(&fs_info->nr_async_submits) > limit) {
541 wait_event_timeout(fs_info->async_submit_wait,
542 (atomic_read(&fs_info->nr_async_submits) < limit),
543 HZ/10);
544
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_bios) < limit),
547 HZ/10);
548 }
549#endif
550 while (atomic_read(&fs_info->async_submit_draining) &&
551 atomic_read(&fs_info->nr_async_submits)) {
552 wait_event(fs_info->async_submit_wait,
553 (atomic_read(&fs_info->nr_async_submits) == 0));
554 }
555
556 return 0;
557}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
/*
 * submit_bio_start hook for btree writes (see btree_submit_bio_hook):
 * checksum every tree block page in @bio.  The bio is not submitted here;
 * that is left to __btree_submit_bio_done.
 *
 * Only @bio is used.  Always returns 0.
 */
static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	/* btree_csum_one_bio() has no failure return, it always yields 0 */
	return btree_csum_one_bio(bio);
}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
/*
 * NOTE(review): compiled-out, buffer_head based variant of
 * btree_writepage().  The live implementation earlier in this file goes
 * through extent_write_full_page().  This version passes a buffer_head to
 * csum_tree_block(), which takes an extent_buffer in this file, so it
 * would not build if re-enabled as is.
 */
706#if 0
707static int btree_writepage(struct page *page, struct writeback_control *wbc)
708{
709 struct buffer_head *bh;
710 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
711 struct buffer_head *head;
712 if (!page_has_buffers(page)) {
713 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
714 (1 << BH_Dirty)|(1 << BH_Uptodate));
715 }
716 head = page_buffers(page);
717 bh = head;
718 do {
719 if (buffer_dirty(bh))
720 csum_tree_block(root, bh, 0);
721 bh = bh->b_this_page;
722 } while (bh != head);
723 return block_write_full_page(page, btree_get_block, wbc);
724}
725#endif
726
/*
 * address_space operations table built from the btree_* page helpers
 * defined above in this file; only .sync_page uses the generic
 * block_sync_page.
 */
727static struct address_space_operations btree_aops = {
728 .readpage = btree_readpage,
729 .writepage = btree_writepage,
730 .writepages = btree_writepages,
731 .releasepage = btree_releasepage,
732 .invalidatepage = btree_invalidatepage,
733 .sync_page = block_sync_page,
734};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
823static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
824 u32 stripesize, struct btrfs_root *root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid)
827{
828 root->node = NULL;
829 root->commit_root = NULL;
830 root->ref_tree = NULL;
831 root->sectorsize = sectorsize;
832 root->nodesize = nodesize;
833 root->leafsize = leafsize;
834 root->stripesize = stripesize;
835 root->ref_cows = 0;
836 root->track_dirty = 0;
837
838 root->fs_info = fs_info;
839 root->objectid = objectid;
840 root->last_trans = 0;
841 root->highest_inode = 0;
842 root->last_inode_alloc = 0;
843 root->name = NULL;
844 root->in_sysfs = 0;
845
846 INIT_LIST_HEAD(&root->dirty_list);
847 INIT_LIST_HEAD(&root->orphan_list);
848 INIT_LIST_HEAD(&root->dead_list);
849 spin_lock_init(&root->node_lock);
850 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex);
853 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS);
855
856 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
857 root->ref_tree = &root->ref_tree_struct;
858
859 memset(&root->root_key, 0, sizeof(root->root_key));
860 memset(&root->root_item, 0, sizeof(root->root_item));
861 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
862 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
863 root->defrag_trans_start = fs_info->generation;
864 init_completion(&root->kobj_unregister);
865 root->defrag_running = 0;
866 root->defrag_level = 0;
867 root->root_key.objectid = objectid;
868 root->anon_super.s_root = NULL;
869 root->anon_super.s_dev = 0;
870 INIT_LIST_HEAD(&root->anon_super.s_list);
871 INIT_LIST_HEAD(&root->anon_super.s_instances);
872 init_rwsem(&root->anon_super.s_umount);
873
874 return 0;
875}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box, and it is only used when page
1162 * is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
1182static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1183{
1184 struct inode *inode;
1185 struct extent_map_tree *em_tree;
1186 struct extent_map *em;
1187 struct address_space *mapping;
1188 u64 offset;
1189
1190 /* the generic O_DIRECT read code does this */
1191 if (1 || !page) {
1192 __unplug_io_fn(bdi, page);
1193 return;
1194 }
1195
1196 /*
1197 * page->mapping may change at any time. Get a consistent copy
1198 * and use that for everything below
1199 */
1200 smp_mb();
1201 mapping = page->mapping;
1202 if (!mapping)
1203 return;
1204
1205 inode = mapping->host;
1206
1207 /*
1208 * don't do the expensive searching for a small number of
1209 * devices
1210 */
1211 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 offset = page_offset(page);
1217
1218 em_tree = &BTRFS_I(inode)->extent_tree;
1219 spin_lock(&em_tree->lock);
1220 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1221 spin_unlock(&em_tree->lock);
1222 if (!em) {
1223 __unplug_io_fn(bdi, page);
1224 return;
1225 }
1226
1227 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 free_extent_map(em);
1229 __unplug_io_fn(bdi, page);
1230 return;
1231 }
1232 offset = offset - em->start;
1233 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1234 em->block_start + offset, page);
1235 free_extent_map(em);
1236}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285 if (ret == 1)
1286 return ret;
1287 return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
1323static int cleaner_kthread(void *arg)
1324{
1325 struct btrfs_root *root = arg;
1326
1327 do {
1328 smp_mb();
1329 if (root->fs_info->closing)
1330 break;
1331
1332 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1333 mutex_lock(&root->fs_info->cleaner_mutex);
1334 btrfs_clean_old_snapshots(root);
1335 mutex_unlock(&root->fs_info->cleaner_mutex);
1336
1337 if (freezing(current)) {
1338 refrigerator();
1339 } else {
1340 smp_mb();
1341 if (root->fs_info->closing)
1342 break;
1343 set_current_state(TASK_INTERRUPTIBLE);
1344 schedule();
1345 __set_current_state(TASK_RUNNING);
1346 }
1347 } while (!kthread_should_stop());
1348 return 0;
1349}
1350
1351static int transaction_kthread(void *arg)
1352{
1353 struct btrfs_root *root = arg;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_transaction *cur;
1356 unsigned long now;
1357 unsigned long delay;
1358 int ret;
1359
1360 do {
1361 smp_mb();
1362 if (root->fs_info->closing)
1363 break;
1364
1365 delay = HZ * 30;
1366 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1367 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1368
1369 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1370 printk(KERN_INFO "btrfs: total reference cache "
1371 "size %llu\n",
1372 root->fs_info->total_ref_cache_size);
1373 }
1374
1375 mutex_lock(&root->fs_info->trans_mutex);
1376 cur = root->fs_info->running_transaction;
1377 if (!cur) {
1378 mutex_unlock(&root->fs_info->trans_mutex);
1379 goto sleep;
1380 }
1381
1382 now = get_seconds();
1383 if (now < cur->start_time || now - cur->start_time < 30) {
1384 mutex_unlock(&root->fs_info->trans_mutex);
1385 delay = HZ * 5;
1386 goto sleep;
1387 }
1388 mutex_unlock(&root->fs_info->trans_mutex);
1389 trans = btrfs_start_transaction(root, 1);
1390 ret = btrfs_commit_transaction(trans, root);
1391sleep:
1392 wake_up_process(root->fs_info->cleaner_kthread);
1393 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1394
1395 if (freezing(current)) {
1396 refrigerator();
1397 } else {
1398 if (root->fs_info->closing)
1399 break;
1400 set_current_state(TASK_INTERRUPTIBLE);
1401 schedule_timeout(delay);
1402 __set_current_state(TASK_RUNNING);
1403 }
1404 } while (!kthread_should_stop());
1405 return 0;
1406}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489 * we set the i_size on the btree inode to the max possible int.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590 * queue work function gets called at interrupt time, and so it
1591 * cannot dynamically grow.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604 * likely that bios will be send down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857 /* note, we dont' set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877 * So, we need to add a special mount option to scan for
1878 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
/*
 * Thin wrapper around write_all_supers(); @trans is accepted but not used.
 * Returns whatever write_all_supers() returns.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	return write_all_supers(root, max_mirrors);
}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279 * this code, they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
11 parent_objectid) / 4)
12#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
33#include "compat.h"
34
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
39struct pending_extent_op {
40 int type;
41 u64 bytenr;
42 u64 num_bytes;
43 u64 parent;
44 u64 orig_parent;
45 u64 generation;
46 u64 orig_generation;
47 int level;
48 struct list_head list;
49 int del;
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains teh given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references held by the parent node (always 1 for tree blocks)
441 *
442 * Btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is being deleted, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of parent extent. If an extent is a tree root, the key offset is set
491 * to the key objectid.
492 */
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
536 * updates all the backrefs that are pending on update_list for the
537 * extent_root
 *
 * For each pending_extent_op on update_list the existing backref item keyed
 * (op->bytenr, BTRFS_EXTENT_REF_KEY, op->orig_parent) is re-keyed to
 * op->parent and its generation is set to op->generation.  The op is then
 * unlinked from the list, its byte range is unlocked in info->extent_ins and
 * the op is freed.
 *
 * Any lookup miss or root/generation/objectid mismatch is fatal (BUG), so
 * the function always returns 0.
 *
 * NOTE(review): the first op is taken from update_list->next without an
 * emptiness check, so callers presumably guarantee a non-empty list.
538 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
/* (re)position the path on the backref of the current op */
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
/* both an error and a "not found" result are fatal here */
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
/* entered with path->slots[0] on the backref item for the current op */
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
/* the stored backref must match what the op recorded as its original state */
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
/* move the item to its new key (new parent) and stamp the new generation */
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
/* advance before the current op is unlinked and freed */
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
/* wrapped around to the list head: every op has been handled */
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
/*
 * Scan forward in the same leaf for a backref of the next op so that a
 * fresh tree search can be skipped.  Only objectid and type are compared
 * here; the key offset is not checked against op->orig_parent.
 */
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
/* nothing usable in this leaf: flush it and search from the root again */
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
623
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
674 if (last && ret > 1)
675 BUG();
676
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682 * if the first item we inserted was a backref, then
683 * the EXTENT_ITEM will be the odd c's, else it will
684 * be the even c's
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708 * using del to see when its ok to free up the
709 * pending_extent_op. In the case where we insert the
710 * last item on the list in order to help do batching
711 * we need to not free the extent op until we actually
712 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731 * Ok backref's and items usually go right next to eachother,
732 * but if we could only insert 1 item that means that we
733 * inserted on the end of a leaf, and we have no idea what may
734 * be on the next leaf so we just play it safe. In order to
735 * try and help this case we insert the last thing on our
736 * insert list so hopefully it will end up being the last
737 * thing on the leaf and everything else will be before it,
738 * which will let us insert a whole bunch of items at the same
739 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763 * decrement total so we dont overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
#ifdef BIO_RW_DISCARD
/*
 * Hand a byte range on @bdev to blkdev_issue_discard().  @start and @len
 * are shifted right by 9 (i.e. divided by 512) before being passed on.
 */
static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	u64 first = start >> 9;
	u64 count = len >> 9;

	blkdev_issue_discard(bdev, first, count, GFP_KERNEL);
}
#endif
875
876static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
877 u64 num_bytes)
878{
879#ifdef BIO_RW_DISCARD
880 int ret;
881 u64 map_length = num_bytes;
882 struct btrfs_multi_bio *multi = NULL;
883
884 /* Tell the block device(s) that the sectors can be discarded */
885 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
886 bytenr, &map_length, &multi, 0);
887 if (!ret) {
888 struct btrfs_bio_stripe *stripe = multi->stripes;
889 int i;
890
891 if (map_length > num_bytes)
892 map_length = num_bytes;
893
894 for (i = 0; i < multi->num_stripes; i++, stripe++) {
895 btrfs_issue_discard(stripe->dev->bdev,
896 stripe->physical,
897 map_length);
898 }
899 kfree(multi);
900 }
901
902 return ret;
903#else
904 return 0;
905#endif
906}
907
/*
 * Process every pending delete on del_list against the extent tree.
 *
 * For the op at the head of the list the backref is looked up and the ref
 * count stored in the matching extent item is decremented.  When the count
 * drops to zero the extent item (and the backref, if it sits in the slot
 * right after it) is deleted, the bytes are pinned, the super/root "used"
 * counters are reduced and the block group is updated.  Following ops whose
 * extent item + backref pair sit directly behind in the same leaf and have a
 * ref count of exactly 1 are deleted in the same btrfs_del_items() call.
 *
 * Each handled op is unlinked, its range is unlocked in info->extent_ins and
 * the op is freed.
 *
 * Returns -ENOMEM if no path can be allocated, or the error from
 * lookup_extent_backref() if a backref cannot be found.  In the latter case
 * the remaining ops are left on del_list.  Every other failing call is
 * guarded by BUG_ON, so the value returned after the list is drained is 0.
 */
908static noinline int free_extents(struct btrfs_trans_handle *trans,
909 struct btrfs_root *extent_root,
910 struct list_head *del_list)
911{
912 struct btrfs_fs_info *info = extent_root->fs_info;
913 struct btrfs_path *path;
914 struct btrfs_key key, found_key;
915 struct extent_buffer *leaf;
916 struct list_head *cur;
917 struct pending_extent_op *op;
918 struct btrfs_extent_item *ei;
919 int ret, num_to_del, extent_slot = 0, found_extent = 0;
920 u32 refs;
921 u64 bytes_freed = 0;
922
923 path = btrfs_alloc_path();
924 if (!path)
925 return -ENOMEM;
926 path->reada = 1;
927
928search:
929 /* search for the backref for the current ref we want to delete */
930 cur = del_list->next;
931 op = list_entry(cur, struct pending_extent_op, list);
932 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
933 op->orig_parent,
934 extent_root->root_key.objectid,
935 op->orig_generation, op->level, 1);
936 if (ret) {
937 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
938 "root %llu gen %llu owner %u\n",
939 (unsigned long long)op->bytenr,
940 (unsigned long long)extent_root->root_key.objectid,
941 (unsigned long long)op->orig_generation, op->level);
942 btrfs_print_leaf(extent_root, path->nodes[0]);
943 WARN_ON(1);
944 goto out;
945 }
946
947 extent_slot = path->slots[0];
948 num_to_del = 1;
949 found_extent = 0;
950
951 /*
952 * if we aren't the first item on the leaf we can move back one and see
953 * if our ref is right next to our extent item
954 */
955 if (likely(extent_slot)) {
956 extent_slot--;
957 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
958 extent_slot);
959 if (found_key.objectid == op->bytenr &&
960 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
961 found_key.offset == op->num_bytes) {
962 num_to_del++;
963 found_extent = 1;
964 }
965 }
966
967 /*
968 * if we didn't find the extent we need to delete the backref and then
969 * search for the extent item key so we can update its ref count
970 */
971 if (!found_extent) {
972 key.objectid = op->bytenr;
973 key.type = BTRFS_EXTENT_ITEM_KEY;
974 key.offset = op->num_bytes;
975
976 ret = remove_extent_backref(trans, extent_root, path);
977 BUG_ON(ret);
978 btrfs_release_path(extent_root, path);
979 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
/* the extent item must exist: an error or a miss is fatal */
980 BUG_ON(ret);
981 extent_slot = path->slots[0];
982 }
983
984 /* this is where we update the ref count for the extent */
985 leaf = path->nodes[0];
986 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
987 refs = btrfs_extent_refs(leaf, ei);
988 BUG_ON(refs == 0);
989 refs--;
990 btrfs_set_extent_refs(leaf, ei, refs);
991
992 btrfs_mark_buffer_dirty(leaf);
993
994 /*
995 * This extent needs deleting. The reason cur_slot is extent_slot +
996 * num_to_del is because extent_slot points to the slot where the extent
997 * is, and if the backref was not right next to the extent we will be
998 * deleting at least 1 item, and will want to start searching at the
999 * slot directly next to extent_slot. However if we did find the
1000 * backref next to the extent item then we will be deleting at least 2
1001 * items and will want to start searching directly after the ref slot
1002 */
1003 if (!refs) {
1004 struct list_head *pos, *n, *end;
1005 int cur_slot = extent_slot+num_to_del;
1006 u64 super_used;
1007 u64 root_used;
1008
1009 path->slots[0] = extent_slot;
1010 bytes_freed = op->num_bytes;
1011
1012 mutex_lock(&info->pinned_mutex);
1013 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1014 op->num_bytes, op->level >=
1015 BTRFS_FIRST_FREE_OBJECTID);
1016 mutex_unlock(&info->pinned_mutex);
1017 BUG_ON(ret < 0);
/* remember the pin_down_bytes() result for update_block_group() below */
1018 op->del = ret;
1019
1020 /*
1021 * we need to see if we can delete multiple things at once, so
1022 * start looping through the list of extents we are wanting to
1023 * delete and see if their extent/backrefs are right next to
1024 * each other and the extents only have 1 ref
1025 */
1026 for (pos = cur->next; pos != del_list; pos = pos->next) {
1027 struct pending_extent_op *tmp;
1028
1029 tmp = list_entry(pos, struct pending_extent_op, list);
1030
1031 /* we only want to delete extent+ref at this stage */
1032 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1033 break;
1034
1035 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1036 if (found_key.objectid != tmp->bytenr ||
1037 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1038 found_key.offset != tmp->num_bytes)
1039 break;
1040
1041 /* check to make sure this extent only has one ref */
1042 ei = btrfs_item_ptr(leaf, cur_slot,
1043 struct btrfs_extent_item);
1044 if (btrfs_extent_refs(leaf, ei) != 1)
1045 break;
1046
1047 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1048 if (found_key.objectid != tmp->bytenr ||
1049 found_key.type != BTRFS_EXTENT_REF_KEY ||
1050 found_key.offset != tmp->orig_parent)
1051 break;
1052
1053 /*
1054 * the ref is right next to the extent, we can set the
1055 * ref count to 0 since we will delete them both now
1056 */
1057 btrfs_set_extent_refs(leaf, ei, 0);
1058
1059 /* pin down the bytes for this extent */
1060 mutex_lock(&info->pinned_mutex);
1061 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1062 tmp->num_bytes, tmp->level >=
1063 BTRFS_FIRST_FREE_OBJECTID);
1064 mutex_unlock(&info->pinned_mutex);
1065 BUG_ON(ret < 0);
1066
1067 /*
1068 * use the del field to tell if we need to go ahead and
1069 * free up the extent when we delete the item or not.
1070 */
1071 tmp->del = ret;
1072 bytes_freed += tmp->num_bytes;
1073
1074 num_to_del += 2;
1075 cur_slot += 2;
1076 }
/* first list entry that was NOT folded into this batch */
1077 end = pos;
1078
1079 /* update the free space counters */
1080 spin_lock(&info->delalloc_lock);
1081 super_used = btrfs_super_bytes_used(&info->super_copy);
1082 btrfs_set_super_bytes_used(&info->super_copy,
1083 super_used - bytes_freed);
1084
1085 root_used = btrfs_root_used(&extent_root->root_item);
1086 btrfs_set_root_used(&extent_root->root_item,
1087 root_used - bytes_freed);
1088 spin_unlock(&info->delalloc_lock);
1089
1090 /* delete the items */
1091 ret = btrfs_del_items(trans, extent_root, path,
1092 path->slots[0], num_to_del);
1093 BUG_ON(ret);
1094
1095 /*
1096 * loop through the extents we deleted and do the cleanup work
1097 * on them
1098 */
1099 for (pos = cur, n = pos->next; pos != end;
1100 pos = n, n = pos->next) {
1101 struct pending_extent_op *tmp;
1102 tmp = list_entry(pos, struct pending_extent_op, list);
1103
1104 /*
1105 * remember tmp->del tells us whether or not we pinned
1106 * down the extent
1107 */
1108 ret = update_block_group(trans, extent_root,
1109 tmp->bytenr, tmp->num_bytes, 0,
1110 tmp->del);
1111 BUG_ON(ret);
1112
1113 list_del_init(&tmp->list);
1114 unlock_extent(&info->extent_ins, tmp->bytenr,
1115 tmp->bytenr + tmp->num_bytes - 1,
1116 GFP_NOFS);
1117 kfree(tmp);
1118 }
1119 } else if (refs && found_extent) {
1120 /*
1121 * the ref and extent were right next to each other, but the
1122 * extent still has a ref, so just free the backref and keep
1123 * going
1124 */
1125 ret = remove_extent_backref(trans, extent_root, path);
1126 BUG_ON(ret);
1127
1128 list_del_init(&op->list);
1129 unlock_extent(&info->extent_ins, op->bytenr,
1130 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1131 kfree(op);
1132 } else {
1133 /*
1134 * the extent has multiple refs and the backref we were looking
1135 * for was not right next to it, so just unlock and go next,
1136 * we're good to go
1137 */
1138 list_del_init(&op->list);
1139 unlock_extent(&info->extent_ins, op->bytenr,
1140 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1141 kfree(op);
1142 }
1143
1144 btrfs_release_path(extent_root, path);
1145 if (!list_empty(del_list))
1146 goto search;
1147
1148out:
1149 btrfs_free_path(path);
1150 return ret;
1151}
1152
1153static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1154 struct btrfs_root *root, u64 bytenr,
1155 u64 orig_parent, u64 parent,
1156 u64 orig_root, u64 ref_root,
1157 u64 orig_generation, u64 ref_generation,
1158 u64 owner_objectid)
1159{
1160 int ret;
1161 struct btrfs_root *extent_root = root->fs_info->extent_root;
1162 struct btrfs_path *path;
1163
1164 if (root == root->fs_info->extent_root) {
1165 struct pending_extent_op *extent_op;
1166 u64 num_bytes;
1167
1168 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1169 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1170 mutex_lock(&root->fs_info->extent_ins_mutex);
1171 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1172 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1173 u64 priv;
1174 ret = get_state_private(&root->fs_info->extent_ins,
1175 bytenr, &priv);
1176 BUG_ON(ret);
1177 extent_op = (struct pending_extent_op *)
1178 (unsigned long)priv;
1179 BUG_ON(extent_op->parent != orig_parent);
1180 BUG_ON(extent_op->generation != orig_generation);
1181
1182 extent_op->parent = parent;
1183 extent_op->generation = ref_generation;
1184 } else {
1185 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1186 BUG_ON(!extent_op);
1187
1188 extent_op->type = PENDING_BACKREF_UPDATE;
1189 extent_op->bytenr = bytenr;
1190 extent_op->num_bytes = num_bytes;
1191 extent_op->parent = parent;
1192 extent_op->orig_parent = orig_parent;
1193 extent_op->generation = ref_generation;
1194 extent_op->orig_generation = orig_generation;
1195 extent_op->level = (int)owner_objectid;
1196 INIT_LIST_HEAD(&extent_op->list);
1197 extent_op->del = 0;
1198
1199 set_extent_bits(&root->fs_info->extent_ins,
1200 bytenr, bytenr + num_bytes - 1,
1201 EXTENT_WRITEBACK, GFP_NOFS);
1202 set_state_private(&root->fs_info->extent_ins,
1203 bytenr, (unsigned long)extent_op);
1204 }
1205 mutex_unlock(&root->fs_info->extent_ins_mutex);
1206 return 0;
1207 }
1208
1209 path = btrfs_alloc_path();
1210 if (!path)
1211 return -ENOMEM;
1212 ret = lookup_extent_backref(trans, extent_root, path,
1213 bytenr, orig_parent, orig_root,
1214 orig_generation, owner_objectid, 1);
1215 if (ret)
1216 goto out;
1217 ret = remove_extent_backref(trans, extent_root, path);
1218 if (ret)
1219 goto out;
1220 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1221 parent, ref_root, ref_generation,
1222 owner_objectid);
1223 BUG_ON(ret);
1224 finish_current_insert(trans, extent_root, 0);
1225 del_pending_extents(trans, extent_root, 0);
1226out:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
1231int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1232 struct btrfs_root *root, u64 bytenr,
1233 u64 orig_parent, u64 parent,
1234 u64 ref_root, u64 ref_generation,
1235 u64 owner_objectid)
1236{
1237 int ret;
1238 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1239 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1240 return 0;
1241 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1242 parent, ref_root, ref_root,
1243 ref_generation, ref_generation,
1244 owner_objectid);
1245 return ret;
1246}
1247
1248static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1249 struct btrfs_root *root, u64 bytenr,
1250 u64 orig_parent, u64 parent,
1251 u64 orig_root, u64 ref_root,
1252 u64 orig_generation, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 struct btrfs_path *path;
1256 int ret;
1257 struct btrfs_key key;
1258 struct extent_buffer *l;
1259 struct btrfs_extent_item *item;
1260 u32 refs;
1261
1262 path = btrfs_alloc_path();
1263 if (!path)
1264 return -ENOMEM;
1265
1266 path->reada = 1;
1267 key.objectid = bytenr;
1268 key.type = BTRFS_EXTENT_ITEM_KEY;
1269 key.offset = (u64)-1;
1270
1271 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1272 0, 1);
1273 if (ret < 0)
1274 return ret;
1275 BUG_ON(ret == 0 || path->slots[0] == 0);
1276
1277 path->slots[0]--;
1278 l = path->nodes[0];
1279
1280 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1281 if (key.objectid != bytenr) {
1282 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1283 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1284 (unsigned long long)bytenr,
1285 (unsigned long long)key.objectid);
1286 BUG();
1287 }
1288 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1289
1290 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1291 refs = btrfs_extent_refs(l, item);
1292 btrfs_set_extent_refs(l, item, refs + 1);
1293 btrfs_mark_buffer_dirty(path->nodes[0]);
1294
1295 btrfs_release_path(root->fs_info->extent_root, path);
1296
1297 path->reada = 1;
1298 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1299 path, bytenr, parent,
1300 ref_root, ref_generation,
1301 owner_objectid);
1302 BUG_ON(ret);
1303 finish_current_insert(trans, root->fs_info->extent_root, 0);
1304 del_pending_extents(trans, root->fs_info->extent_root, 0);
1305
1306 btrfs_free_path(path);
1307 return 0;
1308}
1309
1310int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1311 struct btrfs_root *root,
1312 u64 bytenr, u64 num_bytes, u64 parent,
1313 u64 ref_root, u64 ref_generation,
1314 u64 owner_objectid)
1315{
1316 int ret;
1317 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1318 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1319 return 0;
1320 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1321 0, ref_root, 0, ref_generation,
1322 owner_objectid);
1323 return ret;
1324}
1325
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1);
1330 del_pending_extents(trans, root->fs_info->extent_root, 1);
1331 return 0;
1332}
1333
1334int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1335 struct btrfs_root *root, u64 bytenr,
1336 u64 num_bytes, u32 *refs)
1337{
1338 struct btrfs_path *path;
1339 int ret;
1340 struct btrfs_key key;
1341 struct extent_buffer *l;
1342 struct btrfs_extent_item *item;
1343
1344 WARN_ON(num_bytes < root->sectorsize);
1345 path = btrfs_alloc_path();
1346 path->reada = 1;
1347 key.objectid = bytenr;
1348 key.offset = num_bytes;
1349 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1350 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1351 0, 0);
1352 if (ret < 0)
1353 goto out;
1354 if (ret != 0) {
1355 btrfs_print_leaf(root, path->nodes[0]);
1356 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1357 (unsigned long long)bytenr);
1358 BUG();
1359 }
1360 l = path->nodes[0];
1361 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1362 *refs = btrfs_extent_refs(l, item);
1363out:
1364 btrfs_free_path(path);
1365 return 0;
1366}
1367
1368int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1369 struct btrfs_root *root, u64 objectid, u64 bytenr)
1370{
1371 struct btrfs_root *extent_root = root->fs_info->extent_root;
1372 struct btrfs_path *path;
1373 struct extent_buffer *leaf;
1374 struct btrfs_extent_ref *ref_item;
1375 struct btrfs_key key;
1376 struct btrfs_key found_key;
1377 u64 ref_root;
1378 u64 last_snapshot;
1379 u32 nritems;
1380 int ret;
1381
1382 key.objectid = bytenr;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385
1386 path = btrfs_alloc_path();
1387 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto out;
1390 BUG_ON(ret == 0);
1391
1392 ret = -ENOENT;
1393 if (path->slots[0] == 0)
1394 goto out;
1395
1396 path->slots[0]--;
1397 leaf = path->nodes[0];
1398 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1399
1400 if (found_key.objectid != bytenr ||
1401 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1402 goto out;
1403
1404 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1405 while (1) {
1406 leaf = path->nodes[0];
1407 nritems = btrfs_header_nritems(leaf);
1408 if (path->slots[0] >= nritems) {
1409 ret = btrfs_next_leaf(extent_root, path);
1410 if (ret < 0)
1411 goto out;
1412 if (ret == 0)
1413 continue;
1414 break;
1415 }
1416 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 if (found_key.objectid != bytenr)
1418 break;
1419
1420 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1421 path->slots[0]++;
1422 continue;
1423 }
1424
1425 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1426 struct btrfs_extent_ref);
1427 ref_root = btrfs_ref_root(leaf, ref_item);
1428 if ((ref_root != root->root_key.objectid &&
1429 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1430 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1431 ret = 1;
1432 goto out;
1433 }
1434 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1435 ret = 1;
1436 goto out;
1437 }
1438
1439 path->slots[0]++;
1440 }
1441 ret = 0;
1442out:
1443 btrfs_free_path(path);
1444 return ret;
1445}
1446
1447int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1448 struct extent_buffer *buf, u32 nr_extents)
1449{
1450 struct btrfs_key key;
1451 struct btrfs_file_extent_item *fi;
1452 u64 root_gen;
1453 u32 nritems;
1454 int i;
1455 int level;
1456 int ret = 0;
1457 int shared = 0;
1458
1459 if (!root->ref_cows)
1460 return 0;
1461
1462 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1463 shared = 0;
1464 root_gen = root->root_key.offset;
1465 } else {
1466 shared = 1;
1467 root_gen = trans->transid - 1;
1468 }
1469
1470 level = btrfs_header_level(buf);
1471 nritems = btrfs_header_nritems(buf);
1472
1473 if (level == 0) {
1474 struct btrfs_leaf_ref *ref;
1475 struct btrfs_extent_info *info;
1476
1477 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1478 if (!ref) {
1479 ret = -ENOMEM;
1480 goto out;
1481 }
1482
1483 ref->root_gen = root_gen;
1484 ref->bytenr = buf->start;
1485 ref->owner = btrfs_header_owner(buf);
1486 ref->generation = btrfs_header_generation(buf);
1487 ref->nritems = nr_extents;
1488 info = ref->extents;
1489
1490 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1491 u64 disk_bytenr;
1492 btrfs_item_key_to_cpu(buf, &key, i);
1493 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1494 continue;
1495 fi = btrfs_item_ptr(buf, i,
1496 struct btrfs_file_extent_item);
1497 if (btrfs_file_extent_type(buf, fi) ==
1498 BTRFS_FILE_EXTENT_INLINE)
1499 continue;
1500 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1501 if (disk_bytenr == 0)
1502 continue;
1503
1504 info->bytenr = disk_bytenr;
1505 info->num_bytes =
1506 btrfs_file_extent_disk_num_bytes(buf, fi);
1507 info->objectid = key.objectid;
1508 info->offset = key.offset;
1509 info++;
1510 }
1511
1512 ret = btrfs_add_leaf_ref(root, ref, shared);
1513 if (ret == -EEXIST && shared) {
1514 struct btrfs_leaf_ref *old;
1515 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1516 BUG_ON(!old);
1517 btrfs_remove_leaf_ref(root, old);
1518 btrfs_free_leaf_ref(root, old);
1519 ret = btrfs_add_leaf_ref(root, ref, shared);
1520 }
1521 WARN_ON(ret);
1522 btrfs_free_leaf_ref(root, ref);
1523 }
1524out:
1525 return ret;
1526}
1527
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1530 u32 *nr_extents)
1531{
1532 u64 bytenr;
1533 u64 ref_root;
1534 u64 orig_root;
1535 u64 ref_generation;
1536 u64 orig_generation;
1537 u32 nritems;
1538 u32 nr_file_extents = 0;
1539 struct btrfs_key key;
1540 struct btrfs_file_extent_item *fi;
1541 int i;
1542 int level;
1543 int ret = 0;
1544 int faili = 0;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64);
1547
1548 ref_root = btrfs_header_owner(buf);
1549 ref_generation = btrfs_header_generation(buf);
1550 orig_root = btrfs_header_owner(orig_buf);
1551 orig_generation = btrfs_header_generation(orig_buf);
1552
1553 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf);
1555
1556 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref;
1558 } else {
1559 if (level == 0 &&
1560 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1561 goto out;
1562 if (level != 0 &&
1563 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1564 goto out;
1565 process_func = __btrfs_update_extent_ref;
1566 }
1567
1568 for (i = 0; i < nritems; i++) {
1569 cond_resched();
1570 if (level == 0) {
1571 btrfs_item_key_to_cpu(buf, &key, i);
1572 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1573 continue;
1574 fi = btrfs_item_ptr(buf, i,
1575 struct btrfs_file_extent_item);
1576 if (btrfs_file_extent_type(buf, fi) ==
1577 BTRFS_FILE_EXTENT_INLINE)
1578 continue;
1579 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1580 if (bytenr == 0)
1581 continue;
1582
1583 nr_file_extents++;
1584
1585 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start,
1587 orig_root, ref_root,
1588 orig_generation, ref_generation,
1589 key.objectid);
1590
1591 if (ret) {
1592 faili = i;
1593 WARN_ON(1);
1594 goto fail;
1595 }
1596 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start,
1600 orig_root, ref_root,
1601 orig_generation, ref_generation,
1602 level - 1);
1603 if (ret) {
1604 faili = i;
1605 WARN_ON(1);
1606 goto fail;
1607 }
1608 }
1609 }
1610out:
1611 if (nr_extents) {
1612 if (level == 0)
1613 *nr_extents = nr_file_extents;
1614 else
1615 *nr_extents = nritems;
1616 }
1617 return 0;
1618fail:
1619 WARN_ON(1);
1620 return ret;
1621}
1622
1623int btrfs_update_ref(struct btrfs_trans_handle *trans,
1624 struct btrfs_root *root, struct extent_buffer *orig_buf,
1625 struct extent_buffer *buf, int start_slot, int nr)
1626
1627{
1628 u64 bytenr;
1629 u64 ref_root;
1630 u64 orig_root;
1631 u64 ref_generation;
1632 u64 orig_generation;
1633 struct btrfs_key key;
1634 struct btrfs_file_extent_item *fi;
1635 int i;
1636 int ret;
1637 int slot;
1638 int level;
1639
1640 BUG_ON(start_slot < 0);
1641 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1642
1643 ref_root = btrfs_header_owner(buf);
1644 ref_generation = btrfs_header_generation(buf);
1645 orig_root = btrfs_header_owner(orig_buf);
1646 orig_generation = btrfs_header_generation(orig_buf);
1647 level = btrfs_header_level(buf);
1648
1649 if (!root->ref_cows) {
1650 if (level == 0 &&
1651 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1652 return 0;
1653 if (level != 0 &&
1654 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1655 return 0;
1656 }
1657
1658 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1659 cond_resched();
1660 if (level == 0) {
1661 btrfs_item_key_to_cpu(buf, &key, slot);
1662 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1663 continue;
1664 fi = btrfs_item_ptr(buf, slot,
1665 struct btrfs_file_extent_item);
1666 if (btrfs_file_extent_type(buf, fi) ==
1667 BTRFS_FILE_EXTENT_INLINE)
1668 continue;
1669 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1670 if (bytenr == 0)
1671 continue;
1672 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1673 orig_buf->start, buf->start,
1674 orig_root, ref_root,
1675 orig_generation, ref_generation,
1676 key.objectid);
1677 if (ret)
1678 goto fail;
1679 } else {
1680 bytenr = btrfs_node_blockptr(buf, slot);
1681 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1682 orig_buf->start, buf->start,
1683 orig_root, ref_root,
1684 orig_generation, ref_generation,
1685 level - 1);
1686 if (ret)
1687 goto fail;
1688 }
1689 }
1690 return 0;
1691fail:
1692 WARN_ON(1);
1693 return -1;
1694}
1695
1696static int write_one_cache_group(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_path *path,
1699 struct btrfs_block_group_cache *cache)
1700{
1701 int ret;
1702 int pending_ret;
1703 struct btrfs_root *extent_root = root->fs_info->extent_root;
1704 unsigned long bi;
1705 struct extent_buffer *leaf;
1706
1707 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1708 if (ret < 0)
1709 goto fail;
1710 BUG_ON(ret);
1711
1712 leaf = path->nodes[0];
1713 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1714 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1715 btrfs_mark_buffer_dirty(leaf);
1716 btrfs_release_path(extent_root, path);
1717fail:
1718 finish_current_insert(trans, extent_root, 0);
1719 pending_ret = del_pending_extents(trans, extent_root, 0);
1720 if (ret)
1721 return ret;
1722 if (pending_ret)
1723 return pending_ret;
1724 return 0;
1725
1726}
1727
1728int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root)
1730{
1731 struct btrfs_block_group_cache *cache, *entry;
1732 struct rb_node *n;
1733 int err = 0;
1734 int werr = 0;
1735 struct btrfs_path *path;
1736 u64 last = 0;
1737
1738 path = btrfs_alloc_path();
1739 if (!path)
1740 return -ENOMEM;
1741
1742 while (1) {
1743 cache = NULL;
1744 spin_lock(&root->fs_info->block_group_cache_lock);
1745 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1746 n; n = rb_next(n)) {
1747 entry = rb_entry(n, struct btrfs_block_group_cache,
1748 cache_node);
1749 if (entry->dirty) {
1750 cache = entry;
1751 break;
1752 }
1753 }
1754 spin_unlock(&root->fs_info->block_group_cache_lock);
1755
1756 if (!cache)
1757 break;
1758
1759 cache->dirty = 0;
1760 last += cache->key.offset;
1761
1762 err = write_one_cache_group(trans, root,
1763 path, cache);
1764 /*
1765 * if we fail to write the cache group, we want
1766 * to keep it marked dirty in hopes that a later
1767 * write will work
1768 */
1769 if (err) {
1770 werr = err;
1771 continue;
1772 }
1773 }
1774 btrfs_free_path(path);
1775 return werr;
1776}
1777
1778int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1779{
1780 struct btrfs_block_group_cache *block_group;
1781 int readonly = 0;
1782
1783 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1784 if (!block_group || block_group->ro)
1785 readonly = 1;
1786 if (block_group)
1787 put_block_group(block_group);
1788 return readonly;
1789}
1790
1791static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1792 u64 total_bytes, u64 bytes_used,
1793 struct btrfs_space_info **space_info)
1794{
1795 struct btrfs_space_info *found;
1796
1797 found = __find_space_info(info, flags);
1798 if (found) {
1799 spin_lock(&found->lock);
1800 found->total_bytes += total_bytes;
1801 found->bytes_used += bytes_used;
1802 found->full = 0;
1803 spin_unlock(&found->lock);
1804 *space_info = found;
1805 return 0;
1806 }
1807 found = kzalloc(sizeof(*found), GFP_NOFS);
1808 if (!found)
1809 return -ENOMEM;
1810
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock);
1815 found->flags = flags;
1816 found->total_bytes = total_bytes;
1817 found->bytes_used = bytes_used;
1818 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0;
1821 found->full = 0;
1822 found->force_alloc = 0;
1823 *space_info = found;
1824 return 0;
1825}
1826
1827static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1828{
1829 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1830 BTRFS_BLOCK_GROUP_RAID1 |
1831 BTRFS_BLOCK_GROUP_RAID10 |
1832 BTRFS_BLOCK_GROUP_DUP);
1833 if (extra_flags) {
1834 if (flags & BTRFS_BLOCK_GROUP_DATA)
1835 fs_info->avail_data_alloc_bits |= extra_flags;
1836 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1837 fs_info->avail_metadata_alloc_bits |= extra_flags;
1838 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1839 fs_info->avail_system_alloc_bits |= extra_flags;
1840 }
1841}
1842
1843static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1844{
1845 spin_lock(&cache->space_info->lock);
1846 spin_lock(&cache->lock);
1847 if (!cache->ro) {
1848 cache->space_info->bytes_readonly += cache->key.offset -
1849 btrfs_block_group_used(&cache->item);
1850 cache->ro = 1;
1851 }
1852 spin_unlock(&cache->lock);
1853 spin_unlock(&cache->space_info->lock);
1854}
1855
1856u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1857{
1858 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1859
1860 if (num_devices == 1)
1861 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1862 if (num_devices < 4)
1863 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1864
1865 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1866 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1867 BTRFS_BLOCK_GROUP_RAID10))) {
1868 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1869 }
1870
1871 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1872 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1873 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1874 }
1875
1876 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1877 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1878 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1879 (flags & BTRFS_BLOCK_GROUP_DUP)))
1880 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1881 return flags;
1882}
1883
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force)
1887{
1888 struct btrfs_space_info *space_info;
1889 u64 thresh;
1890 int ret = 0;
1891
1892 mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896 space_info = __find_space_info(extent_root->fs_info, flags);
1897 if (!space_info) {
1898 ret = update_space_info(extent_root->fs_info, flags,
1899 0, 0, &space_info);
1900 BUG_ON(ret);
1901 }
1902 BUG_ON(!space_info);
1903
1904 spin_lock(&space_info->lock);
1905 if (space_info->force_alloc) {
1906 force = 1;
1907 space_info->force_alloc = 0;
1908 }
1909 if (space_info->full) {
1910 spin_unlock(&space_info->lock);
1911 goto out;
1912 }
1913
1914 thresh = space_info->total_bytes - space_info->bytes_readonly;
1915 thresh = div_factor(thresh, 6);
1916 if (!force &&
1917 (space_info->bytes_used + space_info->bytes_pinned +
1918 space_info->bytes_reserved + alloc_bytes) < thresh) {
1919 spin_unlock(&space_info->lock);
1920 goto out;
1921 }
1922 spin_unlock(&space_info->lock);
1923
1924 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925 if (ret)
1926 space_info->full = 1;
1927out:
1928 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929 return ret;
1930}
1931
1932static int update_block_group(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 u64 bytenr, u64 num_bytes, int alloc,
1935 int mark_free)
1936{
1937 struct btrfs_block_group_cache *cache;
1938 struct btrfs_fs_info *info = root->fs_info;
1939 u64 total = num_bytes;
1940 u64 old_val;
1941 u64 byte_in_group;
1942
1943 while (total) {
1944 cache = btrfs_lookup_block_group(info, bytenr);
1945 if (!cache)
1946 return -1;
1947 byte_in_group = bytenr - cache->key.objectid;
1948 WARN_ON(byte_in_group > cache->key.offset);
1949
1950 spin_lock(&cache->space_info->lock);
1951 spin_lock(&cache->lock);
1952 cache->dirty = 1;
1953 old_val = btrfs_block_group_used(&cache->item);
1954 num_bytes = min(total, cache->key.offset - byte_in_group);
1955 if (alloc) {
1956 old_val += num_bytes;
1957 cache->space_info->bytes_used += num_bytes;
1958 if (cache->ro)
1959 cache->space_info->bytes_readonly -= num_bytes;
1960 btrfs_set_block_group_used(&cache->item, old_val);
1961 spin_unlock(&cache->lock);
1962 spin_unlock(&cache->space_info->lock);
1963 } else {
1964 old_val -= num_bytes;
1965 cache->space_info->bytes_used -= num_bytes;
1966 if (cache->ro)
1967 cache->space_info->bytes_readonly += num_bytes;
1968 btrfs_set_block_group_used(&cache->item, old_val);
1969 spin_unlock(&cache->lock);
1970 spin_unlock(&cache->space_info->lock);
1971 if (mark_free) {
1972 int ret;
1973
1974 ret = btrfs_discard_extent(root, bytenr,
1975 num_bytes);
1976 WARN_ON(ret);
1977
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981 }
1982 }
1983 put_block_group(cache);
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993 u64 bytenr;
1994
1995 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996 if (!cache)
1997 return 0;
1998
1999 bytenr = cache->key.objectid;
2000 put_block_group(cache);
2001
2002 return bytenr;
2003}
2004
2005int btrfs_update_pinned_extents(struct btrfs_root *root,
2006 u64 bytenr, u64 num, int pin)
2007{
2008 u64 len;
2009 struct btrfs_block_group_cache *cache;
2010 struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013 if (pin) {
2014 set_extent_dirty(&fs_info->pinned_extents,
2015 bytenr, bytenr + num - 1, GFP_NOFS);
2016 } else {
2017 clear_extent_dirty(&fs_info->pinned_extents,
2018 bytenr, bytenr + num - 1, GFP_NOFS);
2019 }
2020 while (num > 0) {
2021 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022 BUG_ON(!cache);
2023 len = min(num, cache->key.offset -
2024 (bytenr - cache->key.objectid));
2025 if (pin) {
2026 spin_lock(&cache->space_info->lock);
2027 spin_lock(&cache->lock);
2028 cache->pinned += len;
2029 cache->space_info->bytes_pinned += len;
2030 spin_unlock(&cache->lock);
2031 spin_unlock(&cache->space_info->lock);
2032 fs_info->total_pinned += len;
2033 } else {
2034 spin_lock(&cache->space_info->lock);
2035 spin_lock(&cache->lock);
2036 cache->pinned -= len;
2037 cache->space_info->bytes_pinned -= len;
2038 spin_unlock(&cache->lock);
2039 spin_unlock(&cache->space_info->lock);
2040 fs_info->total_pinned -= len;
2041 if (cache->cached)
2042 btrfs_add_free_space(cache, bytenr, len);
2043 }
2044 put_block_group(cache);
2045 bytenr += len;
2046 num -= len;
2047 }
2048 return 0;
2049}
2050
2051static int update_reserved_extents(struct btrfs_root *root,
2052 u64 bytenr, u64 num, int reserve)
2053{
2054 u64 len;
2055 struct btrfs_block_group_cache *cache;
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058 while (num > 0) {
2059 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060 BUG_ON(!cache);
2061 len = min(num, cache->key.offset -
2062 (bytenr - cache->key.objectid));
2063
2064 spin_lock(&cache->space_info->lock);
2065 spin_lock(&cache->lock);
2066 if (reserve) {
2067 cache->reserved += len;
2068 cache->space_info->bytes_reserved += len;
2069 } else {
2070 cache->reserved -= len;
2071 cache->space_info->bytes_reserved -= len;
2072 }
2073 spin_unlock(&cache->lock);
2074 spin_unlock(&cache->space_info->lock);
2075 put_block_group(cache);
2076 bytenr += len;
2077 num -= len;
2078 }
2079 return 0;
2080}
2081
2082int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083{
2084 u64 last = 0;
2085 u64 start;
2086 u64 end;
2087 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088 int ret;
2089
2090 mutex_lock(&root->fs_info->pinned_mutex);
2091 while (1) {
2092 ret = find_first_extent_bit(pinned_extents, last,
2093 &start, &end, EXTENT_DIRTY);
2094 if (ret)
2095 break;
2096 set_extent_dirty(copy, start, end, GFP_NOFS);
2097 last = end + 1;
2098 }
2099 mutex_unlock(&root->fs_info->pinned_mutex);
2100 return 0;
2101}
2102
2103int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct extent_io_tree *unpin)
2106{
2107 u64 start;
2108 u64 end;
2109 int ret;
2110
2111 mutex_lock(&root->fs_info->pinned_mutex);
2112 while (1) {
2113 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114 EXTENT_DIRTY);
2115 if (ret)
2116 break;
2117
2118 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123 if (need_resched()) {
2124 mutex_unlock(&root->fs_info->pinned_mutex);
2125 cond_resched();
2126 mutex_lock(&root->fs_info->pinned_mutex);
2127 }
2128 }
2129 mutex_unlock(&root->fs_info->pinned_mutex);
2130 return ret;
2131}
2132
/*
 * Process the pending extent operations queued in fs_info->extent_ins.
 *
 * Ranges carrying EXTENT_WRITEBACK are collected: each range is try-locked,
 * its pending_extent_op is read from the state private value, and the op is
 * put on insert_list (PENDING_EXTENT_INSERT) or update_list
 * (PENDING_BACKREF_UPDATE); any other type is a BUG.  Ranges whose lock
 * cannot be taken are skipped.
 *
 * Ops flagged ->del are dropped here instead of being applied.  The
 * remaining updates go to update_backrefs() and the remaining inserts to
 * insert_extents(), at most max_inserts per pass.
 *
 * When @all is non-zero the scan restarts from offset 0 for as long as some
 * range had to be skipped.  Always returns 0.
 *
 * NOTE(review): the result of btrfs_alloc_path() is not checked before the
 * path is handed to update_backrefs()/insert_extents() - confirm whether a
 * NULL path is tolerated there.
 */
2133static int finish_current_insert(struct btrfs_trans_handle *trans,
2134 struct btrfs_root *extent_root, int all)
2135{
2136 u64 start;
2137 u64 end;
2138 u64 priv;
2139 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list;
2145 int ret;
2146 int num_inserts = 0, max_inserts;
2147
2148 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list);
2150 INIT_LIST_HEAD(&update_list);
2151
	/*
	 * cap on inserts per pass: leafsize divided by the combined size of
	 * two keys, two item headers, one extent ref and one extent item
	 */
2152 max_inserts = extent_root->leafsize /
2153 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2154 sizeof(struct btrfs_extent_ref) +
2155 sizeof(struct btrfs_extent_item));
2156again:
2157 mutex_lock(&info->extent_ins_mutex);
2158 while (1) {
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK);
2161 if (ret) {
	/* nothing found past 'search': rescan from 0 if ranges were skipped */
2162 if (skipped && all && !num_inserts) {
2163 skipped = 0;
2164 search = 0;
2165 continue;
2166 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break;
2169 }
2170
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) {
	/* lock not obtained: remember the skip and move past this range */
2173 skipped = 1;
2174 search = end + 1;
2175 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex);
2177 cond_resched();
2178 mutex_lock(&info->extent_ins_mutex);
2179 }
2180 continue;
2181 }
2182
2183 ret = get_state_private(&info->extent_ins, start, &priv);
2184 BUG_ON(ret);
2185 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2186
2187 if (extent_op->type == PENDING_EXTENT_INSERT) {
2188 num_inserts++;
2189 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1;
2191 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex);
2193 break;
2194 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2196 list_add_tail(&extent_op->list, &update_list);
2197 search = end + 1;
2198 } else {
2199 BUG();
2200 }
2201 }
2202
2203 /*
2204 * process the update list, clear the writeback bit for it, and if
2205 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it
2207 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1,
2212 EXTENT_WRITEBACK, GFP_NOFS);
2213 if (extent_op->del) {
2214 list_del_init(&extent_op->list);
2215 unlock_extent(&info->extent_ins, extent_op->bytenr,
2216 extent_op->bytenr + extent_op->num_bytes
2217 - 1, GFP_NOFS);
2218 kfree(extent_op);
2219 }
2220 }
2221 mutex_unlock(&info->extent_ins_mutex);
2222
2223 /*
2224 * still have things left on the update list, go ahead an update
2225 * everything
2226 */
2227 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret);
2230 }
2231
2232 /*
2233 * if no inserts need to be done, but we skipped some extents and we
2234 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning
2236 */
2237 if (!num_inserts && all && skipped) {
2238 search = 0;
2239 skipped = 0;
2240 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list);
2242 goto again;
2243 } else if (!num_inserts) {
2244 goto out;
2245 }
2246
2247 /*
2248 * process the insert extents list. Again if we are deleting this
2249 * extent, then just unlock it, pin down the bytes if need be, and be
2250 * done with it. Saves us from having to actually insert the extent
2251 * into the tree and then subsequently come along and delete it
2252 */
2253 mutex_lock(&info->extent_ins_mutex);
2254 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2255 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2256 extent_op->bytenr + extent_op->num_bytes - 1,
2257 EXTENT_WRITEBACK, GFP_NOFS);
2258 if (extent_op->del) {
2259 u64 used;
2260 list_del_init(&extent_op->list);
2261 unlock_extent(&info->extent_ins, extent_op->bytenr,
2262 extent_op->bytenr + extent_op->num_bytes
2263 - 1, GFP_NOFS);
2264
2265 mutex_lock(&extent_root->fs_info->pinned_mutex);
2266 ret = pin_down_bytes(trans, extent_root,
2267 extent_op->bytenr,
2268 extent_op->num_bytes, 0);
2269 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2270
	/* take the dropped extent back out of the super and root byte counts */
2271 spin_lock(&info->delalloc_lock);
2272 used = btrfs_super_bytes_used(&info->super_copy);
2273 btrfs_set_super_bytes_used(&info->super_copy,
2274 used - extent_op->num_bytes);
2275 used = btrfs_root_used(&extent_root->root_item);
2276 btrfs_set_root_used(&extent_root->root_item,
2277 used - extent_op->num_bytes);
2278 spin_unlock(&info->delalloc_lock);
2279
	/* 'ret > 0' is the pin_down_bytes() result, passed as mark_free */
2280 ret = update_block_group(trans, extent_root,
2281 extent_op->bytenr,
2282 extent_op->num_bytes,
2283 0, ret > 0);
2284 BUG_ON(ret);
2285 kfree(extent_op);
2286 num_inserts--;
2287 }
2288 }
2289 mutex_unlock(&info->extent_ins_mutex);
2290
2291 ret = insert_extents(trans, extent_root, path, &insert_list,
2292 num_inserts);
2293 BUG_ON(ret);
2294
2295 /*
2296 * if we broke out of the loop in order to insert stuff because we hit
2297 * the maximum number of inserts at a time we can handle, then loop
2298 * back and pick up where we left off
2299 */
2300 if (num_inserts == max_inserts) {
2301 INIT_LIST_HEAD(&insert_list);
2302 INIT_LIST_HEAD(&update_list);
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */
2312 if (all && skipped) {
2313 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list);
2315 search = 0;
2316 skipped = 0;
2317 num_inserts = 0;
2318 goto again;
2319 }
2320out:
2321 btrfs_free_path(path);
2322 return 0;
2323}
2324
2325static int pin_down_bytes(struct btrfs_trans_handle *trans,
2326 struct btrfs_root *root,
2327 u64 bytenr, u64 num_bytes, int is_data)
2328{
2329 int err = 0;
2330 struct extent_buffer *buf;
2331
2332 if (is_data)
2333 goto pinit;
2334
2335 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2336 if (!buf)
2337 goto pinit;
2338
2339 /* we can reuse a block if it hasn't been written
2340 * and it is from this transaction. We can't
2341 * reuse anything from the tree log root because
2342 * it has tiny sub-transactions.
2343 */
2344 if (btrfs_buffer_uptodate(buf, 0) &&
2345 btrfs_try_tree_lock(buf)) {
2346 u64 header_owner = btrfs_header_owner(buf);
2347 u64 header_transid = btrfs_header_generation(buf);
2348 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2349 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2350 header_transid == trans->transid &&
2351 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2352 clean_tree_block(NULL, root, buf);
2353 btrfs_tree_unlock(buf);
2354 free_extent_buffer(buf);
2355 return 1;
2356 }
2357 btrfs_tree_unlock(buf);
2358 }
2359 free_extent_buffer(buf);
2360pinit:
2361 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2362
2363 BUG_ON(err < 0);
2364 return 0;
2365}
2366
2367/*
2368 * remove an extent from the root, returns 0 on success
2369 */
2370static int __free_extent(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root,
2372 u64 bytenr, u64 num_bytes, u64 parent,
2373 u64 root_objectid, u64 ref_generation,
2374 u64 owner_objectid, int pin, int mark_free)
2375{
2376 struct btrfs_path *path;
2377 struct btrfs_key key;
2378 struct btrfs_fs_info *info = root->fs_info;
2379 struct btrfs_root *extent_root = info->extent_root;
2380 struct extent_buffer *leaf;
2381 int ret;
2382 int extent_slot = 0;
2383 int found_extent = 0;
2384 int num_to_del = 1;
2385 struct btrfs_extent_item *ei;
2386 u32 refs;
2387
2388 key.objectid = bytenr;
2389 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2390 key.offset = num_bytes;
2391 path = btrfs_alloc_path();
2392 if (!path)
2393 return -ENOMEM;
2394
2395 path->reada = 1;
2396 ret = lookup_extent_backref(trans, extent_root, path,
2397 bytenr, parent, root_objectid,
2398 ref_generation, owner_objectid, 1);
2399 if (ret == 0) {
2400 struct btrfs_key found_key;
2401 extent_slot = path->slots[0];
2402 while (extent_slot > 0) {
2403 extent_slot--;
2404 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2405 extent_slot);
2406 if (found_key.objectid != bytenr)
2407 break;
2408 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2409 found_key.offset == num_bytes) {
2410 found_extent = 1;
2411 break;
2412 }
2413 if (path->slots[0] - extent_slot > 5)
2414 break;
2415 }
2416 if (!found_extent) {
2417 ret = remove_extent_backref(trans, extent_root, path);
2418 BUG_ON(ret);
2419 btrfs_release_path(extent_root, path);
2420 ret = btrfs_search_slot(trans, extent_root,
2421 &key, path, -1, 1);
2422 if (ret) {
2423 printk(KERN_ERR "umm, got %d back from search"
2424 ", was looking for %llu\n", ret,
2425 (unsigned long long)bytenr);
2426 btrfs_print_leaf(extent_root, path->nodes[0]);
2427 }
2428 BUG_ON(ret);
2429 extent_slot = path->slots[0];
2430 }
2431 } else {
2432 btrfs_print_leaf(extent_root, path->nodes[0]);
2433 WARN_ON(1);
2434 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2435 "root %llu gen %llu owner %llu\n",
2436 (unsigned long long)bytenr,
2437 (unsigned long long)root_objectid,
2438 (unsigned long long)ref_generation,
2439 (unsigned long long)owner_objectid);
2440 }
2441
2442 leaf = path->nodes[0];
2443 ei = btrfs_item_ptr(leaf, extent_slot,
2444 struct btrfs_extent_item);
2445 refs = btrfs_extent_refs(leaf, ei);
2446 BUG_ON(refs == 0);
2447 refs -= 1;
2448 btrfs_set_extent_refs(leaf, ei, refs);
2449
2450 btrfs_mark_buffer_dirty(leaf);
2451
2452 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2453 struct btrfs_extent_ref *ref;
2454 ref = btrfs_item_ptr(leaf, path->slots[0],
2455 struct btrfs_extent_ref);
2456 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2457 /* if the back ref and the extent are next to each other
2458 * they get deleted below in one shot
2459 */
2460 path->slots[0] = extent_slot;
2461 num_to_del = 2;
2462 } else if (found_extent) {
2463 /* otherwise delete the extent back ref */
2464 ret = remove_extent_backref(trans, extent_root, path);
2465 BUG_ON(ret);
2466 /* if refs are 0, we need to setup the path for deletion */
2467 if (refs == 0) {
2468 btrfs_release_path(extent_root, path);
2469 ret = btrfs_search_slot(trans, extent_root, &key, path,
2470 -1, 1);
2471 BUG_ON(ret);
2472 }
2473 }
2474
2475 if (refs == 0) {
2476 u64 super_used;
2477 u64 root_used;
2478
2479 if (pin) {
2480 mutex_lock(&root->fs_info->pinned_mutex);
2481 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2482 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2483 mutex_unlock(&root->fs_info->pinned_mutex);
2484 if (ret > 0)
2485 mark_free = 1;
2486 BUG_ON(ret < 0);
2487 }
2488 /* block accounting for super block */
2489 spin_lock(&info->delalloc_lock);
2490 super_used = btrfs_super_bytes_used(&info->super_copy);
2491 btrfs_set_super_bytes_used(&info->super_copy,
2492 super_used - num_bytes);
2493
2494 /* block accounting for root item */
2495 root_used = btrfs_root_used(&root->root_item);
2496 btrfs_set_root_used(&root->root_item,
2497 root_used - num_bytes);
2498 spin_unlock(&info->delalloc_lock);
2499 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2500 num_to_del);
2501 BUG_ON(ret);
2502 btrfs_release_path(extent_root, path);
2503
2504 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2505 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2506 BUG_ON(ret);
2507 }
2508
2509 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2510 mark_free);
2511 BUG_ON(ret);
2512 }
2513 btrfs_free_path(path);
2514 finish_current_insert(trans, extent_root, 0);
2515 return ret;
2516}
2517
2518/*
2519 * find all the blocks marked as pending in the radix tree and remove
2520 * them from the extent map
2521 */
2522static int del_pending_extents(struct btrfs_trans_handle *trans,
2523 struct btrfs_root *extent_root, int all)
2524{
2525 int ret;
2526 int err = 0;
2527 u64 start;
2528 u64 end;
2529 u64 priv;
2530 u64 search = 0;
2531 int nr = 0, skipped = 0;
2532 struct extent_io_tree *pending_del;
2533 struct extent_io_tree *extent_ins;
2534 struct pending_extent_op *extent_op;
2535 struct btrfs_fs_info *info = extent_root->fs_info;
2536 struct list_head delete_list;
2537
2538 INIT_LIST_HEAD(&delete_list);
2539 extent_ins = &extent_root->fs_info->extent_ins;
2540 pending_del = &extent_root->fs_info->pending_del;
2541
2542again:
2543 mutex_lock(&info->extent_ins_mutex);
2544 while (1) {
2545 ret = find_first_extent_bit(pending_del, search, &start, &end,
2546 EXTENT_WRITEBACK);
2547 if (ret) {
2548 if (all && skipped && !nr) {
2549 search = 0;
2550 continue;
2551 }
2552 mutex_unlock(&info->extent_ins_mutex);
2553 break;
2554 }
2555
2556 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2557 if (!ret) {
2558 search = end+1;
2559 skipped = 1;
2560
2561 if (need_resched()) {
2562 mutex_unlock(&info->extent_ins_mutex);
2563 cond_resched();
2564 mutex_lock(&info->extent_ins_mutex);
2565 }
2566
2567 continue;
2568 }
2569 BUG_ON(ret < 0);
2570
2571 ret = get_state_private(pending_del, start, &priv);
2572 BUG_ON(ret);
2573 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2574
2575 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2576 GFP_NOFS);
2577 if (!test_range_bit(extent_ins, start, end,
2578 EXTENT_WRITEBACK, 0)) {
2579 list_add_tail(&extent_op->list, &delete_list);
2580 nr++;
2581 } else {
2582 kfree(extent_op);
2583
2584 ret = get_state_private(&info->extent_ins, start,
2585 &priv);
2586 BUG_ON(ret);
2587 extent_op = (struct pending_extent_op *)
2588 (unsigned long)priv;
2589
2590 clear_extent_bits(&info->extent_ins, start, end,
2591 EXTENT_WRITEBACK, GFP_NOFS);
2592
2593 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2594 list_add_tail(&extent_op->list, &delete_list);
2595 search = end + 1;
2596 nr++;
2597 continue;
2598 }
2599
2600 mutex_lock(&extent_root->fs_info->pinned_mutex);
2601 ret = pin_down_bytes(trans, extent_root, start,
2602 end + 1 - start, 0);
2603 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2604
2605 ret = update_block_group(trans, extent_root, start,
2606 end + 1 - start, 0, ret > 0);
2607
2608 unlock_extent(extent_ins, start, end, GFP_NOFS);
2609 BUG_ON(ret);
2610 kfree(extent_op);
2611 }
2612 if (ret)
2613 err = ret;
2614
2615 search = end + 1;
2616
2617 if (need_resched()) {
2618 mutex_unlock(&info->extent_ins_mutex);
2619 cond_resched();
2620 mutex_lock(&info->extent_ins_mutex);
2621 }
2622 }
2623
2624 if (nr) {
2625 ret = free_extents(trans, extent_root, &delete_list);
2626 BUG_ON(ret);
2627 }
2628
2629 if (all && skipped) {
2630 INIT_LIST_HEAD(&delete_list);
2631 search = 0;
2632 nr = 0;
2633 goto again;
2634 }
2635
2636 return err;
2637}
2638
2639/*
2640 * remove an extent from the root, returns 0 on success
2641 */
2642static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2643 struct btrfs_root *root,
2644 u64 bytenr, u64 num_bytes, u64 parent,
2645 u64 root_objectid, u64 ref_generation,
2646 u64 owner_objectid, int pin)
2647{
2648 struct btrfs_root *extent_root = root->fs_info->extent_root;
2649 int pending_ret;
2650 int ret;
2651
2652 WARN_ON(num_bytes < root->sectorsize);
2653 if (root == extent_root) {
2654 struct pending_extent_op *extent_op = NULL;
2655
2656 mutex_lock(&root->fs_info->extent_ins_mutex);
2657 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2658 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2659 u64 priv;
2660 ret = get_state_private(&root->fs_info->extent_ins,
2661 bytenr, &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 extent_op->del = 1;
2667 if (extent_op->type == PENDING_EXTENT_INSERT) {
2668 mutex_unlock(&root->fs_info->extent_ins_mutex);
2669 return 0;
2670 }
2671 }
2672
2673 if (extent_op) {
2674 ref_generation = extent_op->orig_generation;
2675 parent = extent_op->orig_parent;
2676 }
2677
2678 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2679 BUG_ON(!extent_op);
2680
2681 extent_op->type = PENDING_EXTENT_DELETE;
2682 extent_op->bytenr = bytenr;
2683 extent_op->num_bytes = num_bytes;
2684 extent_op->parent = parent;
2685 extent_op->orig_parent = parent;
2686 extent_op->generation = ref_generation;
2687 extent_op->orig_generation = ref_generation;
2688 extent_op->level = (int)owner_objectid;
2689 INIT_LIST_HEAD(&extent_op->list);
2690 extent_op->del = 0;
2691
2692 set_extent_bits(&root->fs_info->pending_del,
2693 bytenr, bytenr + num_bytes - 1,
2694 EXTENT_WRITEBACK, GFP_NOFS);
2695 set_state_private(&root->fs_info->pending_del,
2696 bytenr, (unsigned long)extent_op);
2697 mutex_unlock(&root->fs_info->extent_ins_mutex);
2698 return 0;
2699 }
2700 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache;
2704
2705 /* btrfs_free_reserved_extent */
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0;
2712 }
2713 pin = 1;
2714 }
2715
2716 /* if data pin when any transaction has committed this */
2717 if (ref_generation != trans->transid)
2718 pin = 1;
2719
2720 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2721 root_objectid, ref_generation,
2722 owner_objectid, pin, pin == 0);
2723
2724 finish_current_insert(trans, root->fs_info->extent_root, 0);
2725 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2726 return ret ? ret : pending_ret;
2727}
2728
2729int btrfs_free_extent(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root,
2731 u64 bytenr, u64 num_bytes, u64 parent,
2732 u64 root_objectid, u64 ref_generation,
2733 u64 owner_objectid, int pin)
2734{
2735 int ret;
2736
2737 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2738 root_objectid, ref_generation,
2739 owner_objectid, pin);
2740 return ret;
2741}
2742
2743static u64 stripe_align(struct btrfs_root *root, u64 val)
2744{
2745 u64 mask = ((u64)root->stripesize - 1);
2746 u64 ret = (val + mask) & ~mask;
2747 return ret;
2748}
2749
/*
 * search the free space of the block groups belonging to the space_info
 * selected by 'data' for a range that can hold num_bytes (plus
 * empty_size of slack while that is still being attempted).
 *
 * On success the key ins is changed to record the range:
 *    ins->objectid == start of the range
 *    key type of ins == BTRFS_EXTENT_ITEM_KEY
 *    ins->offset == num_bytes
 *
 * The first block group examined is the one containing the largest of
 * search_start, hint_byte and first_logical_byte(); hint_byte itself is
 * replaced by the last allocation pointer when one is in use.  The range
 * [exclude_start, exclude_start + exclude_nr) is never handed out.
 *
 * returns 0 on success, -ENOSPC when nothing suitable was found, or the
 * error returned by cache_block_group().
 */
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *orig_root,
				     u64 num_bytes, u64 empty_size,
				     u64 search_start, u64 search_end,
				     u64 hint_byte, struct btrfs_key *ins,
				     u64 exclude_start, u64 exclude_nr,
				     int data)
{
	int ret = 0;
	struct btrfs_root *root = orig_root->fs_info->extent_root;
	u64 total_needed = num_bytes;
	u64 *last_ptr = NULL;
	u64 last_wanted = 0;
	struct btrfs_block_group_cache *block_group = NULL;
	int chunk_alloc_done = 0;
	int empty_cluster = 2 * 1024 * 1024;
	int allowed_chunk_alloc = 0;
	struct list_head *head = NULL, *cur = NULL;
	int loop = 0;
	int extra_loop = 0;
	struct btrfs_space_info *space_info;

	WARN_ON(num_bytes < root->sectorsize);
	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
	/* ins->objectid stays 0 until a range has actually been found */
	ins->objectid = 0;
	ins->offset = 0;

	if (orig_root->ref_cows || empty_size)
		allowed_chunk_alloc = 1;

	/* metadata continues from the last allocation, in 64k clusters */
	if (data & BTRFS_BLOCK_GROUP_METADATA) {
		last_ptr = &root->fs_info->last_alloc;
		empty_cluster = 64 * 1024;
	}

	/* with the SSD option data also continues from its last allocation */
	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
		last_ptr = &root->fs_info->last_data_alloc;

	if (last_ptr) {
		if (*last_ptr) {
			hint_byte = *last_ptr;
			last_wanted = *last_ptr;
		} else
			empty_size += empty_cluster;
	} else {
		empty_cluster = 0;
	}
	search_start = max(search_start, first_logical_byte(root, 0));
	search_start = max(search_start, hint_byte);

	/*
	 * we cannot start exactly where the last allocation ended, so stop
	 * comparing against it and ask for a whole empty cluster instead
	 */
	if (last_wanted && search_start != last_wanted) {
		last_wanted = 0;
		empty_size += empty_cluster;
	}

	total_needed += empty_size;
	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
	if (!block_group)
		block_group = btrfs_lookup_first_block_group(root->fs_info,
							     search_start);
	space_info = __find_space_info(root->fs_info, data);

	down_read(&space_info->groups_sem);
	while (1) {
		struct btrfs_free_space *free_space;
		/*
		 * the only way this happens is if our hint points to a block
		 * group that's not of the proper type, while looping this
		 * should never happen
		 *
		 * NOTE(review): this comment appears to describe the
		 * block_group_bits() check further down rather than the
		 * extra_loop assignment that follows it -- confirm.
		 */
		if (empty_size)
			extra_loop = 1;

		if (!block_group)
			goto new_group_no_lock;

		/* fill in the free space cache before searching it */
		if (unlikely(!block_group->cached)) {
			mutex_lock(&block_group->cache_mutex);
			ret = cache_block_group(root, block_group);
			mutex_unlock(&block_group->cache_mutex);
			if (ret)
				break;
		}

		mutex_lock(&block_group->alloc_mutex);
		if (unlikely(!block_group_bits(block_group, data)))
			goto new_group;

		if (unlikely(block_group->ro))
			goto new_group;

		free_space = btrfs_find_free_space(block_group, search_start,
						   total_needed);
		if (free_space) {
			u64 start = block_group->key.objectid;
			u64 end = block_group->key.objectid +
				block_group->key.offset;

			search_start = stripe_align(root, free_space->offset);

			/* move on to the next group */
			if (search_start + num_bytes >= search_end)
				goto new_group;

			/* move on to the next group */
			if (search_start + num_bytes > end)
				goto new_group;

			/*
			 * the space found does not continue the last
			 * allocation, so from now on also require an empty
			 * cluster and stop comparing against last_wanted
			 */
			if (last_wanted && search_start != last_wanted) {
				total_needed += empty_cluster;
				empty_size += empty_cluster;
				last_wanted = 0;
				/*
				 * if search_start is still in this block group
				 * then we just re-search this block group
				 */
				if (search_start >= start &&
				    search_start < end) {
					mutex_unlock(&block_group->alloc_mutex);
					continue;
				}

				/* else we go to the next block group */
				goto new_group;
			}

			/* skip past the caller's excluded range if we overlap it */
			if (exclude_nr > 0 &&
			    (search_start + num_bytes > exclude_start &&
			     search_start < exclude_start + exclude_nr)) {
				search_start = exclude_start + exclude_nr;
				/*
				 * if search_start is still in this block group
				 * then we just re-search this block group
				 */
				if (search_start >= start &&
				    search_start < end) {
					mutex_unlock(&block_group->alloc_mutex);
					last_wanted = 0;
					continue;
				}

				/* else we go to the next block group */
				goto new_group;
			}

			ins->objectid = search_start;
			ins->offset = num_bytes;

			btrfs_remove_free_space_lock(block_group, search_start,
						     num_bytes);
			/* we are all good, lets return */
			mutex_unlock(&block_group->alloc_mutex);
			break;
		}
new_group:
		/* drop the lock and our reference on the group we give up on */
		mutex_unlock(&block_group->alloc_mutex);
		put_block_group(block_group);
		block_group = NULL;
new_group_no_lock:
		/* don't try to compare new allocations against the
		 * last allocation any more
		 */
		last_wanted = 0;

		/*
		 * Here's how this works.
		 * loop == 0: we were searching a block group via a hint
		 *		and didn't find anything, so we start at
		 *		the head of the block groups and keep searching
		 * loop == 1: we're searching through all of the block groups
		 *		if we hit the head again we have searched
		 *		all of the block groups for this space and we
		 *		need to try and allocate, if we can't, error out.
		 * loop == 2: we allocated more space and are looping through
		 *		all of the block groups again.
		 */
		if (loop == 0) {
			head = &space_info->block_groups;
			cur = head->next;
			loop++;
		} else if (loop == 1 && cur == head) {
			int keep_going;

			/* at this point we give up on the empty_size
			 * allocations and just try to allocate the min
			 * space.
			 *
			 * The extra_loop field was set if an empty_size
			 * allocation was attempted above, and if this
			 * is true we need to try the loop again without
			 * the additional empty_size.
			 */
			total_needed -= empty_size;
			empty_size = 0;
			keep_going = extra_loop;
			loop++;

			if (allowed_chunk_alloc && !chunk_alloc_done) {
				/* groups_sem is released around the chunk allocation */
				up_read(&space_info->groups_sem);
				ret = do_chunk_alloc(trans, root, num_bytes +
						     2 * 1024 * 1024, data, 1);
				down_read(&space_info->groups_sem);
				if (ret < 0)
					goto loop_check;
				head = &space_info->block_groups;
				/*
				 * we've allocated a new chunk, keep
				 * trying
				 */
				keep_going = 1;
				chunk_alloc_done = 1;
			} else if (!allowed_chunk_alloc) {
				space_info->force_alloc = 1;
			}
loop_check:
			if (keep_going) {
				cur = head->next;
				extra_loop = 0;
			} else {
				break;
			}
		} else if (cur == head) {
			/* wrapped around the whole list again, nothing left to try */
			break;
		}

		/* take a reference on the next group and search it from its start */
		block_group = list_entry(cur, struct btrfs_block_group_cache,
					 list);
		atomic_inc(&block_group->count);

		search_start = block_group->key.objectid;
		cur = cur->next;
	}

	/* we found what we needed */
	if (ins->objectid) {
		if (!(data & BTRFS_BLOCK_GROUP_DATA))
			trans->block_group = block_group->key.objectid;

		/* remember where this allocation ended for the next caller */
		if (last_ptr)
			*last_ptr = ins->objectid + ins->offset;
		ret = 0;
	} else if (!ret) {
		printk(KERN_ERR "btrfs searching for %llu bytes, "
		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
		       (unsigned long long)total_needed,
		       (unsigned long long)num_bytes,
		       loop, allowed_chunk_alloc);
		ret = -ENOSPC;
	}
	if (block_group)
		put_block_group(block_group);

	up_read(&space_info->groups_sem);
	return ret;
}
3013
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{
3016 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not ");
3023
3024 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n",
3030 (unsigned long long)cache->key.objectid,
3031 (unsigned long long)cache->key.offset,
3032 (unsigned long long)btrfs_block_group_used(&cache->item),
3033 (unsigned long long)cache->pinned,
3034 (unsigned long long)cache->reserved);
3035 btrfs_dump_free_space(cache, bytes);
3036 spin_unlock(&cache->lock);
3037 }
3038 up_read(&info->groups_sem);
3039}
3040
3041static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042 struct btrfs_root *root,
3043 u64 num_bytes, u64 min_alloc_size,
3044 u64 empty_size, u64 hint_byte,
3045 u64 search_end, struct btrfs_key *ins,
3046 u64 data)
3047{
3048 int ret;
3049 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info;
3052
3053 if (data) {
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations
3071 */
3072 if (empty_size || root->ref_cows) {
3073 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075 2 * 1024 * 1024,
3076 BTRFS_BLOCK_GROUP_METADATA |
3077 (info->metadata_alloc_profile &
3078 info->avail_metadata_alloc_bits), 0);
3079 }
3080 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081 num_bytes + 2 * 1024 * 1024, data, 0);
3082 }
3083
3084 WARN_ON(num_bytes < root->sectorsize);
3085 ret = find_free_extent(trans, root, num_bytes, empty_size,
3086 search_start, search_end, hint_byte, ins,
3087 trans->alloc_exclude_start,
3088 trans->alloc_exclude_nr, data);
3089
3090 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091 num_bytes = num_bytes >> 1;
3092 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093 num_bytes = max(num_bytes, min_alloc_size);
3094 do_chunk_alloc(trans, root->fs_info->extent_root,
3095 num_bytes, data, 1);
3096 goto again;
3097 }
3098 if (ret) {
3099 struct btrfs_space_info *sinfo;
3100
3101 sinfo = __find_space_info(root->fs_info, data);
3102 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3103 "wanted %llu\n", (unsigned long long)data,
3104 (unsigned long long)num_bytes);
3105 dump_space_info(sinfo, num_bytes);
3106 BUG();
3107 }
3108
3109 return ret;
3110}
3111
3112int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3113{
3114 struct btrfs_block_group_cache *cache;
3115 int ret = 0;
3116
3117 cache = btrfs_lookup_block_group(root->fs_info, start);
3118 if (!cache) {
3119 printk(KERN_ERR "Unable to find block group for %llu\n",
3120 (unsigned long long)start);
3121 return -ENOSPC;
3122 }
3123
3124 ret = btrfs_discard_extent(root, start, len);
3125
3126 btrfs_add_free_space(cache, start, len);
3127 put_block_group(cache);
3128 update_reserved_extents(root, start, len, 0);
3129
3130 return ret;
3131}
3132
3133int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3134 struct btrfs_root *root,
3135 u64 num_bytes, u64 min_alloc_size,
3136 u64 empty_size, u64 hint_byte,
3137 u64 search_end, struct btrfs_key *ins,
3138 u64 data)
3139{
3140 int ret;
3141 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3142 empty_size, hint_byte, search_end, ins,
3143 data);
3144 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3145 return ret;
3146}
3147
/*
 * record the already reserved range described by ins (objectid is the
 * start, offset the length) as allocated.
 *
 * The bytes used counters in the super block copy and in the root item
 * are bumped first.  For the extent root itself the insertion is only
 * queued as a pending_extent_op in fs_info->extent_ins; for every other
 * root an extent item and its first back reference are inserted into the
 * extent tree right away.  In both cases update_block_group() is called
 * for the range at the end.
 *
 * A parent of 0 is replaced by ins->objectid.
 *
 * returns 0 on success or the error from del_pending_extents(); memory
 * allocation, item insertion and block group update failures hit
 * BUG_ON()/BUG().
 */
static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root, u64 parent,
					 u64 root_objectid, u64 ref_generation,
					 u64 owner, struct btrfs_key *ins)
{
	int ret;
	int pending_ret;
	u64 super_used;
	u64 root_used;
	u64 num_bytes = ins->offset;
	u32 sizes[2];
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_root *extent_root = info->extent_root;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_ref *ref;
	struct btrfs_path *path;
	struct btrfs_key keys[2];

	if (parent == 0)
		parent = ins->objectid;

	/* block accounting for super block */
	spin_lock(&info->delalloc_lock);
	super_used = btrfs_super_bytes_used(&info->super_copy);
	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);

	/* block accounting for root item */
	root_used = btrfs_root_used(&root->root_item);
	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
	spin_unlock(&info->delalloc_lock);

	/*
	 * allocations made for the extent root are not inserted here, they
	 * are queued in the extent_ins tree as a pending insert operation
	 */
	if (root == extent_root) {
		struct pending_extent_op *extent_op;

		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
		BUG_ON(!extent_op);

		extent_op->type = PENDING_EXTENT_INSERT;
		extent_op->bytenr = ins->objectid;
		extent_op->num_bytes = ins->offset;
		extent_op->parent = parent;
		extent_op->orig_parent = 0;
		extent_op->generation = ref_generation;
		extent_op->orig_generation = 0;
		/* for this path the owner argument is stored as the level */
		extent_op->level = (int)owner;
		INIT_LIST_HEAD(&extent_op->list);
		extent_op->del = 0;

		mutex_lock(&root->fs_info->extent_ins_mutex);
		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
				ins->objectid + ins->offset - 1,
				EXTENT_WRITEBACK, GFP_NOFS);
		set_state_private(&root->fs_info->extent_ins,
				  ins->objectid, (unsigned long)extent_op);
		mutex_unlock(&root->fs_info->extent_ins_mutex);
		goto update_block;
	}

	/*
	 * two items are inserted in one go: the extent item keyed by ins
	 * and a back reference keyed by (start, EXTENT_REF, parent)
	 */
	memcpy(&keys[0], ins, sizeof(*ins));
	keys[1].objectid = ins->objectid;
	keys[1].type = BTRFS_EXTENT_REF_KEY;
	keys[1].offset = parent;
	sizes[0] = sizeof(*extent_item);
	sizes[1] = sizeof(*ref);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
				       sizes, 2);
	BUG_ON(ret);

	/* the new extent starts out with a single reference */
	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
			     struct btrfs_extent_ref);

	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);

	btrfs_mark_buffer_dirty(path->nodes[0]);

	trans->alloc_exclude_start = 0;
	trans->alloc_exclude_nr = 0;
	btrfs_free_path(path);
	/* process the insert and delete operations queued up so far */
	finish_current_insert(trans, extent_root, 0);
	pending_ret = del_pending_extents(trans, extent_root, 0);

	/* ret was already checked by the BUG_ON() above */
	if (ret)
		goto out;
	if (pending_ret) {
		ret = pending_ret;
		goto out;
	}

update_block:
	ret = update_block_group(trans, root, ins->objectid,
				 ins->offset, 1, 0);
	if (ret) {
		printk(KERN_ERR "btrfs update block group failed for %llu "
		       "%llu\n", (unsigned long long)ins->objectid,
		       (unsigned long long)ins->offset);
		BUG();
	}
out:
	return ret;
}
3258
3259int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, u64 parent,
3261 u64 root_objectid, u64 ref_generation,
3262 u64 owner, struct btrfs_key *ins)
3263{
3264 int ret;
3265
3266 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3267 return 0;
3268 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3269 ref_generation, owner, ins);
3270 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3271 return ret;
3272}
3273
3274/*
3275 * this is used by the tree logging recovery code. It records that
3276 * an extent has been allocated and makes sure to clear the free
3277 * space cache bits as well
3278 */
3279int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, u64 parent,
3281 u64 root_objectid, u64 ref_generation,
3282 u64 owner, struct btrfs_key *ins)
3283{
3284 int ret;
3285 struct btrfs_block_group_cache *block_group;
3286
3287 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3288 mutex_lock(&block_group->cache_mutex);
3289 cache_block_group(root, block_group);
3290 mutex_unlock(&block_group->cache_mutex);
3291
3292 ret = btrfs_remove_free_space(block_group, ins->objectid,
3293 ins->offset);
3294 BUG_ON(ret);
3295 put_block_group(block_group);
3296 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3297 ref_generation, owner, ins);
3298 return ret;
3299}
3300
3301/*
3302 * finds a free extent and does all the dirty work required for allocation
3303 * returns the key for the extent through ins, and a tree buffer for
3304 * the first block of the extent through buf.
3305 *
3306 * returns 0 if everything worked, non-zero otherwise.
3307 */
3308int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root,
3310 u64 num_bytes, u64 parent, u64 min_alloc_size,
3311 u64 root_objectid, u64 ref_generation,
3312 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3313 u64 search_end, struct btrfs_key *ins, u64 data)
3314{
3315 int ret;
3316
3317 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3318 min_alloc_size, empty_size, hint_byte,
3319 search_end, ins, data);
3320 BUG_ON(ret);
3321 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3322 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3323 root_objectid, ref_generation,
3324 owner_objectid, ins);
3325 BUG_ON(ret);
3326
3327 } else {
3328 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3329 }
3330 return ret;
3331}
3332
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize)
3336{
3337 struct extent_buffer *buf;
3338
3339 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3340 if (!buf)
3341 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf);
3345 btrfs_set_buffer_uptodate(buf);
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS);
3349 } else {
3350 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3351 buf->start + buf->len - 1, GFP_NOFS);
3352 }
3353 trans->blocks_used++;
3354 return buf;
3355}
3356
3357/*
3358 * helper function to allocate a block for a given tree
3359 * returns the tree buffer or NULL.
3360 */
3361struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3362 struct btrfs_root *root,
3363 u32 blocksize, u64 parent,
3364 u64 root_objectid,
3365 u64 ref_generation,
3366 int level,
3367 u64 hint,
3368 u64 empty_size)
3369{
3370 struct btrfs_key ins;
3371 int ret;
3372 struct extent_buffer *buf;
3373
3374 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3375 root_objectid, ref_generation, level,
3376 empty_size, hint, (u64)-1, &ins, 0);
3377 if (ret) {
3378 BUG_ON(ret > 0);
3379 return ERR_PTR(ret);
3380 }
3381
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3383 return buf;
3384}
3385
3386int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3387 struct btrfs_root *root, struct extent_buffer *leaf)
3388{
3389 u64 leaf_owner;
3390 u64 leaf_generation;
3391 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi;
3393 int i;
3394 int nritems;
3395 int ret;
3396
3397 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf);
3401
3402 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr;
3404 cond_resched();
3405
3406 btrfs_item_key_to_cpu(leaf, &key, i);
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue;
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3410 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE)
3412 continue;
3413 /*
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3418 if (disk_bytenr == 0)
3419 continue;
3420
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation,
3424 key.objectid, 0);
3425 BUG_ON(ret);
3426
3427 atomic_inc(&root->fs_info->throttle_gen);
3428 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched();
3430 }
3431 return 0;
3432}
3433
3434static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3435 struct btrfs_root *root,
3436 struct btrfs_leaf_ref *ref)
3437{
3438 int i;
3439 int ret;
3440 struct btrfs_extent_info *info = ref->extents;
3441
3442 for (i = 0; i < ref->nritems; i++) {
3443 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation,
3446 info->objectid, 0);
3447
3448 atomic_inc(&root->fs_info->throttle_gen);
3449 wake_up(&root->fs_info->transaction_throttle);
3450 cond_resched();
3451
3452 BUG_ON(ret);
3453 info++;
3454 }
3455
3456 return 0;
3457}
3458
/*
 * look up the reference count of the extent at [start, start + len) for
 * the snapshot dropping code and store it in *refs.  The lookup is done
 * without a transaction handle.
 *
 * A failing lookup hits BUG_ON(), so the value returned is the (zero)
 * result of btrfs_lookup_extent_ref().  Gives up the cpu with
 * cond_resched() before returning.
 */
static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
				     u64 len, u32 *refs)
{
	int ret;

	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
	BUG_ON(ret);

#if 0 /* some debugging code in case we see problems here */
	/* if the refs count is one, it won't get increased again.  But
	 * if the ref count is > 1, someone may be decreasing it at
	 * the same time we are.
	 */
	if (*refs != 1) {
		struct extent_buffer *eb = NULL;
		eb = btrfs_find_create_tree_block(root, start, len);
		if (eb)
			btrfs_tree_lock(eb);

		mutex_lock(&root->fs_info->alloc_mutex);
		ret = lookup_extent_ref(NULL, root, start, len, refs);
		BUG_ON(ret);
		mutex_unlock(&root->fs_info->alloc_mutex);

		if (eb) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
		}
		if (*refs == 1) {
			printk(KERN_ERR "btrfs block %llu went down to one "
			       "during drop_snap\n", (unsigned long long)start);
		}

	}
#endif

	cond_resched();
	return ret;
}
3498
/*
 * helper function for drop_snapshot, this walks down the tree dropping ref
 * counts as it goes.
 *
 * On entry path->nodes[*level] is the block to start from.  Children
 * whose reference count is not 1 only lose the reference held by their
 * parent and are stepped over.  A child with a single reference is
 * descended into (or, one level above the leaves, dropped through its
 * cached leaf ref when a matching one exists).
 *
 * When the walk cannot go further, the block at the current level is
 * freed, removed from the path, and *level is moved up by one so that
 * walk_up_tree() can continue from the parent.
 *
 * Always returns 0 in the code below; failures hit BUG_ON().
 */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	struct btrfs_leaf_ref *ref;
	u32 blocksize;
	int ret;
	u32 refs;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);
	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
				path->nodes[*level]->len, &refs);
	BUG_ON(ret);
	/* the starting block is shared, just drop our reference on it */
	if (refs > 1)
		goto out;

	/*
	 * walk down to the last node level and free all the leaves
	 */
	while (*level >= 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		/* every pointer in this node has been processed */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;
		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, cur);
			BUG_ON(ret);
			break;
		}
		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
		BUG_ON(ret);
		if (refs != 1) {
			/*
			 * the child is referenced from somewhere else too:
			 * drop only the reference held by this node and
			 * move on to the next slot without descending
			 */
			parent = path->nodes[*level];
			root_owner = btrfs_header_owner(parent);
			root_gen = btrfs_header_generation(parent);
			path->slots[*level]++;

			ret = __btrfs_free_extent(trans, root, bytenr,
						blocksize, parent->start,
						root_owner, root_gen,
						*level - 1, 1);
			BUG_ON(ret);

			atomic_inc(&root->fs_info->throttle_gen);
			wake_up(&root->fs_info->transaction_throttle);
			cond_resched();

			continue;
		}
		/*
		 * at this point, we have a single ref, and since the
		 * only place referencing this extent is a dead root
		 * the reference count should never go higher.
		 * So, we don't need to check it again
		 */
		if (*level == 1) {
			ref = btrfs_lookup_leaf_ref(root, bytenr);
			/* a cached ref from another generation is of no use */
			if (ref && ref->generation != ptr_gen) {
				btrfs_free_leaf_ref(root, ref);
				ref = NULL;
			}
			if (ref) {
				/*
				 * the cached ref lists the leaf's extents,
				 * so they are dropped without reading the
				 * leaf
				 */
				ret = cache_drop_leaf_ref(trans, root, ref);
				BUG_ON(ret);
				btrfs_remove_leaf_ref(root, ref);
				btrfs_free_leaf_ref(root, ref);
				*level = 0;
				break;
			}
		}
		next = btrfs_find_tree_block(root, bytenr, blocksize);
		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
			free_extent_buffer(next);

			/*
			 * NOTE(review): the result of read_tree_block() is
			 * used below without a NULL check -- confirm it
			 * cannot fail here.
			 */
			next = read_tree_block(root, bytenr, blocksize,
					       ptr_gen);
			cond_resched();
#if 0
			/*
			 * this is a debugging check and can go away
			 * the ref should never go all the way down to 1
			 * at this point
			 */
			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
						&refs);
			BUG_ON(ret);
			WARN_ON(refs != 1);
#endif
		}
		WARN_ON(*level <= 0);
		/* replace whatever the path held one level down with the child */
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
out:
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	/*
	 * free the block at the current level.  The tree root is used as
	 * its own parent; any other block takes its bytenr from the
	 * pointer in the node one level up.
	 */
	if (path->nodes[*level] == root->node) {
		parent = path->nodes[*level];
		bytenr = path->nodes[*level]->start;
	} else {
		parent = path->nodes[*level + 1];
		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
	}

	blocksize = btrfs_level_size(root, *level);
	root_owner = btrfs_header_owner(parent);
	root_gen = btrfs_header_generation(parent);

	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
				  parent->start, root_owner, root_gen,
				  *level, 1);
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;
	BUG_ON(ret);

	cond_resched();
	return 0;
}
3645
/*
 * helper function for drop_subtree, this function is similar to
 * walk_down_tree. The main difference is that it checks reference
 * counts while tree blocks are locked.
 *
 * Children are always read with read_tree_block() and locked before
 * their reference count is looked up, and the lookups run inside the
 * transaction.  Blocks that are finished are passed to
 * clean_tree_block().
 *
 * On return the block at the entry level of the walk has been freed and
 * removed from the path and *level has been moved up by one.  Always
 * returns 0; failures hit BUG_ON().
 */
static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path, int *level)
{
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u64 bytenr;
	u64 ptr_gen;
	u32 blocksize;
	u32 refs;
	int ret;

	cur = path->nodes[*level];
	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
				      &refs);
	BUG_ON(ret);
	/* the starting block is shared, just drop our reference on it */
	if (refs > 1)
		goto out;

	while (*level >= 0) {
		cur = path->nodes[*level];
		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, cur);
			BUG_ON(ret);
			clean_tree_block(trans, root, cur);
			break;
		}
		/* every pointer in this node has been processed */
		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
			clean_tree_block(trans, root, cur);
			break;
		}

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);

		/*
		 * NOTE(review): the result of read_tree_block() is locked
		 * without a NULL check -- confirm it cannot fail here.
		 */
		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
		btrfs_tree_lock(next);

		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
					      &refs);
		BUG_ON(ret);
		if (refs > 1) {
			/*
			 * the child is shared: drop the reference held by
			 * this node, release the child and go on with the
			 * next slot
			 */
			parent = path->nodes[*level];
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					*level - 1, 1);
			BUG_ON(ret);
			path->slots[*level]++;
			btrfs_tree_unlock(next);
			free_extent_buffer(next);
			continue;
		}

		/* descend; the child stays locked and is tracked by the path */
		*level = btrfs_header_level(next);
		path->nodes[*level] = next;
		path->slots[*level] = 0;
		path->locks[*level] = 1;
		cond_resched();
	}
out:
	/* free the block we stopped at, using the node above it as parent */
	parent = path->nodes[*level + 1];
	bytenr = path->nodes[*level]->start;
	blocksize = path->nodes[*level]->len;

	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
			parent->start, btrfs_header_owner(parent),
			btrfs_header_generation(parent), *level, 1);
	BUG_ON(ret);

	if (path->locks[*level]) {
		btrfs_tree_unlock(path->nodes[*level]);
		path->locks[*level] = 0;
	}
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;
	cond_resched();
	return 0;
}
3734
/*
 * helper for dropping snapshots.  This walks back up the tree in the path
 * to find the first node higher up where we haven't yet gone through
 * all the slots
 *
 * Nodes that are exhausted are freed and removed from the path on the
 * way up.  When a node with slots left is found, its slot is advanced,
 * *level is set to that node's level, and the key of the new slot is
 * saved in the root item as drop_progress / drop_level.
 *
 * returns 0 when such a node was found, 1 when the walk ran past
 * max_level or the end of the path.
 */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 int *level, int max_level)
{
	u64 root_owner;
	u64 root_gen;
	struct btrfs_root_item *root_item = &root->root_item;
	int i;
	int slot;
	int ret;

	for (i = *level; i < max_level && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
			struct extent_buffer *node;
			struct btrfs_disk_key disk_key;
			node = path->nodes[i];
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			/* remember how far the drop has progressed */
			btrfs_node_key(node, &disk_key, path->slots[i]);
			memcpy(&root_item->drop_progress,
			       &disk_key, sizeof(disk_key));
			root_item->drop_level = i;
			return 0;
		} else {
			struct extent_buffer *parent;
			/* the tree root is used as its own parent */
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			root_gen = btrfs_header_generation(parent);

			/* this node is finished: clean it, free it, drop it from the path */
			clean_tree_block(trans, root, path->nodes[*level]);
			ret = btrfs_free_extent(trans, root,
						path->nodes[*level]->start,
						path->nodes[*level]->len,
						parent->start, root_owner,
						root_gen, *level, 1);
			BUG_ON(ret);
			if (path->locks[*level]) {
				btrfs_tree_unlock(path->nodes[*level]);
				path->locks[*level] = 0;
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}
3794
/*
 * drop the reference count on the tree rooted at 'snap'.  This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 *
 * The walk starts at the tree root when the root item has no
 * drop_progress recorded (objectid 0); otherwise it resumes from the
 * saved drop_progress key at drop_level.  fs_info->drop_mutex is
 * expected to be held by the caller.
 *
 * returns 0 when the whole tree has been walked, -EAGAIN when the walk
 * stopped because the transaction is being committed, or a negative
 * error from btrfs_search_slot() or the walk helpers.
 */
int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
			*root)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;
	struct btrfs_root_item *root_item = &root->root_item;

	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
	path = btrfs_alloc_path();
	BUG_ON(!path);

	level = btrfs_header_level(root->node);
	orig_level = level;
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* fresh start: begin at slot 0 of the tree root */
		path->nodes[level] = root->node;
		extent_buffer_get(root->node);
		path->slots[level] = 0;
	} else {
		struct btrfs_key key;
		struct btrfs_disk_key found_key;
		struct extent_buffer *node;

		/* resume: search down to the level and key saved earlier */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		level = root_item->drop_level;
		path->lowest_level = level;
		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		node = path->nodes[level];
		btrfs_node_key(node, &found_key, path->slots[level]);
		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
			       sizeof(found_key)));
		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
			if (path->nodes[i] && path->locks[i]) {
				path->locks[i] = 0;
				btrfs_tree_unlock(path->nodes[i]);
			}
		}
	}
	/* alternate between walking down and back up until the tree is done */
	while (1) {
		wret = walk_down_tree(trans, root, path, &level);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;

		/* walk_up_tree() returns 1 once nothing is left to visit */
		wret = walk_up_tree(trans, root, path, &level,
				    BTRFS_MAX_LEVEL);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;
		/* stop when a commit is in progress; drop_progress was saved by walk_up_tree() */
		if (trans->transaction->in_commit) {
			ret = -EAGAIN;
			break;
		}
		atomic_inc(&root->fs_info->throttle_gen);
		wake_up(&root->fs_info->transaction_throttle);
	}
	/* release whatever buffers are still referenced by the path */
	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
out:
	btrfs_free_path(path);
	return ret;
}
3879
3880int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3881 struct btrfs_root *root,
3882 struct extent_buffer *node,
3883 struct extent_buffer *parent)
3884{
3885 struct btrfs_path *path;
3886 int level;
3887 int parent_level;
3888 int ret = 0;
3889 int wret;
3890
3891 path = btrfs_alloc_path();
3892 BUG_ON(!path);
3893
3894 BUG_ON(!btrfs_tree_locked(parent));
3895 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent);
3899
3900 BUG_ON(!btrfs_tree_locked(node));
3901 level = btrfs_header_level(node);
3902 extent_buffer_get(node);
3903 path->nodes[level] = node;
3904 path->slots[level] = 0;
3905
3906 while (1) {
3907 wret = walk_down_subtree(trans, root, path, &level);
3908 if (wret < 0)
3909 ret = wret;
3910 if (wret != 0)
3911 break;
3912
3913 wret = walk_up_tree(trans, root, path, &level, parent_level);
3914 if (wret < 0)
3915 ret = wret;
3916 if (wret != 0)
3917 break;
3918 }
3919
3920 btrfs_free_path(path);
3921 return ret;
3922}
3923
/*
 * Return the last page index of a readahead window of 'nr' pages starting
 * at 'start', clamped so that it never exceeds 'last'.
 */
static unsigned long calc_ra(unsigned long start, unsigned long last,
			     unsigned long nr)
{
	unsigned long window_end = start + nr - 1;

	return window_end < last ? window_end : last;
}
3929
3930static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3931 u64 len)
3932{
3933 u64 page_start;
3934 u64 page_end;
3935 unsigned long first_index;
3936 unsigned long last_index;
3937 unsigned long i;
3938 struct page *page;
3939 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3940 struct file_ra_state *ra;
3941 struct btrfs_ordered_extent *ordered;
3942 unsigned int total_read = 0;
3943 unsigned int total_dirty = 0;
3944 int ret = 0;
3945
3946 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3947
3948 mutex_lock(&inode->i_mutex);
3949 first_index = start >> PAGE_CACHE_SHIFT;
3950 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3951
3952 /* make sure the dirty trick played by the caller work */
3953 ret = invalidate_inode_pages2_range(inode->i_mapping,
3954 first_index, last_index);
3955 if (ret)
3956 goto out_unlock;
3957
3958 file_ra_state_init(ra, inode->i_mapping);
3959
3960 for (i = first_index ; i <= last_index; i++) {
3961 if (total_read % ra->ra_pages == 0) {
3962 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3963 calc_ra(i, last_index, ra->ra_pages));
3964 }
3965 total_read++;
3966again:
3967 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3968 BUG_ON(1);
3969 page = grab_cache_page(inode->i_mapping, i);
3970 if (!page) {
3971 ret = -ENOMEM;
3972 goto out_unlock;
3973 }
3974 if (!PageUptodate(page)) {
3975 btrfs_readpage(NULL, page);
3976 lock_page(page);
3977 if (!PageUptodate(page)) {
3978 unlock_page(page);
3979 page_cache_release(page);
3980 ret = -EIO;
3981 goto out_unlock;
3982 }
3983 }
3984 wait_on_page_writeback(page);
3985
3986 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3987 page_end = page_start + PAGE_CACHE_SIZE - 1;
3988 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3989
3990 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3991 if (ordered) {
3992 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993 unlock_page(page);
3994 page_cache_release(page);
3995 btrfs_start_ordered_extent(inode, ordered, 1);
3996 btrfs_put_ordered_extent(ordered);
3997 goto again;
3998 }
3999 set_page_extent_mapped(page);
4000
4001 if (i == first_index)
4002 set_extent_bits(io_tree, page_start, page_end,
4003 EXTENT_BOUNDARY, GFP_NOFS);
4004 btrfs_set_extent_delalloc(inode, page_start, page_end);
4005
4006 set_page_dirty(page);
4007 total_dirty++;
4008
4009 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4010 unlock_page(page);
4011 page_cache_release(page);
4012 }
4013
4014out_unlock:
4015 kfree(ra);
4016 mutex_unlock(&inode->i_mutex);
4017 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4018 return ret;
4019}
4020
4021static noinline int relocate_data_extent(struct inode *reloc_inode,
4022 struct btrfs_key *extent_key,
4023 u64 offset)
4024{
4025 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4026 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4027 struct extent_map *em;
4028 u64 start = extent_key->objectid - offset;
4029 u64 end = start + extent_key->offset - 1;
4030
4031 em = alloc_extent_map(GFP_NOFS);
4032 BUG_ON(!em || IS_ERR(em));
4033
4034 em->start = start;
4035 em->len = extent_key->offset;
4036 em->block_len = extent_key->offset;
4037 em->block_start = extent_key->objectid;
4038 em->bdev = root->fs_info->fs_devices->latest_bdev;
4039 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4040
4041 /* setup extent map to cheat btrfs_readpage */
4042 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4043 while (1) {
4044 int ret;
4045 spin_lock(&em_tree->lock);
4046 ret = add_extent_mapping(em_tree, em);
4047 spin_unlock(&em_tree->lock);
4048 if (ret != -EEXIST) {
4049 free_extent_map(em);
4050 break;
4051 }
4052 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4053 }
4054 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4055
4056 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4057}
4058
4059struct btrfs_ref_path {
4060 u64 extent_start;
4061 u64 nodes[BTRFS_MAX_LEVEL];
4062 u64 root_objectid;
4063 u64 root_generation;
4064 u64 owner_objectid;
4065 u32 num_refs;
4066 int lowest_level;
4067 int current_level;
4068 int shared_level;
4069
4070 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4071 u64 new_nodes[BTRFS_MAX_LEVEL];
4072};
4073
4074struct disk_extent {
4075 u64 ram_bytes;
4076 u64 disk_bytenr;
4077 u64 disk_num_bytes;
4078 u64 offset;
4079 u64 num_bytes;
4080 u8 compression;
4081 u8 encryption;
4082 u16 other_encoding;
4083};
4084
4085static int is_cowonly_root(u64 root_objectid)
4086{
4087 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4088 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4089 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4090 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4091 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4092 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4093 return 1;
4094 return 0;
4095}
4096
4097static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4098 struct btrfs_root *extent_root,
4099 struct btrfs_ref_path *ref_path,
4100 int first_time)
4101{
4102 struct extent_buffer *leaf;
4103 struct btrfs_path *path;
4104 struct btrfs_extent_ref *ref;
4105 struct btrfs_key key;
4106 struct btrfs_key found_key;
4107 u64 bytenr;
4108 u32 nritems;
4109 int level;
4110 int ret = 1;
4111
4112 path = btrfs_alloc_path();
4113 if (!path)
4114 return -ENOMEM;
4115
4116 if (first_time) {
4117 ref_path->lowest_level = -1;
4118 ref_path->current_level = -1;
4119 ref_path->shared_level = -1;
4120 goto walk_up;
4121 }
4122walk_down:
4123 level = ref_path->current_level - 1;
4124 while (level >= -1) {
4125 u64 parent;
4126 if (level < ref_path->lowest_level)
4127 break;
4128
4129 if (level >= 0)
4130 bytenr = ref_path->nodes[level];
4131 else
4132 bytenr = ref_path->extent_start;
4133 BUG_ON(bytenr == 0);
4134
4135 parent = ref_path->nodes[level + 1];
4136 ref_path->nodes[level + 1] = 0;
4137 ref_path->current_level = level;
4138 BUG_ON(parent == 0);
4139
4140 key.objectid = bytenr;
4141 key.offset = parent + 1;
4142 key.type = BTRFS_EXTENT_REF_KEY;
4143
4144 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4145 if (ret < 0)
4146 goto out;
4147 BUG_ON(ret == 0);
4148
4149 leaf = path->nodes[0];
4150 nritems = btrfs_header_nritems(leaf);
4151 if (path->slots[0] >= nritems) {
4152 ret = btrfs_next_leaf(extent_root, path);
4153 if (ret < 0)
4154 goto out;
4155 if (ret > 0)
4156 goto next;
4157 leaf = path->nodes[0];
4158 }
4159
4160 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4161 if (found_key.objectid == bytenr &&
4162 found_key.type == BTRFS_EXTENT_REF_KEY) {
4163 if (level < ref_path->shared_level)
4164 ref_path->shared_level = level;
4165 goto found;
4166 }
4167next:
4168 level--;
4169 btrfs_release_path(extent_root, path);
4170 cond_resched();
4171 }
4172 /* reached lowest level */
4173 ret = 1;
4174 goto out;
4175walk_up:
4176 level = ref_path->current_level;
4177 while (level < BTRFS_MAX_LEVEL - 1) {
4178 u64 ref_objectid;
4179
4180 if (level >= 0)
4181 bytenr = ref_path->nodes[level];
4182 else
4183 bytenr = ref_path->extent_start;
4184
4185 BUG_ON(bytenr == 0);
4186
4187 key.objectid = bytenr;
4188 key.offset = 0;
4189 key.type = BTRFS_EXTENT_REF_KEY;
4190
4191 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4192 if (ret < 0)
4193 goto out;
4194
4195 leaf = path->nodes[0];
4196 nritems = btrfs_header_nritems(leaf);
4197 if (path->slots[0] >= nritems) {
4198 ret = btrfs_next_leaf(extent_root, path);
4199 if (ret < 0)
4200 goto out;
4201 if (ret > 0) {
4202 /* the extent was freed by someone */
4203 if (ref_path->lowest_level == level)
4204 goto out;
4205 btrfs_release_path(extent_root, path);
4206 goto walk_down;
4207 }
4208 leaf = path->nodes[0];
4209 }
4210
4211 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4212 if (found_key.objectid != bytenr ||
4213 found_key.type != BTRFS_EXTENT_REF_KEY) {
4214 /* the extent was freed by someone */
4215 if (ref_path->lowest_level == level) {
4216 ret = 1;
4217 goto out;
4218 }
4219 btrfs_release_path(extent_root, path);
4220 goto walk_down;
4221 }
4222found:
4223 ref = btrfs_item_ptr(leaf, path->slots[0],
4224 struct btrfs_extent_ref);
4225 ref_objectid = btrfs_ref_objectid(leaf, ref);
4226 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4227 if (first_time) {
4228 level = (int)ref_objectid;
4229 BUG_ON(level >= BTRFS_MAX_LEVEL);
4230 ref_path->lowest_level = level;
4231 ref_path->current_level = level;
4232 ref_path->nodes[level] = bytenr;
4233 } else {
4234 WARN_ON(ref_objectid != level);
4235 }
4236 } else {
4237 WARN_ON(level != -1);
4238 }
4239 first_time = 0;
4240
4241 if (ref_path->lowest_level == level) {
4242 ref_path->owner_objectid = ref_objectid;
4243 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4244 }
4245
4246 /*
4247 * the block is tree root or the block isn't in reference
4248 * counted tree.
4249 */
4250 if (found_key.objectid == found_key.offset ||
4251 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4252 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4253 ref_path->root_generation =
4254 btrfs_ref_generation(leaf, ref);
4255 if (level < 0) {
4256 /* special reference from the tree log */
4257 ref_path->nodes[0] = found_key.offset;
4258 ref_path->current_level = 0;
4259 }
4260 ret = 0;
4261 goto out;
4262 }
4263
4264 level++;
4265 BUG_ON(ref_path->nodes[level] != 0);
4266 ref_path->nodes[level] = found_key.offset;
4267 ref_path->current_level = level;
4268
4269 /*
4270 * the reference was created in the running transaction,
4271 * no need to continue walking up.
4272 */
4273 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4274 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4275 ref_path->root_generation =
4276 btrfs_ref_generation(leaf, ref);
4277 ret = 0;
4278 goto out;
4279 }
4280
4281 btrfs_release_path(extent_root, path);
4282 cond_resched();
4283 }
4284 /* reached max tree level, but no tree root found. */
4285 BUG();
4286out:
4287 btrfs_free_path(path);
4288 return ret;
4289}
4290
4291static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4292 struct btrfs_root *extent_root,
4293 struct btrfs_ref_path *ref_path,
4294 u64 extent_start)
4295{
4296 memset(ref_path, 0, sizeof(*ref_path));
4297 ref_path->extent_start = extent_start;
4298
4299 return __next_ref_path(trans, extent_root, ref_path, 1);
4300}
4301
/*
 * Continue a walk started by btrfs_first_ref_path() and move 'ref_path'
 * to the next reference path.  Returns whatever __next_ref_path() returns.
 */
static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct btrfs_ref_path *ref_path)
{
	const int first_time = 0;

	return __next_ref_path(trans, extent_root, ref_path, first_time);
}
4308
4309static noinline int get_new_locations(struct inode *reloc_inode,
4310 struct btrfs_key *extent_key,
4311 u64 offset, int no_fragment,
4312 struct disk_extent **extents,
4313 int *nr_extents)
4314{
4315 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4316 struct btrfs_path *path;
4317 struct btrfs_file_extent_item *fi;
4318 struct extent_buffer *leaf;
4319 struct disk_extent *exts = *extents;
4320 struct btrfs_key found_key;
4321 u64 cur_pos;
4322 u64 last_byte;
4323 u32 nritems;
4324 int nr = 0;
4325 int max = *nr_extents;
4326 int ret;
4327
4328 WARN_ON(!no_fragment && *extents);
4329 if (!exts) {
4330 max = 1;
4331 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4332 if (!exts)
4333 return -ENOMEM;
4334 }
4335
4336 path = btrfs_alloc_path();
4337 BUG_ON(!path);
4338
4339 cur_pos = extent_key->objectid - offset;
4340 last_byte = extent_key->objectid + extent_key->offset;
4341 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4342 cur_pos, 0);
4343 if (ret < 0)
4344 goto out;
4345 if (ret > 0) {
4346 ret = -ENOENT;
4347 goto out;
4348 }
4349
4350 while (1) {
4351 leaf = path->nodes[0];
4352 nritems = btrfs_header_nritems(leaf);
4353 if (path->slots[0] >= nritems) {
4354 ret = btrfs_next_leaf(root, path);
4355 if (ret < 0)
4356 goto out;
4357 if (ret > 0)
4358 break;
4359 leaf = path->nodes[0];
4360 }
4361
4362 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4363 if (found_key.offset != cur_pos ||
4364 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4365 found_key.objectid != reloc_inode->i_ino)
4366 break;
4367
4368 fi = btrfs_item_ptr(leaf, path->slots[0],
4369 struct btrfs_file_extent_item);
4370 if (btrfs_file_extent_type(leaf, fi) !=
4371 BTRFS_FILE_EXTENT_REG ||
4372 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4373 break;
4374
4375 if (nr == max) {
4376 struct disk_extent *old = exts;
4377 max *= 2;
4378 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4379 memcpy(exts, old, sizeof(*exts) * nr);
4380 if (old != *extents)
4381 kfree(old);
4382 }
4383
4384 exts[nr].disk_bytenr =
4385 btrfs_file_extent_disk_bytenr(leaf, fi);
4386 exts[nr].disk_num_bytes =
4387 btrfs_file_extent_disk_num_bytes(leaf, fi);
4388 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4389 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4390 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4391 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4392 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4393 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4394 fi);
4395 BUG_ON(exts[nr].offset > 0);
4396 BUG_ON(exts[nr].compression || exts[nr].encryption);
4397 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4398
4399 cur_pos += exts[nr].num_bytes;
4400 nr++;
4401
4402 if (cur_pos + offset >= last_byte)
4403 break;
4404
4405 if (no_fragment) {
4406 ret = 1;
4407 goto out;
4408 }
4409 path->slots[0]++;
4410 }
4411
4412 BUG_ON(cur_pos + offset > last_byte);
4413 if (cur_pos + offset < last_byte) {
4414 ret = -ENOENT;
4415 goto out;
4416 }
4417 ret = 0;
4418out:
4419 btrfs_free_path(path);
4420 if (ret) {
4421 if (exts != *extents)
4422 kfree(exts);
4423 } else {
4424 *extents = exts;
4425 *nr_extents = nr;
4426 }
4427 return ret;
4428}
4429
4430static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4431 struct btrfs_root *root,
4432 struct btrfs_path *path,
4433 struct btrfs_key *extent_key,
4434 struct btrfs_key *leaf_key,
4435 struct btrfs_ref_path *ref_path,
4436 struct disk_extent *new_extents,
4437 int nr_extents)
4438{
4439 struct extent_buffer *leaf;
4440 struct btrfs_file_extent_item *fi;
4441 struct inode *inode = NULL;
4442 struct btrfs_key key;
4443 u64 lock_start = 0;
4444 u64 lock_end = 0;
4445 u64 num_bytes;
4446 u64 ext_offset;
4447 u64 first_pos;
4448 u32 nritems;
4449 int nr_scaned = 0;
4450 int extent_locked = 0;
4451 int extent_type;
4452 int ret;
4453
4454 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid &&
4459 key.type < BTRFS_EXTENT_DATA_KEY)) {
4460 key.objectid = ref_path->owner_objectid;
4461 key.type = BTRFS_EXTENT_DATA_KEY;
4462 key.offset = 0;
4463 }
4464 }
4465
4466 while (1) {
4467 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468 if (ret < 0)
4469 goto out;
4470
4471 leaf = path->nodes[0];
4472 nritems = btrfs_header_nritems(leaf);
4473next:
4474 if (extent_locked && ret > 0) {
4475 /*
4476 * the file extent item was modified by someone
4477 * before the extent got locked.
4478 */
4479 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4480 lock_end, GFP_NOFS);
4481 extent_locked = 0;
4482 }
4483
4484 if (path->slots[0] >= nritems) {
4485 if (++nr_scaned > 2)
4486 break;
4487
4488 BUG_ON(extent_locked);
4489 ret = btrfs_next_leaf(root, path);
4490 if (ret < 0)
4491 goto out;
4492 if (ret > 0)
4493 break;
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496 }
4497
4498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4499
4500 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4501 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset))
4505 break;
4506 }
4507
4508 if (inode && key.objectid != inode->i_ino) {
4509 BUG_ON(extent_locked);
4510 btrfs_release_path(root, path);
4511 mutex_unlock(&inode->i_mutex);
4512 iput(inode);
4513 inode = NULL;
4514 continue;
4515 }
4516
4517 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4518 path->slots[0]++;
4519 ret = 1;
4520 goto next;
4521 }
4522 fi = btrfs_item_ptr(leaf, path->slots[0],
4523 struct btrfs_file_extent_item);
4524 extent_type = btrfs_file_extent_type(leaf, fi);
4525 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4526 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4527 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4528 extent_key->objectid)) {
4529 path->slots[0]++;
4530 ret = 1;
4531 goto next;
4532 }
4533
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536
4537 if (first_pos > key.offset - ext_offset)
4538 first_pos = key.offset - ext_offset;
4539
4540 if (!extent_locked) {
4541 lock_start = key.offset;
4542 lock_end = lock_start + num_bytes - 1;
4543 } else {
4544 if (lock_start > key.offset ||
4545 lock_end + 1 < key.offset + num_bytes) {
4546 unlock_extent(&BTRFS_I(inode)->io_tree,
4547 lock_start, lock_end, GFP_NOFS);
4548 extent_locked = 0;
4549 }
4550 }
4551
4552 if (!inode) {
4553 btrfs_release_path(root, path);
4554
4555 inode = btrfs_iget_locked(root->fs_info->sb,
4556 key.objectid, root);
4557 if (inode->i_state & I_NEW) {
4558 BTRFS_I(inode)->root = root;
4559 BTRFS_I(inode)->location.objectid =
4560 key.objectid;
4561 BTRFS_I(inode)->location.type =
4562 BTRFS_INODE_ITEM_KEY;
4563 BTRFS_I(inode)->location.offset = 0;
4564 btrfs_read_locked_inode(inode);
4565 unlock_new_inode(inode);
4566 }
4567 /*
4568 * some code call btrfs_commit_transaction while
4569 * holding the i_mutex, so we can't use mutex_lock
4570 * here.
4571 */
4572 if (is_bad_inode(inode) ||
4573 !mutex_trylock(&inode->i_mutex)) {
4574 iput(inode);
4575 inode = NULL;
4576 key.offset = (u64)-1;
4577 goto skip;
4578 }
4579 }
4580
4581 if (!extent_locked) {
4582 struct btrfs_ordered_extent *ordered;
4583
4584 btrfs_release_path(root, path);
4585
4586 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4587 lock_end, GFP_NOFS);
4588 ordered = btrfs_lookup_first_ordered_extent(inode,
4589 lock_end);
4590 if (ordered &&
4591 ordered->file_offset <= lock_end &&
4592 ordered->file_offset + ordered->len > lock_start) {
4593 unlock_extent(&BTRFS_I(inode)->io_tree,
4594 lock_start, lock_end, GFP_NOFS);
4595 btrfs_start_ordered_extent(inode, ordered, 1);
4596 btrfs_put_ordered_extent(ordered);
4597 key.offset += num_bytes;
4598 goto skip;
4599 }
4600 if (ordered)
4601 btrfs_put_ordered_extent(ordered);
4602
4603 extent_locked = 1;
4604 continue;
4605 }
4606
4607 if (nr_extents == 1) {
4608 /* update extent pointer in place */
4609 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4610 new_extents[0].disk_bytenr);
4611 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4612 new_extents[0].disk_num_bytes);
4613 btrfs_mark_buffer_dirty(leaf);
4614
4615 btrfs_drop_extent_cache(inode, key.offset,
4616 key.offset + num_bytes - 1, 0);
4617
4618 ret = btrfs_inc_extent_ref(trans, root,
4619 new_extents[0].disk_bytenr,
4620 new_extents[0].disk_num_bytes,
4621 leaf->start,
4622 root->root_key.objectid,
4623 trans->transid,
4624 key.objectid);
4625 BUG_ON(ret);
4626
4627 ret = btrfs_free_extent(trans, root,
4628 extent_key->objectid,
4629 extent_key->offset,
4630 leaf->start,
4631 btrfs_header_owner(leaf),
4632 btrfs_header_generation(leaf),
4633 key.objectid, 0);
4634 BUG_ON(ret);
4635
4636 btrfs_release_path(root, path);
4637 key.offset += num_bytes;
4638 } else {
4639 BUG_ON(1);
4640#if 0
4641 u64 alloc_hint;
4642 u64 extent_len;
4643 int i;
4644 /*
4645 * drop old extent pointer at first, then insert the
4646 * new pointers one bye one
4647 */
4648 btrfs_release_path(root, path);
4649 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4650 key.offset + num_bytes,
4651 key.offset, &alloc_hint);
4652 BUG_ON(ret);
4653
4654 for (i = 0; i < nr_extents; i++) {
4655 if (ext_offset >= new_extents[i].num_bytes) {
4656 ext_offset -= new_extents[i].num_bytes;
4657 continue;
4658 }
4659 extent_len = min(new_extents[i].num_bytes -
4660 ext_offset, num_bytes);
4661
4662 ret = btrfs_insert_empty_item(trans, root,
4663 path, &key,
4664 sizeof(*fi));
4665 BUG_ON(ret);
4666
4667 leaf = path->nodes[0];
4668 fi = btrfs_item_ptr(leaf, path->slots[0],
4669 struct btrfs_file_extent_item);
4670 btrfs_set_file_extent_generation(leaf, fi,
4671 trans->transid);
4672 btrfs_set_file_extent_type(leaf, fi,
4673 BTRFS_FILE_EXTENT_REG);
4674 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4675 new_extents[i].disk_bytenr);
4676 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4677 new_extents[i].disk_num_bytes);
4678 btrfs_set_file_extent_ram_bytes(leaf, fi,
4679 new_extents[i].ram_bytes);
4680
4681 btrfs_set_file_extent_compression(leaf, fi,
4682 new_extents[i].compression);
4683 btrfs_set_file_extent_encryption(leaf, fi,
4684 new_extents[i].encryption);
4685 btrfs_set_file_extent_other_encoding(leaf, fi,
4686 new_extents[i].other_encoding);
4687
4688 btrfs_set_file_extent_num_bytes(leaf, fi,
4689 extent_len);
4690 ext_offset += new_extents[i].offset;
4691 btrfs_set_file_extent_offset(leaf, fi,
4692 ext_offset);
4693 btrfs_mark_buffer_dirty(leaf);
4694
4695 btrfs_drop_extent_cache(inode, key.offset,
4696 key.offset + extent_len - 1, 0);
4697
4698 ret = btrfs_inc_extent_ref(trans, root,
4699 new_extents[i].disk_bytenr,
4700 new_extents[i].disk_num_bytes,
4701 leaf->start,
4702 root->root_key.objectid,
4703 trans->transid, key.objectid);
4704 BUG_ON(ret);
4705 btrfs_release_path(root, path);
4706
4707 inode_add_bytes(inode, extent_len);
4708
4709 ext_offset = 0;
4710 num_bytes -= extent_len;
4711 key.offset += extent_len;
4712
4713 if (num_bytes == 0)
4714 break;
4715 }
4716 BUG_ON(i >= nr_extents);
4717#endif
4718 }
4719
4720 if (extent_locked) {
4721 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4722 lock_end, GFP_NOFS);
4723 extent_locked = 0;
4724 }
4725skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset)
4728 break;
4729
4730 cond_resched();
4731 }
4732 ret = 0;
4733out:
4734 btrfs_release_path(root, path);
4735 if (inode) {
4736 mutex_unlock(&inode->i_mutex);
4737 if (extent_locked) {
4738 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4739 lock_end, GFP_NOFS);
4740 }
4741 iput(inode);
4742 }
4743 return ret;
4744}
4745
4746int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4747 struct btrfs_root *root,
4748 struct extent_buffer *buf, u64 orig_start)
4749{
4750 int level;
4751 int ret;
4752
4753 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4754 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4755
4756 level = btrfs_header_level(buf);
4757 if (level == 0) {
4758 struct btrfs_leaf_ref *ref;
4759 struct btrfs_leaf_ref *orig_ref;
4760
4761 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4762 if (!orig_ref)
4763 return -ENOENT;
4764
4765 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4766 if (!ref) {
4767 btrfs_free_leaf_ref(root, orig_ref);
4768 return -ENOMEM;
4769 }
4770
4771 ref->nritems = orig_ref->nritems;
4772 memcpy(ref->extents, orig_ref->extents,
4773 sizeof(ref->extents[0]) * ref->nritems);
4774
4775 btrfs_free_leaf_ref(root, orig_ref);
4776
4777 ref->root_gen = trans->transid;
4778 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf);
4781 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref);
4784 }
4785 return 0;
4786}
4787
4788static noinline int invalidate_extent_cache(struct btrfs_root *root,
4789 struct extent_buffer *leaf,
4790 struct btrfs_block_group_cache *group,
4791 struct btrfs_root *target_root)
4792{
4793 struct btrfs_key key;
4794 struct inode *inode = NULL;
4795 struct btrfs_file_extent_item *fi;
4796 u64 num_bytes;
4797 u64 skip_objectid = 0;
4798 u32 nritems;
4799 u32 i;
4800
4801 nritems = btrfs_header_nritems(leaf);
4802 for (i = 0; i < nritems; i++) {
4803 btrfs_item_key_to_cpu(leaf, &key, i);
4804 if (key.objectid == skip_objectid ||
4805 key.type != BTRFS_EXTENT_DATA_KEY)
4806 continue;
4807 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4808 if (btrfs_file_extent_type(leaf, fi) ==
4809 BTRFS_FILE_EXTENT_INLINE)
4810 continue;
4811 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4812 continue;
4813 if (!inode || inode->i_ino != key.objectid) {
4814 iput(inode);
4815 inode = btrfs_ilookup(target_root->fs_info->sb,
4816 key.objectid, target_root, 1);
4817 }
4818 if (!inode) {
4819 skip_objectid = key.objectid;
4820 continue;
4821 }
4822 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4823
4824 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4825 key.offset + num_bytes - 1, GFP_NOFS);
4826 btrfs_drop_extent_cache(inode, key.offset,
4827 key.offset + num_bytes - 1, 1);
4828 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 cond_resched();
4831 }
4832 iput(inode);
4833 return 0;
4834}
4835
4836static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4837 struct btrfs_root *root,
4838 struct extent_buffer *leaf,
4839 struct btrfs_block_group_cache *group,
4840 struct inode *reloc_inode)
4841{
4842 struct btrfs_key key;
4843 struct btrfs_key extent_key;
4844 struct btrfs_file_extent_item *fi;
4845 struct btrfs_leaf_ref *ref;
4846 struct disk_extent *new_extent;
4847 u64 bytenr;
4848 u64 num_bytes;
4849 u32 nritems;
4850 u32 i;
4851 int ext_index;
4852 int nr_extent;
4853 int ret;
4854
4855 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4856 BUG_ON(!new_extent);
4857
4858 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4859 BUG_ON(!ref);
4860
4861 ext_index = -1;
4862 nritems = btrfs_header_nritems(leaf);
4863 for (i = 0; i < nritems; i++) {
4864 btrfs_item_key_to_cpu(leaf, &key, i);
4865 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4866 continue;
4867 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4868 if (btrfs_file_extent_type(leaf, fi) ==
4869 BTRFS_FILE_EXTENT_INLINE)
4870 continue;
4871 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4872 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4873 if (bytenr == 0)
4874 continue;
4875
4876 ext_index++;
4877 if (bytenr >= group->key.objectid + group->key.offset ||
4878 bytenr + num_bytes <= group->key.objectid)
4879 continue;
4880
4881 extent_key.objectid = bytenr;
4882 extent_key.offset = num_bytes;
4883 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4884 nr_extent = 1;
4885 ret = get_new_locations(reloc_inode, &extent_key,
4886 group->key.objectid, 1,
4887 &new_extent, &nr_extent);
4888 if (ret > 0)
4889 continue;
4890 BUG_ON(ret < 0);
4891
4892 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4893 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4894 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4895 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4896
4897 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4898 new_extent->disk_bytenr);
4899 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4900 new_extent->disk_num_bytes);
4901 btrfs_mark_buffer_dirty(leaf);
4902
4903 ret = btrfs_inc_extent_ref(trans, root,
4904 new_extent->disk_bytenr,
4905 new_extent->disk_num_bytes,
4906 leaf->start,
4907 root->root_key.objectid,
4908 trans->transid, key.objectid);
4909 BUG_ON(ret);
4910 ret = btrfs_free_extent(trans, root,
4911 bytenr, num_bytes, leaf->start,
4912 btrfs_header_owner(leaf),
4913 btrfs_header_generation(leaf),
4914 key.objectid, 0);
4915 BUG_ON(ret);
4916 cond_resched();
4917 }
4918 kfree(new_extent);
4919 BUG_ON(ext_index + 1 != ref->nritems);
4920 btrfs_free_leaf_ref(root, ref);
4921 return 0;
4922}
4923
4924int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4925 struct btrfs_root *root)
4926{
4927 struct btrfs_root *reloc_root;
4928 int ret;
4929
4930 if (root->reloc_root) {
4931 reloc_root = root->reloc_root;
4932 root->reloc_root = NULL;
4933 list_add(&reloc_root->dead_list,
4934 &root->fs_info->dead_reloc_roots);
4935
4936 btrfs_set_root_bytenr(&reloc_root->root_item,
4937 reloc_root->node->start);
4938 btrfs_set_root_level(&root->root_item,
4939 btrfs_header_level(reloc_root->node));
4940 memset(&reloc_root->root_item.drop_progress, 0,
4941 sizeof(struct btrfs_disk_key));
4942 reloc_root->root_item.drop_level = 0;
4943
4944 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4945 &reloc_root->root_key,
4946 &reloc_root->root_item);
4947 BUG_ON(ret);
4948 }
4949 return 0;
4950}
4951
/*
 * Drop every reloc tree currently queued on fs_info->dead_reloc_roots.
 *
 * The pending list is spliced onto a private list and processed from its
 * tail.  For each reloc root the snapshot is dropped (retrying in a fresh
 * transaction handle whenever btrfs_drop_snapshot() returns -EAGAIN), its
 * root item is deleted from the tree of tree roots and the in-memory root
 * is freed.
 *
 * Always returns 0; failures of the transaction helpers and of
 * btrfs_del_root() trip BUG_ON().
 */
int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *reloc_root;
	struct btrfs_root *prev_root = NULL;
	struct list_head dead_roots;
	int ret;
	unsigned long nr;

	INIT_LIST_HEAD(&dead_roots);
	/* take over the whole pending list; the shared list is left empty */
	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);

	while (!list_empty(&dead_roots)) {
		reloc_root = list_entry(dead_roots.prev,
					struct btrfs_root, dead_list);
		list_del_init(&reloc_root->dead_list);

		BUG_ON(reloc_root->commit_root != NULL);
		while (1) {
			trans = btrfs_join_transaction(root, 1);
			BUG_ON(!trans);

			mutex_lock(&root->fs_info->drop_mutex);
			ret = btrfs_drop_snapshot(trans, reloc_root);
			/*
			 * Anything other than -EAGAIN leaves the loop with
			 * drop_mutex still held and the transaction handle
			 * still open; both are released further down.
			 * NOTE(review): other error codes from
			 * btrfs_drop_snapshot() are not inspected here.
			 */
			if (ret != -EAGAIN)
				break;
			mutex_unlock(&root->fs_info->drop_mutex);

			/* end this handle and retry the drop with a new one */
			nr = trans->blocks_used;
			ret = btrfs_end_transaction(trans, root);
			BUG_ON(ret);
			btrfs_btree_balance_dirty(root, nr);
		}

		free_extent_buffer(reloc_root->node);

		ret = btrfs_del_root(trans, root->fs_info->tree_root,
				     &reloc_root->root_key);
		BUG_ON(ret);
		mutex_unlock(&root->fs_info->drop_mutex);

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, root);
		BUG_ON(ret);
		btrfs_btree_balance_dirty(root, nr);

		/*
		 * Freeing lags one iteration behind: the root handled in the
		 * previous pass is released now, the current one is kept
		 * until the next pass (or the epilogue below).
		 */
		kfree(prev_root);
		prev_root = reloc_root;
	}
	if (prev_root) {
		/* the last root processed also has its leaf refs removed */
		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
		kfree(prev_root);
	}
	return 0;
}
5007
5008int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5009{
5010 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5011 return 0;
5012}
5013
5014int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5015{
5016 struct btrfs_root *reloc_root;
5017 struct btrfs_trans_handle *trans;
5018 struct btrfs_key location;
5019 int found;
5020 int ret;
5021
5022 mutex_lock(&root->fs_info->tree_reloc_mutex);
5023 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5024 BUG_ON(ret);
5025 found = !list_empty(&root->fs_info->dead_reloc_roots);
5026 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5027
5028 if (found) {
5029 trans = btrfs_start_transaction(root, 1);
5030 BUG_ON(!trans);
5031 ret = btrfs_commit_transaction(trans, root);
5032 BUG_ON(ret);
5033 }
5034
5035 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5036 location.offset = (u64)-1;
5037 location.type = BTRFS_ROOT_ITEM_KEY;
5038
5039 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5040 BUG_ON(!reloc_root);
5041 btrfs_orphan_cleanup(reloc_root);
5042 return 0;
5043}
5044
5045static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5046 struct btrfs_root *root)
5047{
5048 struct btrfs_root *reloc_root;
5049 struct extent_buffer *eb;
5050 struct btrfs_root_item *root_item;
5051 struct btrfs_key root_key;
5052 int ret;
5053
5054 BUG_ON(!root->ref_cows);
5055 if (root->reloc_root)
5056 return 0;
5057
5058 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5059 BUG_ON(!root_item);
5060
5061 ret = btrfs_copy_root(trans, root, root->commit_root,
5062 &eb, BTRFS_TREE_RELOC_OBJECTID);
5063 BUG_ON(ret);
5064
5065 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5066 root_key.offset = root->root_key.objectid;
5067 root_key.type = BTRFS_ROOT_ITEM_KEY;
5068
5069 memcpy(root_item, &root->root_item, sizeof(root_item));
5070 btrfs_set_root_refs(root_item, 0);
5071 btrfs_set_root_bytenr(root_item, eb->start);
5072 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5073 btrfs_set_root_generation(root_item, trans->transid);
5074
5075 btrfs_tree_unlock(eb);
5076 free_extent_buffer(eb);
5077
5078 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5079 &root_key, root_item);
5080 BUG_ON(ret);
5081 kfree(root_item);
5082
5083 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5084 &root_key);
5085 BUG_ON(!reloc_root);
5086 reloc_root->last_trans = trans->transid;
5087 reloc_root->commit_root = NULL;
5088 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5089
5090 root->reloc_root = reloc_root;
5091 return 0;
5092}
5093
/*
 * Core function of space balance.
 *
 * The idea is to use reloc trees to relocate tree blocks in reference
 * counted roots. There is one reloc tree for each subvol, and all
 * reloc trees share the same root key objectid. Reloc trees are snapshots
 * of the latest committed roots of subvols (root->commit_root).
 *
 * To relocate a tree block referenced by a subvol, there are two steps:
 * COW the block through the subvol's reloc tree, then update the block
 * pointer in the subvol to point to the new block. Since all reloc trees
 * share the same root key objectid, doing special handling for tree blocks
 * owned by them is easy. Once a tree block has been COWed in one reloc
 * tree, we can use the resulting new block directly when the same block
 * is required to be COWed again through other reloc trees. This way,
 * relocated tree blocks are shared between reloc trees, so they are also
 * shared between subvols.
 *
 * Roots that are not reference counted (!root->ref_cows) skip the reloc
 * tree entirely: a COWing search down to the target level is performed
 * and the function returns.
 *
 * Always returns 0; failures trip BUG_ON().
 */
static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_key *first_key,
				      struct btrfs_ref_path *ref_path,
				      struct btrfs_block_group_cache *group,
				      struct inode *reloc_inode)
{
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb = NULL;
	struct btrfs_key *keys;
	u64 *nodes;
	int level;
	int shared_level;
	int lowest_level = 0;
	int ret;

	/*
	 * an owner objectid below BTRFS_FIRST_FREE_OBJECTID is used as the
	 * level of the tree block being relocated; otherwise work at level 0
	 */
	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
		lowest_level = ref_path->owner_objectid;

	if (!root->ref_cows) {
		path->lowest_level = lowest_level;
		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
		BUG_ON(ret < 0);
		path->lowest_level = 0;
		btrfs_release_path(root, path);
		return 0;
	}

	mutex_lock(&root->fs_info->tree_reloc_mutex);
	ret = init_reloc_tree(trans, root);
	BUG_ON(ret);
	reloc_root = root->reloc_root;

	/* remember the caller's shared level, then reset it in ref_path */
	shared_level = ref_path->shared_level;
	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;

	/* forget recorded keys/blocks for every level above shared_level */
	keys = ref_path->node_keys;
	nodes = ref_path->new_nodes;
	memset(&keys[shared_level + 1], 0,
	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
	memset(&nodes[shared_level + 1], 0,
	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));

	if (nodes[lowest_level] == 0) {
		/*
		 * no block recorded at the target level yet: COW the path in
		 * the reloc tree and record the block start and first key of
		 * each level, stopping at the reloc tree's root node
		 */
		path->lowest_level = lowest_level;
		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
					0, 1);
		BUG_ON(ret);
		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
			eb = path->nodes[level];
			if (!eb || eb == reloc_root->node)
				break;
			nodes[level] = eb->start;
			if (level == 0)
				btrfs_item_key_to_cpu(eb, &keys[level], 0);
			else
				btrfs_node_key_to_cpu(eb, &keys[level], 0);
		}
		/* for data extents, also rewrite the extents held by the leaf */
		if (nodes[0] &&
		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
			eb = path->nodes[0];
			ret = replace_extents_in_leaf(trans, reloc_root, eb,
						      group, reloc_inode);
			BUG_ON(ret);
		}
		btrfs_release_path(reloc_root, path);
	} else {
		/* blocks are already recorded: merge them into this reloc tree */
		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
				       lowest_level);
		BUG_ON(ret);
	}

	/*
	 * replace tree blocks in the fs tree with tree blocks in
	 * the reloc tree.
	 */
	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
	BUG_ON(ret < 0);

	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
					0, 0);
		BUG_ON(ret);
		/* hold our own reference on the leaf across the path release */
		extent_buffer_get(path->nodes[0]);
		eb = path->nodes[0];
		btrfs_release_path(reloc_root, path);
		ret = invalidate_extent_cache(reloc_root, eb, group, root);
		BUG_ON(ret);
		free_extent_buffer(eb);
	}

	mutex_unlock(&root->fs_info->tree_reloc_mutex);
	path->lowest_level = 0;
	return 0;
}
5208
5209static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5210 struct btrfs_root *root,
5211 struct btrfs_path *path,
5212 struct btrfs_key *first_key,
5213 struct btrfs_ref_path *ref_path)
5214{
5215 int ret;
5216
5217 ret = relocate_one_path(trans, root, path, first_key,
5218 ref_path, NULL, NULL);
5219 BUG_ON(ret);
5220
5221 if (root == root->fs_info->extent_root)
5222 btrfs_extent_post_op(trans, root);
5223
5224 return 0;
5225}
5226
5227static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5228 struct btrfs_root *extent_root,
5229 struct btrfs_path *path,
5230 struct btrfs_key *extent_key)
5231{
5232 int ret;
5233
5234 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5235 if (ret)
5236 goto out;
5237 ret = btrfs_del_item(trans, extent_root, path);
5238out:
5239 btrfs_release_path(extent_root, path);
5240 return ret;
5241}
5242
5243static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5244 struct btrfs_ref_path *ref_path)
5245{
5246 struct btrfs_key root_key;
5247
5248 root_key.objectid = ref_path->root_objectid;
5249 root_key.type = BTRFS_ROOT_ITEM_KEY;
5250 if (is_cowonly_root(ref_path->root_objectid))
5251 root_key.offset = 0;
5252 else
5253 root_key.offset = (u64)-1;
5254
5255 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5256}
5257
5258static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5259 struct btrfs_path *path,
5260 struct btrfs_key *extent_key,
5261 struct btrfs_block_group_cache *group,
5262 struct inode *reloc_inode, int pass)
5263{
5264 struct btrfs_trans_handle *trans;
5265 struct btrfs_root *found_root;
5266 struct btrfs_ref_path *ref_path = NULL;
5267 struct disk_extent *new_extents = NULL;
5268 int nr_extents = 0;
5269 int loops;
5270 int ret;
5271 int level;
5272 struct btrfs_key first_key;
5273 u64 prev_block = 0;
5274
5275
5276 trans = btrfs_start_transaction(extent_root, 1);
5277 BUG_ON(!trans);
5278
5279 if (extent_key->objectid == 0) {
5280 ret = del_extent_zero(trans, extent_root, path, extent_key);
5281 goto out;
5282 }
5283
5284 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5285 if (!ref_path) {
5286 ret = -ENOMEM;
5287 goto out;
5288 }
5289
5290 for (loops = 0; ; loops++) {
5291 if (loops == 0) {
5292 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5293 extent_key->objectid);
5294 } else {
5295 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5296 }
5297 if (ret < 0)
5298 goto out;
5299 if (ret > 0)
5300 break;
5301
5302 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5303 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5304 continue;
5305
5306 found_root = read_ref_root(extent_root->fs_info, ref_path);
5307 BUG_ON(!found_root);
5308 /*
5309 * for reference counted tree, only process reference paths
5310 * rooted at the latest committed root.
5311 */
5312 if (found_root->ref_cows &&
5313 ref_path->root_generation != found_root->root_key.offset)
5314 continue;
5315
5316 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5317 if (pass == 0) {
5318 /*
5319 * copy data extents to new locations
5320 */
5321 u64 group_start = group->key.objectid;
5322 ret = relocate_data_extent(reloc_inode,
5323 extent_key,
5324 group_start);
5325 if (ret < 0)
5326 goto out;
5327 break;
5328 }
5329 level = 0;
5330 } else {
5331 level = ref_path->owner_objectid;
5332 }
5333
5334 if (prev_block != ref_path->nodes[level]) {
5335 struct extent_buffer *eb;
5336 u64 block_start = ref_path->nodes[level];
5337 u64 block_size = btrfs_level_size(found_root, level);
5338
5339 eb = read_tree_block(found_root, block_start,
5340 block_size, 0);
5341 btrfs_tree_lock(eb);
5342 BUG_ON(level != btrfs_header_level(eb));
5343
5344 if (level == 0)
5345 btrfs_item_key_to_cpu(eb, &first_key, 0);
5346 else
5347 btrfs_node_key_to_cpu(eb, &first_key, 0);
5348
5349 btrfs_tree_unlock(eb);
5350 free_extent_buffer(eb);
5351 prev_block = block_start;
5352 }
5353
5354 btrfs_record_root_in_trans(found_root);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /*
5357 * try to update data extent references while
5358 * keeping metadata shared between snapshots.
5359 */
5360 if (pass == 1) {
5361 ret = relocate_one_path(trans, found_root,
5362 path, &first_key, ref_path,
5363 group, reloc_inode);
5364 if (ret < 0)
5365 goto out;
5366 continue;
5367 }
5368 /*
5369 * use fallback method to process the remaining
5370 * references.
5371 */
5372 if (!new_extents) {
5373 u64 group_start = group->key.objectid;
5374 new_extents = kmalloc(sizeof(*new_extents),
5375 GFP_NOFS);
5376 nr_extents = 1;
5377 ret = get_new_locations(reloc_inode,
5378 extent_key,
5379 group_start, 1,
5380 &new_extents,
5381 &nr_extents);
5382 if (ret)
5383 goto out;
5384 }
5385 ret = replace_one_extent(trans, found_root,
5386 path, extent_key,
5387 &first_key, ref_path,
5388 new_extents, nr_extents);
5389 } else {
5390 ret = relocate_tree_block(trans, found_root, path,
5391 &first_key, ref_path);
5392 }
5393 if (ret < 0)
5394 goto out;
5395 }
5396 ret = 0;
5397out:
5398 btrfs_end_transaction(trans, extent_root);
5399 kfree(new_extents);
5400 kfree(ref_path);
5401 return ret;
5402}
5403
5404static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5405{
5406 u64 num_devices;
5407 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5408 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5409
5410 num_devices = root->fs_info->fs_devices->rw_devices;
5411 if (num_devices == 1) {
5412 stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 stripped = flags & ~stripped;
5414
5415 /* turn raid0 into single device chunks */
5416 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5417 return stripped;
5418
5419 /* turn mirroring into duplication */
5420 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5421 BTRFS_BLOCK_GROUP_RAID10))
5422 return stripped | BTRFS_BLOCK_GROUP_DUP;
5423 return flags;
5424 } else {
5425 /* they already had raid on here, just return */
5426 if (flags & stripped)
5427 return flags;
5428
5429 stripped |= BTRFS_BLOCK_GROUP_DUP;
5430 stripped = flags & ~stripped;
5431
5432 /* switch duplicated blocks with raid1 */
5433 if (flags & BTRFS_BLOCK_GROUP_DUP)
5434 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5435
5436 /* turn single device chunks into raid0 */
5437 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5438 }
5439 return flags;
5440}
5441
5442static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5443 struct btrfs_block_group_cache *shrink_block_group,
5444 int force)
5445{
5446 struct btrfs_trans_handle *trans;
5447 u64 new_alloc_flags;
5448 u64 calc;
5449
5450 spin_lock(&shrink_block_group->lock);
5451 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5452 spin_unlock(&shrink_block_group->lock);
5453
5454 trans = btrfs_start_transaction(root, 1);
5455 spin_lock(&shrink_block_group->lock);
5456
5457 new_alloc_flags = update_block_group_flags(root,
5458 shrink_block_group->flags);
5459 if (new_alloc_flags != shrink_block_group->flags) {
5460 calc =
5461 btrfs_block_group_used(&shrink_block_group->item);
5462 } else {
5463 calc = shrink_block_group->key.offset;
5464 }
5465 spin_unlock(&shrink_block_group->lock);
5466
5467 do_chunk_alloc(trans, root->fs_info->extent_root,
5468 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5469
5470 btrfs_end_transaction(trans, root);
5471 } else
5472 spin_unlock(&shrink_block_group->lock);
5473 return 0;
5474}
5475
5476static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5477 struct btrfs_root *root,
5478 u64 objectid, u64 size)
5479{
5480 struct btrfs_path *path;
5481 struct btrfs_inode_item *item;
5482 struct extent_buffer *leaf;
5483 int ret;
5484
5485 path = btrfs_alloc_path();
5486 if (!path)
5487 return -ENOMEM;
5488
5489 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5490 if (ret)
5491 goto out;
5492
5493 leaf = path->nodes[0];
5494 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5495 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5496 btrfs_set_inode_generation(leaf, item, 1);
5497 btrfs_set_inode_size(leaf, item, size);
5498 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5499 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5500 btrfs_mark_buffer_dirty(leaf);
5501 btrfs_release_path(root, path);
5502out:
5503 btrfs_free_path(path);
5504 return ret;
5505}
5506
5507static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5508 struct btrfs_block_group_cache *group)
5509{
5510 struct inode *inode = NULL;
5511 struct btrfs_trans_handle *trans;
5512 struct btrfs_root *root;
5513 struct btrfs_key root_key;
5514 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5515 int err = 0;
5516
5517 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5518 root_key.type = BTRFS_ROOT_ITEM_KEY;
5519 root_key.offset = (u64)-1;
5520 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5521 if (IS_ERR(root))
5522 return ERR_CAST(root);
5523
5524 trans = btrfs_start_transaction(root, 1);
5525 BUG_ON(!trans);
5526
5527 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5528 if (err)
5529 goto out;
5530
5531 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5532 BUG_ON(err);
5533
5534 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5535 group->key.offset, 0, group->key.offset,
5536 0, 0, 0);
5537 BUG_ON(err);
5538
5539 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5540 if (inode->i_state & I_NEW) {
5541 BTRFS_I(inode)->root = root;
5542 BTRFS_I(inode)->location.objectid = objectid;
5543 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5544 BTRFS_I(inode)->location.offset = 0;
5545 btrfs_read_locked_inode(inode);
5546 unlock_new_inode(inode);
5547 BUG_ON(is_bad_inode(inode));
5548 } else {
5549 BUG_ON(1);
5550 }
5551 BTRFS_I(inode)->index_cnt = group->key.objectid;
5552
5553 err = btrfs_orphan_add(trans, inode);
5554out:
5555 btrfs_end_transaction(trans, root);
5556 if (err) {
5557 if (inode)
5558 iput(inode);
5559 inode = ERR_PTR(err);
5560 }
5561 return inode;
5562}
5563
5564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5565{
5566
5567 struct btrfs_ordered_sum *sums;
5568 struct btrfs_sector_sum *sector_sum;
5569 struct btrfs_ordered_extent *ordered;
5570 struct btrfs_root *root = BTRFS_I(inode)->root;
5571 struct list_head list;
5572 size_t offset;
5573 int ret;
5574 u64 disk_bytenr;
5575
5576 INIT_LIST_HEAD(&list);
5577
5578 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5579 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5580
5581 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5582 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5583 disk_bytenr + len - 1, &list);
5584
5585 while (!list_empty(&list)) {
5586 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5587 list_del_init(&sums->list);
5588
5589 sector_sum = sums->sums;
5590 sums->bytenr = ordered->start;
5591
5592 offset = 0;
5593 while (offset < sums->len) {
5594 sector_sum->bytenr += ordered->start - disk_bytenr;
5595 sector_sum++;
5596 offset += root->sectorsize;
5597 }
5598
5599 btrfs_add_ordered_sum(inode, ordered, sums);
5600 }
5601 btrfs_put_ordered_extent(ordered);
5602 return 0;
5603}
5604
/*
 * Move everything out of the block group starting at @group_start.
 *
 * The group is marked read-only and its extent items are then scanned
 * repeatedly, one relocate_one_extent() call per extent.  Each scan is a
 * "pass"; the pass number selects the strategy relocate_one_extent()
 * uses.  Scanning repeats until a scan finds no extent items left in the
 * group's range.
 *
 * Returns 0 on success or a negative errno from the tree search.
 * NOTE(review): the `goto out` error paths skip both iput(reloc_inode)
 * and put_block_group(block_group).
 */
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_fs_info *info = root->fs_info;
	struct extent_buffer *leaf;
	struct inode *reloc_inode;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_key key;
	u64 skipped;
	u64 cur_byte;
	u64 total_found;
	u32 nritems;
	int ret;
	int progress;
	int pass = 0;

	/* all searches below run against the extent tree */
	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(info, group_start);
	BUG_ON(!block_group);

	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
	       (unsigned long long)block_group->key.objectid,
	       (unsigned long long)block_group->flags);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	reloc_inode = create_reloc_inode(info, block_group);
	BUG_ON(IS_ERR(reloc_inode));

	__alloc_chunk_for_shrink(root, block_group, 1);
	set_block_group_readonly(block_group);

	btrfs_start_delalloc_inodes(info->tree_root);
	btrfs_wait_ordered_extents(info->tree_root, 0);
again:
	/* (re)start a scan from the beginning of the block group */
	skipped = 0;
	total_found = 0;
	progress = 0;
	key.objectid = block_group->key.objectid;
	key.offset = 0;
	key.type = 0;
	cur_byte = key.objectid;

	/* start a transaction and commit it right away before each scan */
	trans = btrfs_start_transaction(info->tree_root, 1);
	btrfs_commit_transaction(trans, info->tree_root);

	mutex_lock(&root->fs_info->cleaner_mutex);
	btrfs_clean_old_snapshots(info->tree_root);
	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
	mutex_unlock(&root->fs_info->cleaner_mutex);

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
next:
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			/* no further leaf: this scan is finished */
			if (ret == 1) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		/* stop at the first key beyond the end of the block group */
		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		/*
		 * if a reschedule is due, drop the path, yield and search
		 * again from the current key
		 */
		if (progress && need_resched()) {
			btrfs_release_path(root, path);
			cond_resched();
			progress = 0;
			continue;
		}
		progress = 1;

		/*
		 * skip items that are not extent items, and extents that end
		 * at or before the position already processed
		 */
		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
		    key.objectid + key.offset <= cur_byte) {
			path->slots[0]++;
			goto next;
		}

		total_found++;
		cur_byte = key.objectid + key.offset;
		btrfs_release_path(root, path);

		__alloc_chunk_for_shrink(root, block_group, 0);
		ret = relocate_one_extent(root, path, &key, block_group,
					  reloc_inode, pass);
		BUG_ON(ret < 0);
		/* a positive return is counted as a skipped extent */
		if (ret > 0)
			skipped++;

		/* continue the search right after the extent just handled */
		key.objectid = cur_byte;
		key.type = 0;
		key.offset = 0;
	}

	btrfs_release_path(root, path);

	if (pass == 0) {
		/* wait for the pass-0 data copies, then drop the cached pages */
		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
	}

	if (total_found > 0) {
		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
		       (unsigned long long)total_found, pass);
		pass++;
		/*
		 * every extent found was skipped and we are past pass 2:
		 * start over at pass 0 with a fresh relocation inode.
		 * NOTE(review): the result of this create_reloc_inode() call
		 * is not checked with IS_ERR(), unlike the first one.
		 */
		if (total_found == skipped && pass > 2) {
			iput(reloc_inode);
			reloc_inode = create_reloc_inode(info, block_group);
			pass = 0;
		}
		goto again;
	}

	/* delete reloc_inode */
	iput(reloc_inode);

	/* unpin extents in this range */
	trans = btrfs_start_transaction(info->tree_root, 1);
	btrfs_commit_transaction(trans, info->tree_root);

	/* the group is expected to be completely empty by now */
	spin_lock(&block_group->lock);
	WARN_ON(block_group->pinned > 0);
	WARN_ON(block_group->reserved > 0);
	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
	spin_unlock(&block_group->lock);
	put_block_group(block_group);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
5751
5752static int find_first_block_group(struct btrfs_root *root,
5753 struct btrfs_path *path, struct btrfs_key *key)
5754{
5755 int ret = 0;
5756 struct btrfs_key found_key;
5757 struct extent_buffer *leaf;
5758 int slot;
5759
5760 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5761 if (ret < 0)
5762 goto out;
5763
5764 while (1) {
5765 slot = path->slots[0];
5766 leaf = path->nodes[0];
5767 if (slot >= btrfs_header_nritems(leaf)) {
5768 ret = btrfs_next_leaf(root, path);
5769 if (ret == 0)
5770 continue;
5771 if (ret < 0)
5772 goto out;
5773 break;
5774 }
5775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5776
5777 if (found_key.objectid >= key->objectid &&
5778 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5779 ret = 0;
5780 goto out;
5781 }
5782 path->slots[0]++;
5783 }
5784 ret = -ENOENT;
5785out:
5786 return ret;
5787}
5788
5789int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{
5791 struct btrfs_block_group_cache *block_group;
5792 struct rb_node *n;
5793
5794 spin_lock(&info->block_group_cache_lock);
5795 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5796 block_group = rb_entry(n, struct btrfs_block_group_cache,
5797 cache_node);
5798 rb_erase(&block_group->cache_node,
5799 &info->block_group_cache_tree);
5800 spin_unlock(&info->block_group_cache_lock);
5801
5802 btrfs_remove_free_space_cache(block_group);
5803 down_write(&block_group->space_info->groups_sem);
5804 list_del(&block_group->list);
5805 up_write(&block_group->space_info->groups_sem);
5806
5807 WARN_ON(atomic_read(&block_group->count) != 1);
5808 kfree(block_group);
5809
5810 spin_lock(&info->block_group_cache_lock);
5811 }
5812 spin_unlock(&info->block_group_cache_lock);
5813 return 0;
5814}
5815
5816int btrfs_read_block_groups(struct btrfs_root *root)
5817{
5818 struct btrfs_path *path;
5819 int ret;
5820 struct btrfs_block_group_cache *cache;
5821 struct btrfs_fs_info *info = root->fs_info;
5822 struct btrfs_space_info *space_info;
5823 struct btrfs_key key;
5824 struct btrfs_key found_key;
5825 struct extent_buffer *leaf;
5826
5827 root = info->extent_root;
5828 key.objectid = 0;
5829 key.offset = 0;
5830 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5831 path = btrfs_alloc_path();
5832 if (!path)
5833 return -ENOMEM;
5834
5835 while (1) {
5836 ret = find_first_block_group(root, path, &key);
5837 if (ret > 0) {
5838 ret = 0;
5839 goto error;
5840 }
5841 if (ret != 0)
5842 goto error;
5843
5844 leaf = path->nodes[0];
5845 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5846 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5847 if (!cache) {
5848 ret = -ENOMEM;
5849 break;
5850 }
5851
5852 atomic_set(&cache->count, 1);
5853 spin_lock_init(&cache->lock);
5854 mutex_init(&cache->alloc_mutex);
5855 mutex_init(&cache->cache_mutex);
5856 INIT_LIST_HEAD(&cache->list);
5857 read_extent_buffer(leaf, &cache->item,
5858 btrfs_item_ptr_offset(leaf, path->slots[0]),
5859 sizeof(cache->item));
5860 memcpy(&cache->key, &found_key, sizeof(found_key));
5861
5862 key.objectid = found_key.objectid + found_key.offset;
5863 btrfs_release_path(root, path);
5864 cache->flags = btrfs_block_group_flags(&cache->item);
5865
5866 ret = update_space_info(info, cache->flags, found_key.offset,
5867 btrfs_block_group_used(&cache->item),
5868 &space_info);
5869 BUG_ON(ret);
5870 cache->space_info = space_info;
5871 down_write(&space_info->groups_sem);
5872 list_add_tail(&cache->list, &space_info->block_groups);
5873 up_write(&space_info->groups_sem);
5874
5875 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5876 BUG_ON(ret);
5877
5878 set_avail_alloc_bits(root->fs_info, cache->flags);
5879 if (btrfs_chunk_readonly(root, cache->key.objectid))
5880 set_block_group_readonly(cache);
5881 }
5882 ret = 0;
5883error:
5884 btrfs_free_path(path);
5885 return ret;
5886}
5887
5888int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5889 struct btrfs_root *root, u64 bytes_used,
5890 u64 type, u64 chunk_objectid, u64 chunk_offset,
5891 u64 size)
5892{
5893 int ret;
5894 struct btrfs_root *extent_root;
5895 struct btrfs_block_group_cache *cache;
5896
5897 extent_root = root->fs_info->extent_root;
5898
5899 root->fs_info->last_trans_new_blockgroup = trans->transid;
5900
5901 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5902 if (!cache)
5903 return -ENOMEM;
5904
5905 cache->key.objectid = chunk_offset;
5906 cache->key.offset = size;
5907 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5908 atomic_set(&cache->count, 1);
5909 spin_lock_init(&cache->lock);
5910 mutex_init(&cache->alloc_mutex);
5911 mutex_init(&cache->cache_mutex);
5912 INIT_LIST_HEAD(&cache->list);
5913
5914 btrfs_set_block_group_used(&cache->item, bytes_used);
5915 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5916 cache->flags = type;
5917 btrfs_set_block_group_flags(&cache->item, type);
5918
5919 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5920 &cache->space_info);
5921 BUG_ON(ret);
5922 down_write(&cache->space_info->groups_sem);
5923 list_add_tail(&cache->list, &cache->space_info->block_groups);
5924 up_write(&cache->space_info->groups_sem);
5925
5926 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5927 BUG_ON(ret);
5928
5929 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5930 sizeof(cache->item));
5931 BUG_ON(ret);
5932
5933 finish_current_insert(trans, extent_root, 0);
5934 ret = del_pending_extents(trans, extent_root, 0);
5935 BUG_ON(ret);
5936 set_avail_alloc_bits(extent_root->fs_info, type);
5937
5938 return 0;
5939}
5940
5941int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5942 struct btrfs_root *root, u64 group_start)
5943{
5944 struct btrfs_path *path;
5945 struct btrfs_block_group_cache *block_group;
5946 struct btrfs_key key;
5947 int ret;
5948
5949 root = root->fs_info->extent_root;
5950
5951 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5952 BUG_ON(!block_group);
5953 BUG_ON(!block_group->ro);
5954
5955 memcpy(&key, &block_group->key, sizeof(key));
5956
5957 path = btrfs_alloc_path();
5958 BUG_ON(!path);
5959
5960 btrfs_remove_free_space_cache(block_group);
5961 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree);
5963 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem);
5966
5967 spin_lock(&block_group->space_info->lock);
5968 block_group->space_info->total_bytes -= block_group->key.offset;
5969 block_group->space_info->bytes_readonly -= block_group->key.offset;
5970 spin_unlock(&block_group->space_info->lock);
5971 block_group->space_info->full = 0;
5972
5973 put_block_group(block_group);
5974 put_block_group(block_group);
5975
5976 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5977 if (ret > 0)
5978 ret = -EIO;
5979 if (ret < 0)
5980 goto out;
5981
5982 ret = btrfs_del_item(trans, root, path);
5983out:
5984 btrfs_free_path(path);
5985 return ret;
5986}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
/*
 * File-scope state for the extent io code: a forward declaration of the
 * slab-cache helper, the two slab caches created in extent_io_init(), and
 * the lists used to report leaked objects in extent_io_exit().
 */
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
/* slab caches for struct extent_state and struct extent_buffer objects */
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
/* leak-tracking lists; entries are linked through their leak_list member */
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
/*
 * NOTE: LEAK_DEBUG is *defined* (to 0), so every "#ifdef LEAK_DEBUG" in
 * this file evaluates as true and the leak tracking code is compiled in.
 */
33#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG
/* protects the leak-tracking lists above */
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
/*
 * Generic view of a range-keyed rbtree node covering the inclusive byte
 * range [start, end].  tree_insert() and __etree_search() use rb_entry()
 * with this type to read the range of whatever node they are visiting.
 * NOTE(review): those helpers are handed rb_nodes embedded in
 * struct extent_state, so this presumably has to match the leading layout
 * of that struct -- confirm in extent_io.h before reordering fields.
 */
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
/*
 * Bundle of per-call state: a bio, the extent io tree being operated on,
 * the get_extent callback, and a flag controlling range locking.
 * NOTE(review): the users of this struct are outside this part of the
 * file; presumably it is passed to the writepage helpers -- confirm.
 */
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#ifdef LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#ifdef LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IO in their state field
291 * are not merged because the end_io handlers need to be able to do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (ie for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock state struct between start and end use mask to tell
914 * us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned, < 0 on error
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will returned if
1039 * nothing was found after 'start'
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. start and end are used to return the range,
1071 *
1072 * 1 is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. start and end are used to return the range,
1219 *
1220 * 1 is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, lets avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have a given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
1409
#if 0
/*
 * helper function to lock both pages and extents in the tree.
 * pages must be locked first.
 *
 * Returns 0 on success, or a negative errno if a page could not be
 * grabbed; in that case the pages grabbed before the failure are unlocked.
 */
static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long first_index = start >> PAGE_CACHE_SHIFT;
	unsigned long last_index = end >> PAGE_CACHE_SHIFT;
	unsigned long index;
	unsigned long undo;
	struct page *page;
	int err;

	for (index = first_index; index <= last_index; index++) {
		page = grab_cache_page(tree->mapping, index);
		if (!page) {
			err = -ENOMEM;
			goto failed;
		}
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto failed;
		}
	}
	lock_extent(tree, start, end, GFP_NOFS);
	return 0;

failed:
	/*
	 * we failed above in getting the page at 'index', so we undo here
	 * up to but not including the page at 'index'
	 */
	for (undo = first_index; undo < index; undo++) {
		page = find_get_page(tree->mapping, undo);
		unlock_page(page);
		page_cache_release(page);
	}
	return err;
}

/*
 * helper function to unlock both pages and extents in the tree.
 * Always returns 0.
 */
static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long last_index = end >> PAGE_CACHE_SHIFT;
	unsigned long index;
	struct page *page;

	for (index = start >> PAGE_CACHE_SHIFT; index <= last_index; index++) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
	}
	unlock_extent(tree, start, end, GFP_NOFS);
	return 0;
}
#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state there already, this does nothing.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the tree
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple, we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
1936
1937/*
1938 * basic readpage implementation. Locked extent state structs are inserted
1939 * into the tree that are removed when the IO is done (by the end_io
1940 * handlers)
1941 */
1942static int __extent_read_full_page(struct extent_io_tree *tree,
1943 struct page *page,
1944 get_extent_t *get_extent,
1945 struct bio **bio, int mirror_num,
1946 unsigned long *bio_flags)
1947{
1948 struct inode *inode = page->mapping->host;
1949 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951 u64 end;
1952 u64 cur = start;
1953 u64 extent_offset;
1954 u64 last_byte = i_size_read(inode);
1955 u64 block_start;
1956 u64 cur_end;
1957 sector_t sector;
1958 struct extent_map *em;
1959 struct block_device *bdev;
1960 int ret;
1961 int nr = 0;
1962 size_t page_offset = 0;
1963 size_t iosize;
1964 size_t disk_io_size;
1965 size_t blocksize = inode->i_sb->s_blocksize;
1966 unsigned long this_bio_flag = 0;
1967
1968 set_page_extent_mapped(page);
1969
1970 end = page_end;
1971 lock_extent(tree, start, end, GFP_NOFS);
1972
1973 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1974 char *userpage;
1975 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1976
1977 if (zero_offset) {
1978 iosize = PAGE_CACHE_SIZE - zero_offset;
1979 userpage = kmap_atomic(page, KM_USER0);
1980 memset(userpage + zero_offset, 0, iosize);
1981 flush_dcache_page(page);
1982 kunmap_atomic(userpage, KM_USER0);
1983 }
1984 }
1985 while (cur <= end) {
1986 if (cur >= last_byte) {
1987 char *userpage;
1988 iosize = PAGE_CACHE_SIZE - page_offset;
1989 userpage = kmap_atomic(page, KM_USER0);
1990 memset(userpage + page_offset, 0, iosize);
1991 flush_dcache_page(page);
1992 kunmap_atomic(userpage, KM_USER0);
1993 set_extent_uptodate(tree, cur, cur + iosize - 1,
1994 GFP_NOFS);
1995 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1996 break;
1997 }
1998 em = get_extent(inode, page, page_offset, cur,
1999 end - cur + 1, 0);
2000 if (IS_ERR(em) || !em) {
2001 SetPageError(page);
2002 unlock_extent(tree, cur, end, GFP_NOFS);
2003 break;
2004 }
2005 extent_offset = cur - em->start;
2006 BUG_ON(extent_map_end(em) <= cur);
2007 BUG_ON(end < cur);
2008
2009 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2010 this_bio_flag = EXTENT_BIO_COMPRESSED;
2011
2012 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2013 cur_end = min(extent_map_end(em) - 1, end);
2014 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2015 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2016 disk_io_size = em->block_len;
2017 sector = em->block_start >> 9;
2018 } else {
2019 sector = (em->block_start + extent_offset) >> 9;
2020 disk_io_size = iosize;
2021 }
2022 bdev = em->bdev;
2023 block_start = em->block_start;
2024 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2025 block_start = EXTENT_MAP_HOLE;
2026 free_extent_map(em);
2027 em = NULL;
2028
2029 /* we've found a hole, just zero and go on */
2030 if (block_start == EXTENT_MAP_HOLE) {
2031 char *userpage;
2032 userpage = kmap_atomic(page, KM_USER0);
2033 memset(userpage + page_offset, 0, iosize);
2034 flush_dcache_page(page);
2035 kunmap_atomic(userpage, KM_USER0);
2036
2037 set_extent_uptodate(tree, cur, cur + iosize - 1,
2038 GFP_NOFS);
2039 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 cur = cur + iosize;
2041 page_offset += iosize;
2042 continue;
2043 }
2044 /* the get_extent function already copied into the page */
2045 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2046 check_page_uptodate(tree, page);
2047 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2048 cur = cur + iosize;
2049 page_offset += iosize;
2050 continue;
2051 }
2052 /* we have an inline extent but it didn't get marked up
2053 * to date. Error out
2054 */
2055 if (block_start == EXTENT_MAP_INLINE) {
2056 SetPageError(page);
2057 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2058 cur = cur + iosize;
2059 page_offset += iosize;
2060 continue;
2061 }
2062
2063 ret = 0;
2064 if (tree->ops && tree->ops->readpage_io_hook) {
2065 ret = tree->ops->readpage_io_hook(page, cur,
2066 cur + iosize - 1);
2067 }
2068 if (!ret) {
2069 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2070 pnr -= page->index;
2071 ret = submit_extent_page(READ, tree, page,
2072 sector, disk_io_size, page_offset,
2073 bdev, bio, pnr,
2074 end_bio_extent_readpage, mirror_num,
2075 *bio_flags,
2076 this_bio_flag);
2077 nr++;
2078 *bio_flags = this_bio_flag;
2079 }
2080 if (ret)
2081 SetPageError(page);
2082 cur = cur + iosize;
2083 page_offset += iosize;
2084 }
2085 if (!nr) {
2086 if (!PageError(page))
2087 SetPageUptodate(page);
2088 unlock_page(page);
2089 }
2090 return 0;
2091}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
2107/*
2108 * the writepage semantics are similar to regular writepage. extent
2109 * records are inserted to lock ranges in the tree, and as dirty areas
2110 * are found, they are marked writeback. Then the lock bits are removed
2111 * and the end_io handler clears the writeback ranges
2112 */
2113static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2114 void *data)
2115{
2116 struct inode *inode = page->mapping->host;
2117 struct extent_page_data *epd = data;
2118 struct extent_io_tree *tree = epd->tree;
2119 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2120 u64 delalloc_start;
2121 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2122 u64 end;
2123 u64 cur = start;
2124 u64 extent_offset;
2125 u64 last_byte = i_size_read(inode);
2126 u64 block_start;
2127 u64 iosize;
2128 u64 unlock_start;
2129 sector_t sector;
2130 struct extent_map *em;
2131 struct block_device *bdev;
2132 int ret;
2133 int nr = 0;
2134 size_t pg_offset = 0;
2135 size_t blocksize;
2136 loff_t i_size = i_size_read(inode);
2137 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2138 u64 nr_delalloc;
2139 u64 delalloc_end;
2140 int page_started;
2141 int compressed;
2142 unsigned long nr_written = 0;
2143
2144 WARN_ON(!PageLocked(page));
2145 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2146 if (page->index > end_index ||
2147 (page->index == end_index && !pg_offset)) {
2148 page->mapping->a_ops->invalidatepage(page, 0);
2149 unlock_page(page);
2150 return 0;
2151 }
2152
2153 if (page->index == end_index) {
2154 char *userpage;
2155
2156 userpage = kmap_atomic(page, KM_USER0);
2157 memset(userpage + pg_offset, 0,
2158 PAGE_CACHE_SIZE - pg_offset);
2159 kunmap_atomic(userpage, KM_USER0);
2160 flush_dcache_page(page);
2161 }
2162 pg_offset = 0;
2163
2164 set_page_extent_mapped(page);
2165
2166 delalloc_start = start;
2167 delalloc_end = 0;
2168 page_started = 0;
2169 if (!epd->extent_locked) {
2170 while (delalloc_end < page_end) {
2171 nr_delalloc = find_lock_delalloc_range(inode, tree,
2172 page,
2173 &delalloc_start,
2174 &delalloc_end,
2175 128 * 1024 * 1024);
2176 if (nr_delalloc == 0) {
2177 delalloc_start = delalloc_end + 1;
2178 continue;
2179 }
2180 tree->ops->fill_delalloc(inode, page, delalloc_start,
2181 delalloc_end, &page_started,
2182 &nr_written);
2183 delalloc_start = delalloc_end + 1;
2184 }
2185
2186 /* did the fill delalloc function already unlock and start
2187 * the IO?
2188 */
2189 if (page_started) {
2190 ret = 0;
2191 goto update_nr_written;
2192 }
2193 }
2194 lock_extent(tree, start, page_end, GFP_NOFS);
2195
2196 unlock_start = start;
2197
2198 if (tree->ops && tree->ops->writepage_start_hook) {
2199 ret = tree->ops->writepage_start_hook(page, start,
2200 page_end);
2201 if (ret == -EAGAIN) {
2202 unlock_extent(tree, start, page_end, GFP_NOFS);
2203 redirty_page_for_writepage(wbc, page);
2204 unlock_page(page);
2205 ret = 0;
2206 goto update_nr_written;
2207 }
2208 }
2209
2210 nr_written++;
2211
2212 end = page_end;
2213 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2214 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2215
2216 if (last_byte <= start) {
2217 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2218 unlock_extent(tree, start, page_end, GFP_NOFS);
2219 if (tree->ops && tree->ops->writepage_end_io_hook)
2220 tree->ops->writepage_end_io_hook(page, start,
2221 page_end, NULL, 1);
2222 unlock_start = page_end + 1;
2223 goto done;
2224 }
2225
2226 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2227 blocksize = inode->i_sb->s_blocksize;
2228
2229 while (cur <= end) {
2230 if (cur >= last_byte) {
2231 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2232 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2233 if (tree->ops && tree->ops->writepage_end_io_hook)
2234 tree->ops->writepage_end_io_hook(page, cur,
2235 page_end, NULL, 1);
2236 unlock_start = page_end + 1;
2237 break;
2238 }
2239 em = epd->get_extent(inode, page, pg_offset, cur,
2240 end - cur + 1, 1);
2241 if (IS_ERR(em) || !em) {
2242 SetPageError(page);
2243 break;
2244 }
2245
2246 extent_offset = cur - em->start;
2247 BUG_ON(extent_map_end(em) <= cur);
2248 BUG_ON(end < cur);
2249 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2250 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2251 sector = (em->block_start + extent_offset) >> 9;
2252 bdev = em->bdev;
2253 block_start = em->block_start;
2254 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2255 free_extent_map(em);
2256 em = NULL;
2257
2258 /*
2259 * compressed and inline extents are written through other
2260 * paths in the FS
2261 */
2262 if (compressed || block_start == EXTENT_MAP_HOLE ||
2263 block_start == EXTENT_MAP_INLINE) {
2264 clear_extent_dirty(tree, cur,
2265 cur + iosize - 1, GFP_NOFS);
2266
2267 unlock_extent(tree, unlock_start, cur + iosize - 1,
2268 GFP_NOFS);
2269
2270 /*
2271 * end_io notification does not happen here for
2272 * compressed extents
2273 */
2274 if (!compressed && tree->ops &&
2275 tree->ops->writepage_end_io_hook)
2276 tree->ops->writepage_end_io_hook(page, cur,
2277 cur + iosize - 1,
2278 NULL, 1);
2279 else if (compressed) {
2280 /* we don't want to end_page_writeback on
2281 * a compressed extent. this happens
2282 * elsewhere
2283 */
2284 nr++;
2285 }
2286
2287 cur += iosize;
2288 pg_offset += iosize;
2289 unlock_start = cur;
2290 continue;
2291 }
2292 /* leave this out until we have a page_mkwrite call */
2293 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2294 EXTENT_DIRTY, 0)) {
2295 cur = cur + iosize;
2296 pg_offset += iosize;
2297 continue;
2298 }
2299
2300 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2301 if (tree->ops && tree->ops->writepage_io_hook) {
2302 ret = tree->ops->writepage_io_hook(page, cur,
2303 cur + iosize - 1);
2304 } else {
2305 ret = 0;
2306 }
2307 if (ret) {
2308 SetPageError(page);
2309 } else {
2310 unsigned long max_nr = end_index + 1;
2311
2312 set_range_writeback(tree, cur, cur + iosize - 1);
2313 if (!PageWriteback(page)) {
2314 printk(KERN_ERR "btrfs warning page %lu not "
2315 "writeback, cur %llu end %llu\n",
2316 page->index, (unsigned long long)cur,
2317 (unsigned long long)end);
2318 }
2319
2320 ret = submit_extent_page(WRITE, tree, page, sector,
2321 iosize, pg_offset, bdev,
2322 &epd->bio, max_nr,
2323 end_bio_extent_writepage,
2324 0, 0, 0);
2325 if (ret)
2326 SetPageError(page);
2327 }
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 nr++;
2331 }
2332done:
2333 if (nr == 0) {
2334 /* make sure the mapping tag for page dirty gets cleared */
2335 set_page_writeback(page);
2336 end_page_writeback(page);
2337 }
2338 if (unlock_start <= page_end)
2339 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2340 unlock_page(page);
2341
2342update_nr_written:
2343 wbc->nr_to_write -= nr_written;
2344 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2345 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2346 page->mapping->writeback_index = page->index + nr_written;
2347 return 0;
2348}
2349
2350/**
2351 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2352 * @mapping: address space structure to write
2353 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2354 * @writepage: function called for each page
2355 * @data: data passed to writepage function
2356 *
2357 * If a page is already under I/O, write_cache_pages() skips it, even
2358 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2359 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2360 * and msync() need to guarantee that all the data which was dirty at the time
2361 * the call was made get new I/O started against them. If wbc->sync_mode is
2362 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2363 * existing IO to complete.
2364 */
2365static int extent_write_cache_pages(struct extent_io_tree *tree,
2366 struct address_space *mapping,
2367 struct writeback_control *wbc,
2368 writepage_t writepage, void *data,
2369 void (*flush_fn)(void *))
2370{
2371 struct backing_dev_info *bdi = mapping->backing_dev_info;
2372 int ret = 0;
2373 int done = 0;
2374 struct pagevec pvec;
2375 int nr_pages;
2376 pgoff_t index;
2377 pgoff_t end; /* Inclusive */
2378 int scanned = 0;
2379 int range_whole = 0;
2380
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */
2389 end = -1;
2390 } else {
2391 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2392 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2393 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2394 range_whole = 1;
2395 scanned = 1;
2396 }
2397retry:
2398 while (!done && (index <= end) &&
2399 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2400 PAGECACHE_TAG_DIRTY, min(end - index,
2401 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2402 unsigned i;
2403
2404 scanned = 1;
2405 for (i = 0; i < nr_pages; i++) {
2406 struct page *page = pvec.pages[i];
2407
2408 /*
2409 * At this point we hold neither mapping->tree_lock nor
2410 * lock on the page itself: the page may be truncated or
2411 * invalidated (changing page->mapping to NULL), or even
2412 * swizzled back from swapper_space to tmpfs file
2413 * mapping
2414 */
2415 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2416 tree->ops->write_cache_pages_lock_hook(page);
2417 else
2418 lock_page(page);
2419
2420 if (unlikely(page->mapping != mapping)) {
2421 unlock_page(page);
2422 continue;
2423 }
2424
2425 if (!wbc->range_cyclic && page->index > end) {
2426 done = 1;
2427 unlock_page(page);
2428 continue;
2429 }
2430
2431 if (wbc->sync_mode != WB_SYNC_NONE) {
2432 if (PageWriteback(page))
2433 flush_fn(data);
2434 wait_on_page_writeback(page);
2435 }
2436
2437 if (PageWriteback(page) ||
2438 !clear_page_dirty_for_io(page)) {
2439 unlock_page(page);
2440 continue;
2441 }
2442
2443 ret = (*writepage)(page, wbc, data);
2444
2445 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2446 unlock_page(page);
2447 ret = 0;
2448 }
2449 if (ret || wbc->nr_to_write <= 0)
2450 done = 1;
2451 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2452 wbc->encountered_congestion = 1;
2453 done = 1;
2454 }
2455 }
2456 pagevec_release(&pvec);
2457 cond_resched();
2458 }
2459 if (!scanned && !done) {
2460 /*
2461 * We hit the last page and there is more work to be done: wrap
2462 * back to the start of the file
2463 */
2464 scanned = 1;
2465 index = 0;
2466 goto retry;
2467 }
2468 return ret;
2469}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em)
2692 goto err;
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
/*
 * Undo a mapping: @token is handed to kunmap_atomic() for slot @km.
 * @eb itself is not touched.
 */
void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
{
	kunmap_atomic(token, km);
}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21
22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1
24
25/*
26 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one.
28 */
29#define EXTENT_PAGE_PRIVATE 1
30#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
31
32struct extent_state;
33
34typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 struct bio *bio, int mirror_num,
36 unsigned long bio_flags);
37struct extent_io_ops {
38 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
39 u64 start, u64 end, int *page_started,
40 unsigned long *nr_written);
41 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
42 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
43 extent_submit_bio_hook_t *submit_bio_hook;
44 int (*merge_bio_hook)(struct page *page, unsigned long offset,
45 size_t size, struct bio *bio,
46 unsigned long bio_flags);
47 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
48 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
49 u64 start, u64 end,
50 struct extent_state *state);
51 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
52 u64 start, u64 end,
53 struct extent_state *state);
54 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
55 struct extent_state *state);
56 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
57 struct extent_state *state, int uptodate);
58 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
59 unsigned long old, unsigned long bits);
60 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
61 unsigned long old, unsigned long bits);
62 int (*write_cache_pages_lock_hook)(struct page *page);
63};
64
65struct extent_io_tree {
66 struct rb_root state;
67 struct rb_root buffer;
68 struct address_space *mapping;
69 u64 dirty_bytes;
70 spinlock_t lock;
71 spinlock_t buffer_lock;
72 struct extent_io_ops *ops;
73};
74
75struct extent_state {
76 u64 start;
77 u64 end; /* inclusive */
78 struct rb_node rb_node;
79 struct extent_io_tree *tree;
80 wait_queue_head_t wq;
81 atomic_t refs;
82 unsigned long state;
83
84 /* for use by the FS */
85 u64 private;
86
87 struct list_head leak_list;
88};
89
90struct extent_buffer {
91 u64 start;
92 unsigned long len;
93 char *map_token;
94 char *kaddr;
95 unsigned long map_start;
96 unsigned long map_len;
97 struct page *first_page;
98 atomic_t refs;
99 int flags;
100 struct list_head leak_list;
101 struct rb_node rb_node;
102 struct mutex mutex;
103};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
116typedef struct extent_map *(get_extent_t)(struct inode *inode,
117 struct page *page,
118 size_t page_offset,
119 u64 start, u64 len,
120 int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
213static inline void extent_buffer_get(struct extent_buffer *eb)
214{
215 atomic_inc(&eb->refs);
216}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map beeing releasead
73 *
74 * Drops the reference out on @em by one and free the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was sucessfull.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
/*
 * Return start + len, clamped to the largest u64 value when the sum
 * would wrap around.
 */
static u64 range_end(u64 start, u64 len)
{
	u64 end = start + len;

	return end < start ? (u64)-1 : end;
}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, len] range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map beeing removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
/*
 * Special block_start values taken from the top of the u64 range.  Each
 * expansion is fully parenthesized so the cast cannot be split apart by a
 * neighbouring operator (e.g. sizeof or a postfix operator).
 */
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
#define EXTENT_MAP_DELALLOC ((u64)-1)
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
17struct extent_map {
18 struct rb_node rb_node;
19
20 /* all of these are in bytes */
21 u64 start;
22 u64 len;
23 u64 orig_start;
24 u64 block_start;
25 u64 block_len;
26 unsigned long flags;
27 struct block_device *bdev;
28 atomic_t refs;
29 int in_tree;
30};
31
32struct extent_map_tree {
33 struct rb_root map;
34 spinlock_t lock;
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
/*
 * Number of checksums of @size bytes that fit in the data area of a leaf
 * after setting aside two item headers, minus one.  @size is
 * parenthesized so that an expression argument divides as a whole.
 */
#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
				   sizeof(struct btrfs_item) * 2) / \
				  (size)) - 1))

/*
 * Byte count derived from how many btrfs_sector_sum entries fit in one
 * page after the btrfs_ordered_sum header, times the sector size, less
 * one sector.
 */
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
				   sizeof(struct btrfs_ordered_sum)) / \
				  sizeof(struct btrfs_sector_sum) * \
				  (r)->sectorsize - (r)->sectorsize)
35
36int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root,
38 u64 objectid, u64 pos,
39 u64 disk_offset, u64 disk_num_bytes,
40 u64 num_bytes, u64 offset, u64 ram_bytes,
41 u8 compression, u8 encryption, u16 other_encoding)
42{
43 int ret = 0;
44 struct btrfs_file_extent_item *item;
45 struct btrfs_key file_key;
46 struct btrfs_path *path;
47 struct extent_buffer *leaf;
48
49 path = btrfs_alloc_path();
50 BUG_ON(!path);
51 file_key.objectid = objectid;
52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item));
57 if (ret < 0)
58 goto out;
59 BUG_ON(ret);
60 leaf = path->nodes[0];
61 item = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_file_extent_item);
63 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
64 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
65 btrfs_set_file_extent_offset(leaf, item, offset);
66 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
67 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
68 btrfs_set_file_extent_generation(leaf, item, trans->transid);
69 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
70 btrfs_set_file_extent_compression(leaf, item, compression);
71 btrfs_set_file_extent_encryption(leaf, item, encryption);
72 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
73
74 btrfs_mark_buffer_dirty(leaf);
75out:
76 btrfs_free_path(path);
77 return ret;
78}
79
80struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root,
82 struct btrfs_path *path,
83 u64 bytenr, int cow)
84{
85 int ret;
86 struct btrfs_key file_key;
87 struct btrfs_key found_key;
88 struct btrfs_csum_item *item;
89 struct extent_buffer *leaf;
90 u64 csum_offset = 0;
91 u16 csum_size =
92 btrfs_super_csum_size(&root->fs_info->super_copy);
93 int csums_in_item;
94
95 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
96 file_key.offset = bytenr;
97 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
98 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
99 if (ret < 0)
100 goto fail;
101 leaf = path->nodes[0];
102 if (ret > 0) {
103 ret = 1;
104 if (path->slots[0] == 0)
105 goto fail;
106 path->slots[0]--;
107 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
108 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
109 goto fail;
110
111 csum_offset = (bytenr - found_key.offset) >>
112 root->fs_info->sb->s_blocksize_bits;
113 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
114 csums_in_item /= csum_size;
115
116 if (csum_offset >= csums_in_item) {
117 ret = -EFBIG;
118 goto fail;
119 }
120 }
121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
122 item = (struct btrfs_csum_item *)((unsigned char *)item +
123 csum_offset * csum_size);
124 return item;
125fail:
126 if (ret > 0)
127 ret = -ENOENT;
128 return ERR_PTR(ret);
129}
130
131
132int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
133 struct btrfs_root *root,
134 struct btrfs_path *path, u64 objectid,
135 u64 offset, int mod)
136{
137 int ret;
138 struct btrfs_key file_key;
139 int ins_len = mod < 0 ? -1 : 0;
140 int cow = mod != 0;
141
142 file_key.objectid = objectid;
143 file_key.offset = offset;
144 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
145 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
146 return ret;
147}
148
149
150int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 struct bio *bio, u32 *dst)
152{
153 u32 sum;
154 struct bio_vec *bvec = bio->bi_io_vec;
155 int bio_index = 0;
156 u64 offset;
157 u64 item_start_offset = 0;
158 u64 item_last_offset = 0;
159 u64 disk_bytenr;
160 u32 diff;
161 u16 csum_size =
162 btrfs_super_csum_size(&root->fs_info->super_copy);
163 int ret;
164 struct btrfs_path *path;
165 struct btrfs_csum_item *item = NULL;
166 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
167
168 path = btrfs_alloc_path();
169 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
170 path->reada = 2;
171
172 WARN_ON(bio->bi_vcnt <= 0);
173
174 disk_bytenr = (u64)bio->bi_sector << 9;
175 while (bio_index < bio->bi_vcnt) {
176 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
177 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
178 if (ret == 0)
179 goto found;
180
181 if (!item || disk_bytenr < item_start_offset ||
182 disk_bytenr >= item_last_offset) {
183 struct btrfs_key found_key;
184 u32 item_size;
185
186 if (item)
187 btrfs_release_path(root, path);
188 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
189 path, disk_bytenr, 0);
190 if (IS_ERR(item)) {
191 ret = PTR_ERR(item);
192 if (ret == -ENOENT || ret == -EFBIG)
193 ret = 0;
194 sum = 0;
195 if (BTRFS_I(inode)->root->root_key.objectid ==
196 BTRFS_DATA_RELOC_TREE_OBJECTID) {
197 set_extent_bits(io_tree, offset,
198 offset + bvec->bv_len - 1,
199 EXTENT_NODATASUM, GFP_NOFS);
200 } else {
201 printk(KERN_INFO "btrfs no csum found "
202 "for inode %lu start %llu\n",
203 inode->i_ino,
204 (unsigned long long)offset);
205 }
206 item = NULL;
207 btrfs_release_path(root, path);
208 goto found;
209 }
210 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
211 path->slots[0]);
212
213 item_start_offset = found_key.offset;
214 item_size = btrfs_item_size_nr(path->nodes[0],
215 path->slots[0]);
216 item_last_offset = item_start_offset +
217 (item_size / csum_size) *
218 root->sectorsize;
219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
220 struct btrfs_csum_item);
221 }
222 /*
223 * this byte range must be able to fit inside
224 * a single leaf so it will also fit inside a u32
225 */
226 diff = disk_bytenr - item_start_offset;
227 diff = diff / root->sectorsize;
228 diff = diff * csum_size;
229
230 read_extent_buffer(path->nodes[0], &sum,
231 ((unsigned long)item) + diff,
232 csum_size);
233found:
234 if (dst)
235 *dst++ = sum;
236 else
237 set_state_private(io_tree, offset, sum);
238 disk_bytenr += bvec->bv_len;
239 bio_index++;
240 bvec++;
241 }
242 btrfs_free_path(path);
243 return 0;
244}
245
/*
 * Collect the checksums stored in @root for the disk byte range
 * [start, end] (end inclusive) and append them to @list as
 * btrfs_ordered_sum structures, each holding at most
 * MAX_ORDERED_SUM_BYTES(root) bytes worth of sector sums.
 *
 * Returns 0 on success or the negative error from btrfs_search_slot() /
 * btrfs_next_leaf().  Entries already appended to @list are left there
 * when an error is returned.
 */
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
			     struct list_head *list)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_ordered_sum *sums;
	struct btrfs_sector_sum *sector_sum;
	struct btrfs_csum_item *item;
	unsigned long offset;
	int ret;
	size_t size;
	u64 csum_end;
	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key.offset = start;
	key.type = BTRFS_EXTENT_CSUM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto fail;
	/*
	 * No exact match: the previous item may still cover @start, in
	 * which case the walk begins there.
	 */
	if (ret > 0 && path->slots[0] > 0) {
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
		    key.type == BTRFS_EXTENT_CSUM_KEY) {
			offset = (start - key.offset) >>
				 root->fs_info->sb->s_blocksize_bits;
			if (offset * csum_size <
			    btrfs_item_size_nr(leaf, path->slots[0] - 1))
				path->slots[0]--;
		}
	}

	while (start <= end) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto fail;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
		    key.type != BTRFS_EXTENT_CSUM_KEY)
			break;

		/* NOTE(review): this re-reads the same key as just above */
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.offset > end)
			break;

		/* skip the uncovered gap in front of this item */
		if (key.offset > start)
			start = key.offset;

		size = btrfs_item_size_nr(leaf, path->slots[0]);
		csum_end = key.offset + (size / csum_size) * root->sectorsize;
		if (csum_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* clamp to the requested range; end is inclusive */
		csum_end = min(csum_end, end + 1);
		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_csum_item);
		while (start < csum_end) {
			size = min_t(size_t, csum_end - start,
					MAX_ORDERED_SUM_BYTES(root));
			sums = kzalloc(btrfs_ordered_sum_size(root, size),
					GFP_NOFS);
			BUG_ON(!sums);

			sector_sum = sums->sums;
			sums->bytenr = start;
			sums->len = size;

			/* byte offset of the first wanted csum inside the item */
			offset = (start - key.offset) >>
				root->fs_info->sb->s_blocksize_bits;
			offset *= csum_size;

			/*
			 * size is unsigned and counts down one sector per
			 * pass -- assumes it is a multiple of the sector
			 * size; TODO confirm with callers
			 */
			while (size > 0) {
				read_extent_buffer(path->nodes[0],
						&sector_sum->sum,
						((unsigned long)item) +
						offset, csum_size);
				sector_sum->bytenr = start;

				size -= root->sectorsize;
				start += root->sectorsize;
				offset += csum_size;
				sector_sum++;
			}
			list_add_tail(&sums->list, list);
		}
		path->slots[0]++;
	}
	ret = 0;
fail:
	btrfs_free_path(path);
	return ret;
}
353
354int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
355 struct bio *bio, u64 file_start, int contig)
356{
357 struct btrfs_ordered_sum *sums;
358 struct btrfs_sector_sum *sector_sum;
359 struct btrfs_ordered_extent *ordered;
360 char *data;
361 struct bio_vec *bvec = bio->bi_io_vec;
362 int bio_index = 0;
363 unsigned long total_bytes = 0;
364 unsigned long this_sum_bytes = 0;
365 u64 offset;
366 u64 disk_bytenr;
367
368 WARN_ON(bio->bi_vcnt <= 0);
369 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
370 if (!sums)
371 return -ENOMEM;
372
373 sector_sum = sums->sums;
374 disk_bytenr = (u64)bio->bi_sector << 9;
375 sums->len = bio->bi_size;
376 INIT_LIST_HEAD(&sums->list);
377
378 if (contig)
379 offset = file_start;
380 else
381 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
382
383 ordered = btrfs_lookup_ordered_extent(inode, offset);
384 BUG_ON(!ordered);
385 sums->bytenr = ordered->start;
386
387 while (bio_index < bio->bi_vcnt) {
388 if (!contig)
389 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
390
391 if (!contig && (offset >= ordered->file_offset + ordered->len ||
392 offset < ordered->file_offset)) {
393 unsigned long bytes_left;
394 sums->len = this_sum_bytes;
395 this_sum_bytes = 0;
396 btrfs_add_ordered_sum(inode, ordered, sums);
397 btrfs_put_ordered_extent(ordered);
398
399 bytes_left = bio->bi_size - total_bytes;
400
401 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
402 GFP_NOFS);
403 BUG_ON(!sums);
404 sector_sum = sums->sums;
405 sums->len = bytes_left;
406 ordered = btrfs_lookup_ordered_extent(inode, offset);
407 BUG_ON(!ordered);
408 sums->bytenr = ordered->start;
409 }
410
411 data = kmap_atomic(bvec->bv_page, KM_USER0);
412 sector_sum->sum = ~(u32)0;
413 sector_sum->sum = btrfs_csum_data(root,
414 data + bvec->bv_offset,
415 sector_sum->sum,
416 bvec->bv_len);
417 kunmap_atomic(data, KM_USER0);
418 btrfs_csum_final(sector_sum->sum,
419 (char *)&sector_sum->sum);
420 sector_sum->bytenr = disk_bytenr;
421
422 sector_sum++;
423 bio_index++;
424 total_bytes += bvec->bv_len;
425 this_sum_bytes += bvec->bv_len;
426 disk_bytenr += bvec->bv_len;
427 offset += bvec->bv_len;
428 bvec++;
429 }
430 this_sum_bytes = 0;
431 btrfs_add_ordered_sum(inode, ordered, sums);
432 btrfs_put_ordered_extent(ordered);
433 return 0;
434}
435
436/*
437 * helper function for csum removal, this expects the
438 * key to describe the csum pointed to by the path, and it expects
439 * the csum to overlap the range [bytenr, len]
440 *
441 * The csum should not be entirely contained in the range and the
442 * range should not be entirely contained in the csum.
443 *
444 * This calls btrfs_truncate_item with the correct args based on the
445 * overlap, and fixes up the key as required.
446 */
447static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
448 struct btrfs_root *root,
449 struct btrfs_path *path,
450 struct btrfs_key *key,
451 u64 bytenr, u64 len)
452{
453 struct extent_buffer *leaf;
454 u16 csum_size =
455 btrfs_super_csum_size(&root->fs_info->super_copy);
456 u64 csum_end;
457 u64 end_byte = bytenr + len;
458 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
459 int ret;
460
461 leaf = path->nodes[0];
462 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
463 csum_end <<= root->fs_info->sb->s_blocksize_bits;
464 csum_end += key->offset;
465
466 if (key->offset < bytenr && csum_end <= end_byte) {
467 /*
468 * [ bytenr - len ]
469 * [ ]
470 * [csum ]
471 * A simple truncate off the end of the item
472 */
473 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
474 new_size *= csum_size;
475 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
476 BUG_ON(ret);
477 } else if (key->offset >= bytenr && csum_end > end_byte &&
478 end_byte > key->offset) {
479 /*
480 * [ bytenr - len ]
481 * [ ]
482 * [csum ]
483 * we need to truncate from the beginning of the csum
484 */
485 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
486 new_size *= csum_size;
487
488 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
489 BUG_ON(ret);
490
491 key->offset = end_byte;
492 ret = btrfs_set_item_key_safe(trans, root, path, key);
493 BUG_ON(ret);
494 } else {
495 BUG();
496 }
497 return 0;
498}
499
500/*
501 * deletes the csum items from the csum tree for a given
502 * range of bytes.
503 */
504int btrfs_del_csums(struct btrfs_trans_handle *trans,
505 struct btrfs_root *root, u64 bytenr, u64 len)
506{
507 struct btrfs_path *path;
508 struct btrfs_key key;
509 u64 end_byte = bytenr + len;
510 u64 csum_end;
511 struct extent_buffer *leaf;
512 int ret;
513 u16 csum_size =
514 btrfs_super_csum_size(&root->fs_info->super_copy);
515 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
516
517 root = root->fs_info->csum_root;
518
519 path = btrfs_alloc_path();
520
521 while (1) {
522 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
523 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY;
525
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) {
528 if (path->slots[0] == 0)
529 goto out;
530 path->slots[0]--;
531 }
532 leaf = path->nodes[0];
533 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
534
535 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
536 key.type != BTRFS_EXTENT_CSUM_KEY) {
537 break;
538 }
539
540 if (key.offset >= end_byte)
541 break;
542
543 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
544 csum_end <<= blocksize_bits;
545 csum_end += key.offset;
546
547 /* this csum ends before we start, we're done */
548 if (csum_end <= bytenr)
549 break;
550
551 /* delete the entire item, it is inside our range */
552 if (key.offset >= bytenr && csum_end <= end_byte) {
553 ret = btrfs_del_item(trans, root, path);
554 BUG_ON(ret);
555 if (key.offset == bytenr)
556 break;
557 } else if (key.offset < bytenr && csum_end > end_byte) {
558 unsigned long offset;
559 unsigned long shift_len;
560 unsigned long item_offset;
561 /*
562 * [ bytenr - len ]
563 * [csum ]
564 *
565 * Our bytes are in the middle of the csum,
566 * we need to split this item and insert a new one.
567 *
568 * But we can't drop the path because the
569 * csum could change, get removed, extended etc.
570 *
571 * The trick here is the max size of a csum item leaves
572 * enough room in the tree block for a single
573 * item header. So, we split the item in place,
574 * adding a new header pointing to the existing
575 * bytes. Then we loop around again and we have
576 * a nicely formed csum item that we can neatly
577 * truncate.
578 */
579 offset = (bytenr - key.offset) >> blocksize_bits;
580 offset *= csum_size;
581
582 shift_len = (len >> blocksize_bits) * csum_size;
583
584 item_offset = btrfs_item_ptr_offset(leaf,
585 path->slots[0]);
586
587 memset_extent_buffer(leaf, 0, item_offset + offset,
588 shift_len);
589 key.offset = bytenr;
590
591 /*
592 * btrfs_split_item returns -EAGAIN when the
593 * item changed size or key
594 */
595 ret = btrfs_split_item(trans, root, path, &key, offset);
596 BUG_ON(ret && ret != -EAGAIN);
597
598 key.offset = end_byte - 1;
599 } else {
600 ret = truncate_one_csum(trans, root, path,
601 &key, bytenr, len);
602 BUG_ON(ret);
603 if (key.offset < bytenr)
604 break;
605 }
606 btrfs_release_path(root, path);
607 }
608out:
609 btrfs_free_path(path);
610 return 0;
611}
612
/*
 * Write the checksums carried by @sums into csum items of @root.
 *
 * For each run of sector sums the function either finds an existing csum
 * item with room for the sum, extends an item by one checksum, or
 * inserts a new item, and then copies checksums into the leaf until the
 * item is full or the next sector is not adjacent.  It loops (label
 * "again") until sums->len bytes have been processed.
 *
 * Returns 0 on success, otherwise the value returned by the failing
 * btrfs_search_slot() / btrfs_insert_empty_item() call.
 */
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_ordered_sum *sums)
{
	u64 bytenr;
	int ret;
	struct btrfs_key file_key;
	struct btrfs_key found_key;
	u64 next_offset;
	u64 total_bytes = 0;
	int found_next;
	struct btrfs_path *path;
	struct btrfs_csum_item *item;
	struct btrfs_csum_item *item_end;
	struct extent_buffer *leaf = NULL;
	u64 csum_offset;
	struct btrfs_sector_sum *sector_sum;
	u32 nritems;
	u32 ins_size;
	char *eb_map;
	char *eb_token;
	unsigned long map_len;
	unsigned long map_start;
	u16 csum_size =
		btrfs_super_csum_size(&root->fs_info->super_copy);

	path = btrfs_alloc_path();
	BUG_ON(!path);
	sector_sum = sums->sums;
again:
	/* start (or restart) at the first sector sum not yet written */
	next_offset = (u64)-1;
	found_next = 0;
	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	file_key.offset = sector_sum->bytenr;
	bytenr = sector_sum->bytenr;
	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);

	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
	if (!IS_ERR(item)) {
		leaf = path->nodes[0];
		ret = 0;
		goto found;
	}
	ret = PTR_ERR(item);
	if (ret == -EFBIG) {
		u32 item_size;
		/* we found one, but it isn't big enough yet */
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if ((item_size / csum_size) >=
		    MAX_CSUM_ITEMS(root, csum_size)) {
			/* already at max size, make a new one */
			goto insert;
		}
	} else {
		int slot = path->slots[0] + 1;
		/* we didn't find a csum item, insert one */
		nritems = btrfs_header_nritems(path->nodes[0]);
		if (path->slots[0] >= nritems - 1) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 1)
				found_next = 1;
			if (ret != 0)
				goto insert;
			slot = 0;
		}
		/*
		 * peek at the following item: if it is a csum item its
		 * offset bounds how large the new item may be made
		 */
		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
		if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
		    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
			found_next = 1;
			goto insert;
		}
		next_offset = found_key.offset;
		found_next = 1;
		goto insert;
	}

	/*
	 * at this point, we know the tree has an item, but it isn't big
	 * enough yet to put our csum in.  Grow it
	 */
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(trans, root, &file_key, path,
				csum_size, 1);
	if (ret < 0)
		goto fail_unlock;

	if (ret > 0) {
		if (path->slots[0] == 0)
			goto insert;
		path->slots[0]--;
	}

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	csum_offset = (bytenr - found_key.offset) >>
			root->fs_info->sb->s_blocksize_bits;

	if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
		goto insert;
	}

	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
	    csum_size) {
		u32 diff = (csum_offset + 1) * csum_size;

		/*
		 * is the item big enough already?  we dropped our lock
		 * before and need to recheck
		 */
		if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
			goto csum;

		/* only grow when exactly one more checksum is needed */
		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
		if (diff != csum_size)
			goto insert;

		ret = btrfs_extend_item(trans, root, path, diff);
		BUG_ON(ret);
		goto csum;
	}

insert:
	btrfs_release_path(root, path);
	csum_offset = 0;
	if (found_next) {
		/*
		 * size the new item to hold every following sector sum that
		 * is disk-contiguous, capped by the next csum item's offset
		 * and by MAX_CSUM_ITEMS
		 */
		u64 tmp = total_bytes + root->sectorsize;
		u64 next_sector = sector_sum->bytenr;
		struct btrfs_sector_sum *next = sector_sum + 1;

		while (tmp < sums->len) {
			if (next_sector + root->sectorsize != next->bytenr)
				break;
			tmp += root->sectorsize;
			next_sector = next->bytenr;
			next++;
		}
		tmp = min(tmp, next_offset - file_key.offset);
		tmp >>= root->fs_info->sb->s_blocksize_bits;
		tmp = max((u64)1, tmp);
		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
		ins_size = csum_size * tmp;
	} else {
		ins_size = csum_size;
	}
	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
				      ins_size);
	if (ret < 0)
		goto fail_unlock;
	if (ret != 0) {
		WARN_ON(1);
		goto fail_unlock;
	}
csum:
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
	ret = 0;
	item = (struct btrfs_csum_item *)((unsigned char *)item +
					  csum_offset * csum_size);
found:
	/* item_end marks the first byte past the current csum item */
	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
				      btrfs_item_size_nr(leaf, path->slots[0]));
	eb_token = NULL;
	cond_resched();
next_sector:

	/*
	 * (re)map the extent buffer when nothing is mapped yet or the
	 * slot reaches the end of the current mapping; map_start and
	 * map_len are only read once eb_token is set
	 */
	if (!eb_token ||
	   (unsigned long)item + csum_size >= map_start + map_len) {
		int err;

		if (eb_token)
			unmap_extent_buffer(leaf, eb_token, KM_USER1);
		eb_token = NULL;
		err = map_private_extent_buffer(leaf, (unsigned long)item,
						csum_size,
						&eb_token, &eb_map,
						&map_start, &map_len, KM_USER1);
		if (err)
			eb_token = NULL;
	}
	/* copy through the mapping if we have one, else the slow path */
	if (eb_token) {
		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
		       &sector_sum->sum, csum_size);
	} else {
		write_extent_buffer(leaf, &sector_sum->sum,
				    (unsigned long)item, csum_size);
	}

	total_bytes += root->sectorsize;
	sector_sum++;
	if (total_bytes < sums->len) {
		item = (struct btrfs_csum_item *)((char *)item +
						  csum_size);
		/*
		 * NOTE(review): adjacency is tested with PAGE_CACHE_SIZE
		 * while the accounting above uses root->sectorsize --
		 * presumably these are equal here; confirm
		 */
		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
		    sector_sum->bytenr) {
			bytenr = sector_sum->bytenr;
			goto next_sector;
		}
	}
	if (eb_token) {
		unmap_extent_buffer(leaf, eb_token, KM_USER1);
		eb_token = NULL;
	}
	btrfs_mark_buffer_dirty(path->nodes[0]);
	cond_resched();
	if (total_bytes < sums->len) {
		btrfs_release_path(root, path);
		goto again;
	}
out:
	btrfs_free_path(path);
	return ret;

	/* error exit: nothing to undo beyond freeing the path */
fail_unlock:
	goto out;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
162
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 *
 * An @end of (u64)-1 means "to the end": no tail piece is ever kept.
 * With @skip_pinned set, mappings carrying EXTENT_FLAG_PINNED are left in
 * the tree and the remaining range is adjusted around them.
 *
 * Always returns 0.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		/*
		 * Allocate both possible remainder maps before taking the
		 * spinlock.
		 * NOTE(review): the results are not checked; if
		 * alloc_extent_map() can return NULL the split-> stores
		 * below would dereference it -- confirm
		 */
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

		spin_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			spin_unlock(&em_tree->lock);
			break;
		}
		/* snapshot the flags before the pinned bit is cleared below */
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			spin_unlock(&em_tree->lock);
			/* the pinned map covers everything that is left */
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			/* otherwise shrink the range to exclude the pinned map */
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		/* keep the part of the mapping in front of the range */
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		/* keep the part of the mapping behind the range */
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		spin_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}
275
/*
 * Placeholder consistency check for a file's extent items.  No checking
 * is performed: the function ignores both arguments and always returns 0.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
355 * in the range start - end. hint_block is filled in with a block number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
/*
 * Parameters and return value, as used by the code below:
 *
 * trans, root:  passed through to the btree lookup/insert/delete helpers
 *               and to the extent reference helpers.
 * inode:        file whose extent items are dropped; its byte accounting
 *               is adjusted with inode_sub_bytes()/inode_add_bytes().
 * start, end:   range to drop.  end is exclusive: an extent item whose key
 *               offset is >= end stops the walk, and the extent cache is
 *               dropped for start .. end - 1.
 * inline_limit: overwritten with 0 on entry, so the caller's value is
 *               ignored.
 * hint_byte:    out parameter; set to the disk bytenr of extents seen
 *               during the walk, or to EXTENT_MAP_INLINE when the range
 *               lies inside an inline extent.
 *
 * Returns -ENOMEM if the path cannot be allocated, the negative value from
 * btrfs_lookup_file_extent() if the lookup fails, and 0 otherwise.
 * Failures of item deletion/insertion and of the reference updates hit
 * BUG_ON() instead of being returned.
 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
	/* end of the io_tree range this function has locked beyond 'end' */
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
	/* copy of the extent item taken before it is modified or deleted */
385 struct btrfs_file_extent_item old;
	/* keep: the current item stays in the tree (truncated, not deleted) */
386 int keep;
387 int slot;
	/* bookend: the current extent extends past 'end', its tail survives */
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
	/*
	 * The caller's inline_limit is discarded here.  With a limit of 0 the
	 * 'key.offset < inline_limit' test further down can never be true, so
	 * the inline truncation branch it guards is never taken.
	 */
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
	/* each iteration re-searches the tree from search_start */
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
	/*
	 * No exact match: step back one slot so the item that precedes
	 * search_start is examined.  At slot 0 there is nothing to step back
	 * to and the function returns 0.
	 */
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
	/* per-item state is reset for every slot that is examined */
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
	/* stop at the first extent item at or past the end of the range ... */
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
	/* ... or once the walk has left this inode's extent items */
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
	/*
	 * recow is set after btrfs_next_leaf() moved to a new leaf: restart
	 * the search from this key through btrfs_lookup_file_extent().
	 */
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
	/* extent_end is used as a scratch variable for the disk bytenr */
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
	/*
	 * NOTE(review): any non-zero nextret ends the walk with ret == 0, so
	 * a negative (error) result would be reported as success -- confirm
	 * this is intended.
	 */
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
	/* the whole range sits inside one inline extent */
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
	/*
	 * Snapshot the item and the leaf's generation/owner/start now; they
	 * are used after the item may have been deleted and the path released.
	 */
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
	/*
	 * The extent reaches past the range locked so far.  Try to extend the
	 * lock without blocking; if that fails, release the path, take the
	 * lock with lock_extent() and restart the search.
	 */
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
	/*
	 * Take an extra reference on the disk extent for the bookend item
	 * created below; a disk_bytenr of 0 has no extent to reference.
	 */
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
	/* next search resumes after this item (sector aligned for inline) */
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
	/* only extents with a non-zero disk bytenr are counted in the inode */
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 btrfs_set_file_extent_num_bytes(leaf,
560 extent, new_num);
561 btrfs_mark_buffer_dirty(leaf);
	/* never true while inline_limit is forced to 0 at function entry */
562 } else if (key.offset < inline_limit &&
563 (end > extent_end) &&
564 (inline_limit < extent_end)) {
565 u32 new_size;
566 new_size = btrfs_file_extent_calc_inline_size(
567 inline_limit - key.offset);
568 inode_sub_bytes(inode, extent_end -
569 inline_limit);
570 btrfs_set_file_extent_ram_bytes(leaf, extent,
571 new_size);
572 if (!compression && !encryption) {
573 btrfs_truncate_item(trans, root, path,
574 new_size, 1);
575 }
576 }
577 }
578 /* delete the entire extent */
579 if (!keep) {
580 if (found_inline)
581 inode_sub_bytes(inode, extent_end -
582 key.offset);
583 ret = btrfs_del_item(trans, root, path);
584 /* TODO update progress marker and return */
585 BUG_ON(ret);
586 extent = NULL;
587 btrfs_release_path(root, path);
588 /* the extent will be freed later */
589 }
	/* inline extent that extends past 'end': cut off its front part */
590 if (bookend && found_inline && start <= key.offset) {
591 u32 new_size;
592 new_size = btrfs_file_extent_calc_inline_size(
593 extent_end - end);
594 inode_sub_bytes(inode, end - key.offset);
595 btrfs_set_file_extent_ram_bytes(leaf, extent,
596 new_size);
597 if (!compression && !encryption)
598 ret = btrfs_truncate_item(trans, root, path,
599 new_size, 0);
600 BUG_ON(ret);
601 }
602 /* create bookend, splitting the extent in two */
603 if (bookend && found_extent) {
604 struct btrfs_key ins;
605 ins.objectid = inode->i_ino;
606 ins.offset = end;
607 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
608
609 btrfs_release_path(root, path);
610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
611 sizeof(*extent));
612 BUG_ON(ret);
613
	/* start from the saved copy, then fix up the fields that differ */
614 leaf = path->nodes[0];
615 extent = btrfs_item_ptr(leaf, path->slots[0],
616 struct btrfs_file_extent_item);
617 write_extent_buffer(leaf, &old,
618 (unsigned long)extent, sizeof(old));
619
620 btrfs_set_file_extent_compression(leaf, extent,
621 compression);
622 btrfs_set_file_extent_encryption(leaf, extent,
623 encryption);
624 btrfs_set_file_extent_other_encoding(leaf, extent,
625 other_encoding);
626 btrfs_set_file_extent_offset(leaf, extent,
627 le64_to_cpu(old.offset) + end - key.offset);
628 WARN_ON(le64_to_cpu(old.num_bytes) <
629 (extent_end - end));
630 btrfs_set_file_extent_num_bytes(leaf, extent,
631 extent_end - end);
632
633 /*
634 * set the ram bytes to the size of the full extent
635 * before splitting. This is a worst case flag,
636 * but its the best we can do because we don't know
637 * how splitting affects compression
638 */
639 btrfs_set_file_extent_ram_bytes(leaf, extent,
640 ram_bytes);
641 btrfs_set_file_extent_type(leaf, extent, found_type);
642
643 btrfs_mark_buffer_dirty(path->nodes[0]);
644
	/*
	 * The reference taken above was recorded against orig_parent; move it
	 * to the leaf that now holds the bookend item.
	 */
645 if (disk_bytenr != 0) {
646 ret = btrfs_update_extent_ref(trans, root,
647 disk_bytenr, orig_parent,
648 leaf->start,
649 root->root_key.objectid,
650 trans->transid, ins.objectid);
651
652 BUG_ON(ret);
653 }
654 btrfs_release_path(root, path);
655 if (disk_bytenr != 0)
656 inode_add_bytes(inode, extent_end - end);
657 }
658
	/*
	 * The item was deleted above: drop its reference on the disk extent,
	 * using the leaf information captured before the deletion.
	 */
659 if (found_extent && !keep) {
660 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
661
662 if (old_disk_bytenr != 0) {
663 inode_sub_bytes(inode,
664 le64_to_cpu(old.num_bytes));
665 ret = btrfs_free_extent(trans, root,
666 old_disk_bytenr,
667 le64_to_cpu(old.disk_num_bytes),
668 leaf_start, root_owner,
669 root_gen, key.objectid, 0);
670 BUG_ON(ret);
671 *hint_byte = old_disk_bytenr;
672 }
673 }
674
675 if (search_start >= end) {
676 ret = 0;
677 goto out;
678 }
679 }
680out:
681 btrfs_free_path(path);
	/* release whatever was locked beyond 'end' for bookend handling */
682 if (locked_end > end) {
683 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
684 GFP_NOFS);
685 }
686 btrfs_check_file(root, inode);
687 return ret;
688}
689
690static int extent_mergeable(struct extent_buffer *leaf, int slot,
691 u64 objectid, u64 bytenr, u64 *start, u64 *end)
692{
693 struct btrfs_file_extent_item *fi;
694 struct btrfs_key key;
695 u64 extent_end;
696
697 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
698 return 0;
699
700 btrfs_item_key_to_cpu(leaf, &key, slot);
701 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
702 return 0;
703
704 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
705 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
706 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
707 btrfs_file_extent_compression(leaf, fi) ||
708 btrfs_file_extent_encryption(leaf, fi) ||
709 btrfs_file_extent_other_encoding(leaf, fi))
710 return 0;
711
712 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
713 if ((*start && *start != key.offset) || (*end && *end != extent_end))
714 return 0;
715
716 *start = key.offset;
717 *end = extent_end;
718 return 1;
719}
720
721/*
722 * Mark extent in the range start - end as written.
723 *
724 * This changes extent type from 'pre-allocated' to 'regular'. If only
725 * part of extent is marked as written, the extent will be split into
726 * two or three.
727 */
/*
 * Notes on how the code below works:
 *
 * The range start..end must lie inside a single pre-allocated extent item;
 * anything else hits BUG_ON().  The item is split in up to two passes
 * through the 'again' label: 'split' is the file offset currently being
 * split at, first 'start' and then 'end'.
 *
 * Always returns 0.  Failure to allocate the path, unexpected items and
 * errors from the insert/delete/reference helpers hit BUG_ON().
 */
728int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
729 struct btrfs_root *root,
730 struct inode *inode, u64 start, u64 end)
731{
732 struct extent_buffer *leaf;
733 struct btrfs_path *path;
734 struct btrfs_file_extent_item *fi;
735 struct btrfs_key key;
736 u64 bytenr;
737 u64 num_bytes;
738 u64 extent_end;
739 u64 extent_offset;
740 u64 other_start;
741 u64 other_end;
	/* file offset of the split handled in the current pass */
742 u64 split = start;
	/* end of the io_tree range locked by this function beyond 'end' */
743 u64 locked_end = end;
744 u64 orig_parent;
745 int extent_type;
	/* cleared once no further split at 'end' is required */
746 int split_end = 1;
747 int ret;
748
749 btrfs_drop_extent_cache(inode, start, end - 1, 0);
750
751 path = btrfs_alloc_path();
752 BUG_ON(!path);
753again:
754 key.objectid = inode->i_ino;
755 key.type = BTRFS_EXTENT_DATA_KEY;
	/* on the second pass (split == end) search for the byte before it */
756 if (split == start)
757 key.offset = split;
758 else
759 key.offset = split - 1;
760
	/*
	 * NOTE(review): only ret > 0 is handled here; a negative result from
	 * btrfs_search_slot() is not checked -- confirm callers cannot hit it.
	 */
761 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
762 if (ret > 0 && path->slots[0] > 0)
763 path->slots[0]--;
764
765 leaf = path->nodes[0];
766 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
767 BUG_ON(key.objectid != inode->i_ino ||
768 key.type != BTRFS_EXTENT_DATA_KEY);
769 fi = btrfs_item_ptr(leaf, path->slots[0],
770 struct btrfs_file_extent_item);
771 extent_type = btrfs_file_extent_type(leaf, fi);
772 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
773 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
774 BUG_ON(key.offset > start || extent_end < end);
775
776 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
777 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
778 extent_offset = btrfs_file_extent_offset(leaf, fi);
779
	/* the item already begins at 'start': only 'end' is left to split */
780 if (key.offset == start)
781 split = end;
782
	/*
	 * Case 1: the item covers exactly start..end.  No split is needed;
	 * the item is converted to a regular extent and merged with
	 * whichever neighbours extent_mergeable() accepts.
	 */
783 if (key.offset == start && extent_end == end) {
784 int del_nr = 0;
785 int del_slot = 0;
786 u64 leaf_owner = btrfs_header_owner(leaf);
787 u64 leaf_gen = btrfs_header_generation(leaf);
788 other_start = end;
789 other_end = 0;
	/* following item begins at 'end': absorb it, drop one reference */
790 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
791 bytenr, &other_start, &other_end)) {
792 extent_end = other_end;
793 del_slot = path->slots[0] + 1;
794 del_nr++;
795 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
796 leaf->start, leaf_owner,
797 leaf_gen, inode->i_ino, 0);
798 BUG_ON(ret);
799 }
800 other_start = 0;
801 other_end = start;
	/* preceding item finishes at 'start': merge into it instead */
802 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
803 bytenr, &other_start, &other_end)) {
804 key.offset = other_start;
805 del_slot = path->slots[0];
806 del_nr++;
807 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
808 leaf->start, leaf_owner,
809 leaf_gen, inode->i_ino, 0);
810 BUG_ON(ret);
811 }
812 split_end = 0;
813 if (del_nr == 0) {
814 btrfs_set_file_extent_type(leaf, fi,
815 BTRFS_FILE_EXTENT_REG);
816 goto done;
817 }
818
	/* the item just before the deleted ones becomes the merged extent */
819 fi = btrfs_item_ptr(leaf, del_slot - 1,
820 struct btrfs_file_extent_item);
821 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
822 btrfs_set_file_extent_num_bytes(leaf, fi,
823 extent_end - key.offset);
824 btrfs_mark_buffer_dirty(leaf);
825
826 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
827 BUG_ON(ret);
828 goto done;
	/*
	 * Case 2: the item begins before 'start'.  Shrink it to end at
	 * 'start'; the part from 'start' on gets a new item inserted below.
	 */
829 } else if (split == start) {
	/*
	 * Extend the extent lock to the end of the item: try without
	 * blocking first, otherwise release the path, take the lock with
	 * lock_extent() and redo the search.
	 */
830 if (locked_end < extent_end) {
831 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
832 locked_end, extent_end - 1, GFP_NOFS);
833 if (!ret) {
834 btrfs_release_path(root, path);
835 lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 locked_end = extent_end;
838 goto again;
839 }
840 locked_end = extent_end;
841 }
842 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
843 extent_offset += split - key.offset;
	/*
	 * Case 3: the item begins at 'start' and extends past 'end'.  Move
	 * the existing item so that it begins at 'end'; the part start..end
	 * gets a new item inserted below.
	 */
844 } else {
845 BUG_ON(key.offset != start);
846 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
847 split - key.offset);
848 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
849 key.offset = split;
850 btrfs_set_item_key_safe(trans, root, path, &key);
851 extent_end = split;
852 }
853
	/*
	 * The new item finishes exactly at 'end', so it is the written part:
	 * it is inserted as a regular extent and no second pass is needed.
	 * Otherwise it keeps the pre-allocated type and is split again.
	 */
854 if (extent_end == end) {
855 split_end = 0;
856 extent_type = BTRFS_FILE_EXTENT_REG;
857 }
	/* instead of inserting, grow a mergeable following item backwards */
858 if (extent_end == end && split == start) {
859 other_start = end;
860 other_end = 0;
861 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
862 bytenr, &other_start, &other_end)) {
863 path->slots[0]++;
864 fi = btrfs_item_ptr(leaf, path->slots[0],
865 struct btrfs_file_extent_item);
866 key.offset = split;
867 btrfs_set_item_key_safe(trans, root, path, &key);
868 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
869 btrfs_set_file_extent_num_bytes(leaf, fi,
870 other_end - split);
871 goto done;
872 }
873 }
	/* instead of inserting, grow a mergeable preceding item forwards */
874 if (extent_end == end && split == end) {
875 other_start = 0;
876 other_end = start;
877 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
878 bytenr, &other_start, &other_end)) {
879 path->slots[0]--;
880 fi = btrfs_item_ptr(leaf, path->slots[0],
881 struct btrfs_file_extent_item);
882 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
883 other_start);
884 goto done;
885 }
886 }
887
888 btrfs_mark_buffer_dirty(leaf);
889
	/* the new item below is one more reference to the same disk extent */
890 orig_parent = leaf->start;
891 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
892 orig_parent, root->root_key.objectid,
893 trans->transid, inode->i_ino);
894 BUG_ON(ret);
895 btrfs_release_path(root, path);
896
897 key.offset = start;
898 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
899 BUG_ON(ret);
900
901 leaf = path->nodes[0];
902 fi = btrfs_item_ptr(leaf, path->slots[0],
903 struct btrfs_file_extent_item);
904 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
905 btrfs_set_file_extent_type(leaf, fi, extent_type);
906 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
907 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
908 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
909 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
910 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
911 btrfs_set_file_extent_compression(leaf, fi, 0);
912 btrfs_set_file_extent_encryption(leaf, fi, 0);
913 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
914
	/* the item landed in a different leaf than the reference was made for */
915 if (orig_parent != leaf->start) {
916 ret = btrfs_update_extent_ref(trans, root, bytenr,
917 orig_parent, leaf->start,
918 root->root_key.objectid,
919 trans->transid, inode->i_ino);
920 BUG_ON(ret);
921 }
922done:
923 btrfs_mark_buffer_dirty(leaf);
924 btrfs_release_path(root, path);
	/* first pass split at 'start'; run a second pass to split at 'end' */
925 if (split_end && split == start) {
926 split = end;
927 goto again;
928 }
929 if (locked_end > end) {
930 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
931 GFP_NOFS);
932 }
933 btrfs_free_path(path);
934 return 0;
935}
936
937/*
938 * this gets pages into the page cache and locks them down, it also properly
939 * waits for data=ordered extents to finish before allowing the pages to be
940 * modified.
941 */
942static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
943 struct page **pages, size_t num_pages,
944 loff_t pos, unsigned long first_index,
945 unsigned long last_index, size_t write_bytes)
946{
947 int i;
948 unsigned long index = pos >> PAGE_CACHE_SHIFT;
949 struct inode *inode = fdentry(file)->d_inode;
950 int err = 0;
951 u64 start_pos;
952 u64 last_pos;
953
954 start_pos = pos & ~((u64)root->sectorsize - 1);
955 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
956
957 if (start_pos > inode->i_size) {
958 err = btrfs_cont_expand(inode, start_pos);
959 if (err)
960 return err;
961 }
962
963 memset(pages, 0, num_pages * sizeof(struct page *));
964again:
965 for (i = 0; i < num_pages; i++) {
966 pages[i] = grab_cache_page(inode->i_mapping, index + i);
967 if (!pages[i]) {
968 err = -ENOMEM;
969 BUG_ON(1);
970 }
971 wait_on_page_writeback(pages[i]);
972 }
973 if (start_pos < inode->i_size) {
974 struct btrfs_ordered_extent *ordered;
975 lock_extent(&BTRFS_I(inode)->io_tree,
976 start_pos, last_pos - 1, GFP_NOFS);
977 ordered = btrfs_lookup_first_ordered_extent(inode,
978 last_pos - 1);
979 if (ordered &&
980 ordered->file_offset + ordered->len > start_pos &&
981 ordered->file_offset < last_pos) {
982 btrfs_put_ordered_extent(ordered);
983 unlock_extent(&BTRFS_I(inode)->io_tree,
984 start_pos, last_pos - 1, GFP_NOFS);
985 for (i = 0; i < num_pages; i++) {
986 unlock_page(pages[i]);
987 page_cache_release(pages[i]);
988 }
989 btrfs_wait_ordered_range(inode, start_pos,
990 last_pos - start_pos);
991 goto again;
992 }
993 if (ordered)
994 btrfs_put_ordered_extent(ordered);
995
996 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
997 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
998 GFP_NOFS);
999 unlock_extent(&BTRFS_I(inode)->io_tree,
1000 start_pos, last_pos - 1, GFP_NOFS);
1001 }
1002 for (i = 0; i < num_pages; i++) {
1003 clear_page_dirty_for_io(pages[i]);
1004 set_page_extent_mapped(pages[i]);
1005 WARN_ON(!PageLocked(pages[i]));
1006 }
1007 return 0;
1008}
1009
1010static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1011 size_t count, loff_t *ppos)
1012{
1013 loff_t pos;
1014 loff_t start_pos;
1015 ssize_t num_written = 0;
1016 ssize_t err = 0;
1017 int ret = 0;
1018 struct inode *inode = fdentry(file)->d_inode;
1019 struct btrfs_root *root = BTRFS_I(inode)->root;
1020 struct page **pages = NULL;
1021 int nrptrs;
1022 struct page *pinned[2];
1023 unsigned long first_index;
1024 unsigned long last_index;
1025 int will_write;
1026
1027 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1028 (file->f_flags & O_DIRECT));
1029
1030 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1031 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1032 pinned[0] = NULL;
1033 pinned[1] = NULL;
1034
1035 pos = *ppos;
1036 start_pos = pos;
1037
1038 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1039 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1040 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1041 if (err)
1042 goto out_nolock;
1043 if (count == 0)
1044 goto out_nolock;
1045
1046 err = file_remove_suid(file);
1047 if (err)
1048 goto out_nolock;
1049 file_update_time(file);
1050
1051 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1052
1053 mutex_lock(&inode->i_mutex);
1054 BTRFS_I(inode)->sequence++;
1055 first_index = pos >> PAGE_CACHE_SHIFT;
1056 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1057
1058 /*
1059 * there are lots of better ways to do this, but this code
1060 * makes sure the first and last page in the file range are
1061 * up to date and ready for cow
1062 */
1063 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1064 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1065 if (!PageUptodate(pinned[0])) {
1066 ret = btrfs_readpage(NULL, pinned[0]);
1067 BUG_ON(ret);
1068 wait_on_page_locked(pinned[0]);
1069 } else {
1070 unlock_page(pinned[0]);
1071 }
1072 }
1073 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1074 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1075 if (!PageUptodate(pinned[1])) {
1076 ret = btrfs_readpage(NULL, pinned[1]);
1077 BUG_ON(ret);
1078 wait_on_page_locked(pinned[1]);
1079 } else {
1080 unlock_page(pinned[1]);
1081 }
1082 }
1083
1084 while (count > 0) {
1085 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1086 size_t write_bytes = min(count, nrptrs *
1087 (size_t)PAGE_CACHE_SIZE -
1088 offset);
1089 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1090 PAGE_CACHE_SHIFT;
1091
1092 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094
1095 ret = btrfs_check_free_space(root, write_bytes, 0);
1096 if (ret)
1097 goto out;
1098
1099 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index,
1101 write_bytes);
1102 if (ret)
1103 goto out;
1104
1105 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf);
1107 if (ret) {
1108 btrfs_drop_pages(pages, num_pages);
1109 goto out;
1110 }
1111
1112 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages);
1115 if (ret)
1116 goto out;
1117
1118 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos,
1120 pos + write_bytes - 1,
1121 WB_SYNC_NONE);
1122 } else {
1123 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1124 num_pages);
1125 if (num_pages <
1126 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1127 btrfs_btree_balance_dirty(root, 1);
1128 btrfs_throttle(root);
1129 }
1130
1131 buf += write_bytes;
1132 count -= write_bytes;
1133 pos += write_bytes;
1134 num_written += write_bytes;
1135
1136 cond_resched();
1137 }
1138out:
1139 mutex_unlock(&inode->i_mutex);
1140
1141out_nolock:
1142 kfree(pages);
1143 if (pinned[0])
1144 page_cache_release(pinned[0]);
1145 if (pinned[1])
1146 page_cache_release(pinned[1]);
1147 *ppos = pos;
1148
1149 if (num_written > 0 && will_write) {
1150 struct btrfs_trans_handle *trans;
1151
1152 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1153 if (err)
1154 num_written = err;
1155
1156 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1157 trans = btrfs_start_transaction(root, 1);
1158 ret = btrfs_log_dentry_safe(trans, root,
1159 file->f_dentry);
1160 if (ret == 0) {
1161 btrfs_sync_log(trans, root);
1162 btrfs_end_transaction(trans, root);
1163 } else {
1164 btrfs_commit_transaction(trans, root);
1165 }
1166 }
1167 if (file->f_flags & O_DIRECT) {
1168 invalidate_mapping_pages(inode->i_mapping,
1169 start_pos >> PAGE_CACHE_SHIFT,
1170 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1171 }
1172 }
1173 current->backing_dev_info = NULL;
1174 return num_written ? num_written : err;
1175}
1176
1177int btrfs_release_file(struct inode *inode, struct file *filp)
1178{
1179 if (filp->private_data)
1180 btrfs_ioctl_trans_end(filp);
1181 return 0;
1182}
1183
1184/*
1185 * fsync call for both files and directories. This logs the inode into
1186 * the tree log instead of forcing full commits whenever possible.
1187 *
1188 * It needs to call filemap_fdatawait so that all ordered extent updates are
1189 * in the metadata btree are up to date for copying to the log.
1190 *
1191 * It drops the inode mutex before doing the tree log commit. This is an
1192 * important optimization for directories because holding the mutex prevents
1193 * new operations on the dir while we write to disk.
1194 */
1195int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1196{
1197 struct inode *inode = dentry->d_inode;
1198 struct btrfs_root *root = BTRFS_I(inode)->root;
1199 int ret = 0;
1200 struct btrfs_trans_handle *trans;
1201
1202 /*
1203 * check the transaction that last modified this inode
1204 * and see if its already been committed
1205 */
1206 if (!BTRFS_I(inode)->last_trans)
1207 goto out;
1208
1209 mutex_lock(&root->fs_info->trans_mutex);
1210 if (BTRFS_I(inode)->last_trans <=
1211 root->fs_info->last_trans_committed) {
1212 BTRFS_I(inode)->last_trans = 0;
1213 mutex_unlock(&root->fs_info->trans_mutex);
1214 goto out;
1215 }
1216 mutex_unlock(&root->fs_info->trans_mutex);
1217
1218 root->fs_info->tree_log_batch++;
1219 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++;
1222
1223 /*
1224 * ok we haven't committed the transaction yet, lets do a commit
1225 */
1226 if (file->private_data)
1227 btrfs_ioctl_trans_end(file);
1228
1229 trans = btrfs_start_transaction(root, 1);
1230 if (!trans) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1236 if (ret < 0)
1237 goto out;
1238
1239 /* we've logged all the items and now have a consistent
1240 * version of the file in the log. It is possible that
1241 * someone will come in and modify the file, but that's
1242 * fine because the log is consistent on disk, and we
1243 * have references to all of the file's extents
1244 *
1245 * It is possible that someone will come in and log the
1246 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe.
1248 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1250
1251 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root);
1253 } else {
1254 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root);
1256 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1258out:
1259 return ret > 0 ? EIO : ret;
1260}
1261
1262static struct vm_operations_struct btrfs_file_vm_ops = {
1263 .fault = filemap_fault,
1264 .page_mkwrite = btrfs_page_mkwrite,
1265};
1266
1267static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1268{
1269 vma->vm_ops = &btrfs_file_vm_ops;
1270 file_accessed(filp);
1271 return 0;
1272}
1273
1274struct file_operations btrfs_file_operations = {
1275 .llseek = generic_file_llseek,
1276 .read = do_sync_read,
1277 .aio_read = generic_file_aio_read,
1278 .splice_read = generic_file_splice_read,
1279 .write = btrfs_file_write,
1280 .mmap = btrfs_file_mmap,
1281 .open = generic_file_open,
1282 .release = btrfs_release_file,
1283 .fsync = btrfs_sync_file,
1284 .unlocked_ioctl = btrfs_ioctl,
1285#ifdef CONFIG_COMPAT
1286 .compat_ioctl = btrfs_ioctl,
1287#endif
1288};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
/*
 * Search the offset-indexed tree for @offset.
 *
 * An entry that starts exactly at @offset, or that starts before it and
 * still covers it, is returned right away provided it is at least @bytes
 * long.  When @contains is zero, entries that start after @offset and
 * are at least @bytes long are also remembered as fallbacks while
 * descending to the left, preferring the lowest starting offset seen.
 * When @contains is set no such fallback is recorded.
 *
 * Returns the chosen entry, or NULL if nothing qualified.
 */
static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
						   u64 offset, u64 bytes,
						   int contains)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_free_space *entry, *ret = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, offset_index);

		if (offset < entry->offset) {
			/* entry starts after @offset: fallback candidate only */
			if (!contains &&
			    (!ret || entry->offset < ret->offset) &&
			    (bytes <= entry->bytes))
				ret = entry;
			n = n->rb_left;
		} else if (offset > entry->offset) {
			/* entry starts before @offset: does it still cover it? */
			if ((entry->offset + entry->bytes - 1) >= offset &&
			    bytes <= entry->bytes) {
				ret = entry;
				break;
			}
			n = n->rb_right;
		} else {
			/* exact start match, but too small: keep looking right */
			if (bytes > entry->bytes) {
				n = n->rb_right;
				continue;
			}
			ret = entry;
			break;
		}
	}

	return ret;
}
111
/*
 * Search the size-indexed tree for a chunk of at least @bytes that does
 * not start before @offset.  Among the entries visited on the way down,
 * exact-size matches and larger entries are both recorded as candidates
 * using the preference rules in the branches below.
 *
 * Returns the selected entry, or NULL if no visited entry qualified.
 */
static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
						  u64 offset, u64 bytes)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_free_space *entry, *ret = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, bytes_index);

		if (bytes < entry->bytes) {
			/*
			 * We prefer to get a hole size as close to the size we
			 * are asking for so we don't take small slivers out of
			 * huge holes, but we also want to get as close to the
			 * offset as possible so we don't have a whole lot of
			 * fragmentation.
			 */
			if (offset <= entry->offset) {
				if (!ret)
					ret = entry;
				else if (entry->bytes < ret->bytes)
					ret = entry;
				else if (entry->offset < ret->offset)
					ret = entry;
			}
			n = n->rb_left;
		} else if (bytes > entry->bytes) {
			/* too small, larger entries are to the right */
			n = n->rb_right;
		} else {
			/*
			 * Ok we may have multiple chunks of the wanted size,
			 * so we don't want to take the first one we find, we
			 * want to take the one closest to our given offset, so
			 * keep searching just in case there's a better match.
			 */
			n = n->rb_right;
			if (offset > entry->offset)
				continue;
			else if (!ret || entry->offset < ret->offset)
				ret = entry;
		}
	}

	return ret;
}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
/*
 * Insert the range [@offset, @offset + @bytes) into @block_group's free
 * space trees, merging it with an entry that starts right after it
 * and/or an entry that ends right before it.
 *
 * No locking is done in this function.  Returns 0 on success and -ENOMEM
 * when the up-front allocation fails.  Overlapping an existing entry, or
 * getting -EEXIST while linking, ends in BUG().
 */
static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
				  u64 offset, u64 bytes)
{
	struct btrfs_free_space *right_info;
	struct btrfs_free_space *left_info;
	struct btrfs_free_space *info = NULL;
	struct btrfs_free_space *alloc_info;
	int ret = 0;

	/*
	 * allocate before looking at the trees; if a merge makes the new
	 * struct unnecessary it is freed again at "out"
	 */
	alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
	if (!alloc_info)
		return -ENOMEM;

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 *
	 * NOTE(review): offset-1 wraps around when offset is 0.
	 */
	right_info = tree_search_offset(&block_group->free_space_offset,
					offset+bytes, 0, 1);
	left_info = tree_search_offset(&block_group->free_space_offset,
				       offset-1, 0, 1);

	if (right_info && right_info->offset == offset+bytes) {
		/* right neighbour starts where we end: grow it downwards */
		unlink_free_space(block_group, right_info);
		info = right_info;
		info->offset = offset;
		info->bytes += bytes;
	} else if (right_info && right_info->offset != offset+bytes) {
		/* an entry covers our end without starting there: overlap */
		printk(KERN_ERR "btrfs adding space in the middle of an "
		       "existing free space area. existing: "
		       "offset=%llu, bytes=%llu. new: offset=%llu, "
		       "bytes=%llu\n", (unsigned long long)right_info->offset,
		       (unsigned long long)right_info->bytes,
		       (unsigned long long)offset,
		       (unsigned long long)bytes);
		BUG();
	}

	if (left_info) {
		unlink_free_space(block_group, left_info);

		/* the left neighbour must end exactly where we begin */
		if (unlikely((left_info->offset + left_info->bytes) !=
			     offset)) {
			printk(KERN_ERR "btrfs free space to the left "
			       "of new free space isn't "
			       "quite right. existing: offset=%llu, "
			       "bytes=%llu. new: offset=%llu, bytes=%llu\n",
			       (unsigned long long)left_info->offset,
			       (unsigned long long)left_info->bytes,
			       (unsigned long long)offset,
			       (unsigned long long)bytes);
			BUG();
		}

		if (info) {
			/* merged on both sides: fold left into right, drop left */
			info->offset = left_info->offset;
			info->bytes += left_info->bytes;
			kfree(left_info);
		} else {
			/* only a left neighbour: extend it upwards */
			info = left_info;
			info->bytes += bytes;
		}
	}

	if (info) {
		/* re-insert the merged entry under its new offset/size */
		ret = link_free_space(block_group, info);
		if (!ret)
			info = NULL;
		goto out;
	}

	/* no neighbour to merge with: use the preallocated struct */
	info = alloc_info;
	alloc_info = NULL;
	info->offset = offset;
	info->bytes = bytes;

	ret = link_free_space(block_group, info);
	if (ret)
		kfree(info);
out:
	if (ret) {
		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
		if (ret == -EEXIST)
			BUG();
	}

	/* NULL when the preallocated struct was consumed above */
	kfree(alloc_info);

	return ret;
}
278
/*
 * Remove the range [@offset, @offset + @bytes) from @block_group's free
 * space trees.  No locking is done in this function.
 *
 * Three cases are handled:
 *  - an entry starts exactly at @offset: it is shrunk from the front, or
 *    freed when the whole entry is consumed;
 *  - an entry starts before @offset and reaches at least to the end of
 *    the range: it is split around the hole;
 *  - anything else triggers WARN_ON(1) and still returns 0.
 *
 * Returns -EINVAL when the entry starting at @offset is shorter than
 * @bytes, otherwise 0 (failures while re-linking hit BUG_ON).
 */
static int
__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			  u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	int ret = 0;

	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
				  1);

	if (info && info->offset == offset) {
		if (info->bytes < bytes) {
			printk(KERN_ERR "Found free space at %llu, size %llu,"
			       "trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);
			WARN_ON(1);
			ret = -EINVAL;
			goto out;
		}
		unlink_free_space(block_group, info);

		/* the entry is consumed completely */
		if (info->bytes == bytes) {
			kfree(info);
			goto out;
		}

		/* trim the front and re-insert what is left */
		info->offset += bytes;
		info->bytes -= bytes;

		ret = link_free_space(block_group, info);
		BUG_ON(ret);
	} else if (info && info->offset < offset &&
		   info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * we're freeing space in the middle of the info,
		 * this can happen during tree log replay
		 *
		 * first unlink the old info and then
		 * insert it again after the hole we're creating
		 */
		unlink_free_space(block_group, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(block_group, info);
			BUG_ON(ret);
		} else {
			/* the hole we're creating ends at the end
			 * of the info struct, just free the info
			 */
			kfree(info);
		}

		/* step two, insert a new info struct to cover anything
		 * before the hole
		 */
		ret = __btrfs_add_free_space(block_group, old_start,
					     offset - old_start);
		BUG_ON(ret);
	} else {
		/* no entry matches the requested range */
		WARN_ON(1);
	}
out:
	return ret;
}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
/*
 * Compiled out: locked wrappers around tree_search_offset() and
 * tree_search_bytes().  Both take alloc_mutex only for the duration of
 * the lookup and return the entry after dropping it.
 */
#if 0
static struct btrfs_free_space *btrfs_find_free_space_offset(struct
						      btrfs_block_group_cache
						      *block_group, u64 offset,
						      u64 bytes)
{
	struct btrfs_free_space *ret;

	mutex_lock(&block_group->alloc_mutex);
	ret = tree_search_offset(&block_group->free_space_offset, offset,
				 bytes, 0);
	mutex_unlock(&block_group->alloc_mutex);

	return ret;
}

static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
						     btrfs_block_group_cache
						     *block_group, u64 offset,
						     u64 bytes)
{
	struct btrfs_free_space *ret;

	mutex_lock(&block_group->alloc_mutex);

	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
	mutex_unlock(&block_group->alloc_mutex);

	return ret;
}
#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
/*
 * An inode number paired with a btrfs root.
 * NOTE(review): presumably the lookup key handed to the iget helpers;
 * the users are not in this part of the file.
 */
struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};
59
/*
 * Forward declarations of the operation tables; their initializers are
 * not in this part of the file.
 */
static struct inode_operations btrfs_dir_inode_operations;
static struct inode_operations btrfs_symlink_inode_operations;
static struct inode_operations btrfs_dir_ro_inode_operations;
static struct inode_operations btrfs_special_inode_operations;
static struct inode_operations btrfs_file_inode_operations;
static struct address_space_operations btrfs_aops;
static struct address_space_operations btrfs_symlink_aops;
static struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

/*
 * kmem caches; only the inode cache is private to this file, the others
 * have external linkage.
 */
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_bit_radix_cachep;
struct kmem_cache *btrfs_path_cachep;
75
/*
 * Table translating a file type taken from an inode mode into the
 * matching BTRFS_FT_* value.  It is indexed by the S_IF* constant
 * shifted right by S_SHIFT.
 */
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
86
/*
 * Forward declarations; the definitions are not in this part of the
 * file.
 */
static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
121
/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 *
 * @start is the file offset of the extent and @size the uncompressed
 * length.  When both @compressed_size and @compressed_pages are given,
 * the item payload is copied from those pages and marked as zlib
 * compressed; otherwise @size bytes are copied from the page cache page
 * that holds @start.
 *
 * Returns 0 on success and -ENOMEM if no path can be allocated.
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;
	int use_compress = 0;

	/* the item is sized for the compressed payload when there is one */
	if (compressed_size && compressed_pages) {
		use_compress = 1;
		cur_size = compressed_size;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	/*
	 * NOTE(review): BUG_ON(ret) makes the error branch below
	 * unreachable; if it were reached, the inode_add_bytes() above
	 * would not be undone.
	 */
	BUG_ON(ret);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (use_compress) {
		struct page *cpage;
		int i = 0;
		/* copy the compressed data page by page into the item */
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_CACHE_SIZE);

			kaddr = kmap(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap(cpage);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  BTRFS_COMPRESS_ZLIB);
	} else {
		/*
		 * NOTE(review): the find_get_page() result is used without
		 * a NULL check.
		 */
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/* NOTE(review): the btrfs_update_inode() result is ignored */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);
	return 0;
fail:
	btrfs_free_path(path);
	return err;
}
218
219
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 *
 * Returns 1 when the range does not qualify (nothing is changed) and 0
 * after the inline extent has been inserted.  Failures of the drop or
 * insert steps hit BUG_ON.
 */
static int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	/*
	 * only a range that starts at file offset 0, ends before the first
	 * page boundary, reaches i_size and is small enough qualifies;
	 * uncompressed data ending exactly on a sector boundary is also
	 * rejected
	 */
	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	/* clear out whatever extents currently cover the range */
	ret = btrfs_drop_extents(trans, root, inode, start,
				 aligned_end, start, &hint_byte);
	BUG_ON(ret);

	/*
	 * NOTE(review): this looks unreachable, since isize > actual_end
	 * implies end + 1 < isize, which already returned 1 above.
	 */
	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compressed_pages);
	BUG_ON(ret);
	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
	return 0;
}
266
267struct async_extent {
268 u64 start;
269 u64 ram_size;
270 u64 compressed_size;
271 struct page **pages;
272 unsigned long nr_pages;
273 struct list_head list;
274};
275
276struct async_cow {
277 struct inode *inode;
278 struct btrfs_root *root;
279 struct page *locked_page;
280 u64 start;
281 u64 end;
282 struct list_head extents;
283 struct btrfs_work work;
284};
285
286static noinline int add_async_extent(struct async_cow *cow,
287 u64 start, u64 ram_size,
288 u64 compressed_size,
289 struct page **pages,
290 unsigned long nr_pages)
291{
292 struct async_extent *async_extent;
293
294 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
295 async_extent->start = start;
296 async_extent->ram_size = ram_size;
297 async_extent->compressed_size = compressed_size;
298 async_extent->pages = pages;
299 async_extent->nr_pages = nr_pages;
300 list_add_tail(&async_extent->list, &cow->extents);
301 return 0;
302}
303
304/*
305 * we create compressed extents in two phases. The first
306 * phase compresses a range of pages that have already been
307 * locked (both pages and state bits are locked).
308 *
309 * This is done inside an ordered work queue, and the compression
310 * is spread across many cpus. The actual IO submission is step
311 * two, and the ordered work queue takes care of making sure that
312 * happens in the same order things were put onto the queue by
313 * writepages and friends.
314 *
315 * If this code finds it can't get good compression, it puts an
316 * entry onto the work queue to write the uncompressed bytes. This
317 * makes sure that both compressed inodes and uncompressed inodes
318 * are written in the same order that pdflush sent them down.
319 */
320static noinline int compress_file_range(struct inode *inode,
321 struct page *locked_page,
322 u64 start, u64 end,
323 struct async_cow *async_cow,
324 int *num_added)
325{
326 struct btrfs_root *root = BTRFS_I(inode)->root;
327 struct btrfs_trans_handle *trans;
328 u64 num_bytes;
329 u64 orig_start;
330 u64 disk_num_bytes;
331 u64 blocksize = root->sectorsize;
332 u64 actual_end;
333 u64 isize = i_size_read(inode);
334 int ret = 0;
335 struct page **pages = NULL;
336 unsigned long nr_pages;
337 unsigned long nr_pages_ret = 0;
338 unsigned long total_compressed = 0;
339 unsigned long total_in = 0;
340 unsigned long max_compressed = 128 * 1024;
341 unsigned long max_uncompressed = 128 * 1024;
342 int i;
343 int will_compress;
344
345 orig_start = start;
346
347 actual_end = min_t(u64, isize, end + 1);
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 total_compressed = actual_end - start;
354
355 /* we want to make sure that amount of ram required to uncompress
356 * an extent is reasonable, so we limit the total size in ram
357 * of a compressed extent to 128k. This is a crucial number
358 * because it also controls how easily we can spread reads across
359 * cpus for decompression.
360 *
361 * We also want to make sure the amount of IO required to do
362 * a random read is reasonably small, so we limit the size of
363 * a compressed extent to 128k.
364 */
365 total_compressed = min(total_compressed, max_uncompressed);
366 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
367 num_bytes = max(blocksize, num_bytes);
368 disk_num_bytes = num_bytes;
369 total_in = 0;
370 ret = 0;
371
372 /*
373 * we do compression for mount -o compress and when the
374 * inode has not been flagged as nocompress. This flag can
375 * change at any time if we discover bad compression ratios.
376 */
377 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
378 btrfs_test_opt(root, COMPRESS)) {
379 WARN_ON(pages);
380 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
381
382 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
383 total_compressed, pages,
384 nr_pages, &nr_pages_ret,
385 &total_in,
386 &total_compressed,
387 max_compressed);
388
389 if (!ret) {
390 unsigned long offset = total_compressed &
391 (PAGE_CACHE_SIZE - 1);
392 struct page *page = pages[nr_pages_ret - 1];
393 char *kaddr;
394
395 /* zero the tail end of the last page, we might be
396 * sending it down to disk
397 */
398 if (offset) {
399 kaddr = kmap_atomic(page, KM_USER0);
400 memset(kaddr + offset, 0,
401 PAGE_CACHE_SIZE - offset);
402 kunmap_atomic(kaddr, KM_USER0);
403 }
404 will_compress = 1;
405 }
406 }
407 if (start == 0) {
408 trans = btrfs_join_transaction(root, 1);
409 BUG_ON(!trans);
410 btrfs_set_trans_block_group(trans, inode);
411
412 /* lets try to make an inline extent */
413 if (ret || total_in < (actual_end - start)) {
414 /* we didn't compress the entire range, try
415 * to make an uncompressed inline extent.
416 */
417 ret = cow_file_range_inline(trans, root, inode,
418 start, end, 0, NULL);
419 } else {
420 /* try making a compressed inline extent */
421 ret = cow_file_range_inline(trans, root, inode,
422 start, end,
423 total_compressed, pages);
424 }
425 btrfs_end_transaction(trans, root);
426 if (ret == 0) {
427 /*
428 * inline extent creation worked, we don't need
429 * to create any more async work items. Unlock
430 * and free up our temp pages.
431 */
432 extent_clear_unlock_delalloc(inode,
433 &BTRFS_I(inode)->io_tree,
434 start, end, NULL, 1, 0,
435 0, 1, 1, 1);
436 ret = 0;
437 goto free_pages_out;
438 }
439 }
440
441 if (will_compress) {
442 /*
443 * we aren't doing an inline extent round the compressed size
444 * up to a block size boundary so the allocator does sane
445 * things
446 */
447 total_compressed = (total_compressed + blocksize - 1) &
448 ~(blocksize - 1);
449
450 /*
451 * one last check to make sure the compression is really a
452 * win, compare the page count read with the blocks on disk
453 */
454 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
455 ~(PAGE_CACHE_SIZE - 1);
456 if (total_compressed >= total_in) {
457 will_compress = 0;
458 } else {
459 disk_num_bytes = total_compressed;
460 num_bytes = total_in;
461 }
462 }
463 if (!will_compress && pages) {
464 /*
465 * the compression code ran but failed to make things smaller,
466 * free any pages it allocated and our page pointer array
467 */
468 for (i = 0; i < nr_pages_ret; i++) {
469 WARN_ON(pages[i]->mapping);
470 page_cache_release(pages[i]);
471 }
472 kfree(pages);
473 pages = NULL;
474 total_compressed = 0;
475 nr_pages_ret = 0;
476
477 /* flag the file so we don't compress in the future */
478 btrfs_set_flag(inode, NOCOMPRESS);
479 }
480 if (will_compress) {
481 *num_added += 1;
482
483 /* the async work queues will take care of doing actual
484 * allocation on disk for these compressed pages,
485 * and will submit them to the elevator.
486 */
487 add_async_extent(async_cow, start, num_bytes,
488 total_compressed, pages, nr_pages_ret);
489
490 if (start + num_bytes < end && start + num_bytes < actual_end) {
491 start += num_bytes;
492 pages = NULL;
493 cond_resched();
494 goto again;
495 }
496 } else {
497 /*
498 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked
500 * page if it corresponds to our extent and set things up
501 * for the async work queue to run cow_file_range to do
502 * the normal delalloc dance
503 */
504 if (page_offset(locked_page) >= start &&
505 page_offset(locked_page) <= end) {
506 __set_page_dirty_nobuffers(locked_page);
507 /* unlocked later on in the async handlers */
508 }
509 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
510 *num_added += 1;
511 }
512
513out:
514 return 0;
515
516free_pages_out:
517 for (i = 0; i < nr_pages_ret; i++) {
518 WARN_ON(pages[i]->mapping);
519 page_cache_release(pages[i]);
520 }
521 kfree(pages);
522
523 goto out;
524}
525
526/*
527 * phase two of compressed writeback. This is the ordered portion
528 * of the code, which only gets called in the order the work was
529 * queued. We walk all the async extents created by compress_file_range
530 * and send them down to the disk.
531 */
532static noinline int submit_compressed_extents(struct inode *inode,
533 struct async_cow *async_cow)
534{
535 struct async_extent *async_extent;
536 u64 alloc_hint = 0;
537 struct btrfs_trans_handle *trans;
538 struct btrfs_key ins;
539 struct extent_map *em;
540 struct btrfs_root *root = BTRFS_I(inode)->root;
541 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 struct extent_io_tree *io_tree;
543 int ret;
544
545 if (list_empty(&async_cow->extents))
546 return 0;
547
548 trans = btrfs_join_transaction(root, 1);
549
550 while (!list_empty(&async_cow->extents)) {
551 async_extent = list_entry(async_cow->extents.next,
552 struct async_extent, list);
553 list_del(&async_extent->list);
554
555 io_tree = &BTRFS_I(inode)->io_tree;
556
557 /* did the compression code fall back to uncompressed IO? */
558 if (!async_extent->pages) {
559 int page_started = 0;
560 unsigned long nr_written = 0;
561
562 lock_extent(io_tree, async_extent->start,
563 async_extent->start +
564 async_extent->ram_size - 1, GFP_NOFS);
565
566 /* allocate blocks */
567 cow_file_range(inode, async_cow->locked_page,
568 async_extent->start,
569 async_extent->start +
570 async_extent->ram_size - 1,
571 &page_started, &nr_written, 0);
572
573 /*
574 * if page_started, cow_file_range inserted an
575 * inline extent and took care of all the unlocking
576 * and IO for us. Otherwise, we need to submit
577 * all those pages down to the drive.
578 */
579 if (!page_started)
580 extent_write_locked_range(io_tree,
581 inode, async_extent->start,
582 async_extent->start +
583 async_extent->ram_size - 1,
584 btrfs_get_extent,
585 WB_SYNC_ALL);
586 kfree(async_extent);
587 cond_resched();
588 continue;
589 }
590
591 lock_extent(io_tree, async_extent->start,
592 async_extent->start + async_extent->ram_size - 1,
593 GFP_NOFS);
594 /*
595 * here we're doing allocation and writeback of the
596 * compressed pages
597 */
598 btrfs_drop_extent_cache(inode, async_extent->start,
599 async_extent->start +
600 async_extent->ram_size - 1, 0);
601
602 ret = btrfs_reserve_extent(trans, root,
603 async_extent->compressed_size,
604 async_extent->compressed_size,
605 0, alloc_hint,
606 (u64)-1, &ins, 1);
607 BUG_ON(ret);
608 em = alloc_extent_map(GFP_NOFS);
609 em->start = async_extent->start;
610 em->len = async_extent->ram_size;
611 em->orig_start = em->start;
612
613 em->block_start = ins.objectid;
614 em->block_len = ins.offset;
615 em->bdev = root->fs_info->fs_devices->latest_bdev;
616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
617 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
618
619 while (1) {
620 spin_lock(&em_tree->lock);
621 ret = add_extent_mapping(em_tree, em);
622 spin_unlock(&em_tree->lock);
623 if (ret != -EEXIST) {
624 free_extent_map(em);
625 break;
626 }
627 btrfs_drop_extent_cache(inode, async_extent->start,
628 async_extent->start +
629 async_extent->ram_size - 1, 0);
630 }
631
632 ret = btrfs_add_ordered_extent(inode, async_extent->start,
633 ins.objectid,
634 async_extent->ram_size,
635 ins.offset,
636 BTRFS_ORDERED_COMPRESSED);
637 BUG_ON(ret);
638
639 btrfs_end_transaction(trans, root);
640
641 /*
642 * clear dirty, set writeback and unlock the pages.
643 */
644 extent_clear_unlock_delalloc(inode,
645 &BTRFS_I(inode)->io_tree,
646 async_extent->start,
647 async_extent->start +
648 async_extent->ram_size - 1,
649 NULL, 1, 1, 0, 1, 1, 0);
650
651 ret = btrfs_submit_compressed_write(inode,
652 async_extent->start,
653 async_extent->ram_size,
654 ins.objectid,
655 ins.offset, async_extent->pages,
656 async_extent->nr_pages);
657
658 BUG_ON(ret);
659 trans = btrfs_join_transaction(root, 1);
660 alloc_hint = ins.objectid + ins.offset;
661 kfree(async_extent);
662 cond_resched();
663 }
664
665 btrfs_end_transaction(trans, root);
666 return 0;
667}
668
669/*
670 * when extent_io.c finds a delayed allocation range in the file,
671 * the call backs end up in this code. The basic idea is to
672 * allocate extents on disk for the range, and create ordered data structs
673 * in ram to track those extents.
674 *
675 * locked_page is the page that writepage had locked already. We use
676 * it to make sure we don't do extra locks or unlocks.
677 *
678 * *page_started is set to one if we unlock locked_page and do everything
679 * required to start IO on it. It may be clean and already done with
680 * IO when we return.
681 */
682static noinline int cow_file_range(struct inode *inode,
683 struct page *locked_page,
684 u64 start, u64 end, int *page_started,
685 unsigned long *nr_written,
686 int unlock)
687{
688 struct btrfs_root *root = BTRFS_I(inode)->root;
689 struct btrfs_trans_handle *trans;
690 u64 alloc_hint = 0;
691 u64 num_bytes;
692 unsigned long ram_size;
693 u64 disk_num_bytes;
694 u64 cur_alloc_size;
695 u64 blocksize = root->sectorsize;
696 u64 actual_end;
697 u64 isize = i_size_read(inode);
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
706
707 actual_end = min_t(u64, isize, end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while (disk_num_bytes > 0) {
737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
739 root->sectorsize, 0, alloc_hint,
740 (u64)-1, &ins, 1);
741 BUG_ON(ret);
742
743 em = alloc_extent_map(GFP_NOFS);
744 em->start = start;
745 em->orig_start = em->start;
746
747 ram_size = ins.offset;
748 em->len = ins.offset;
749
750 em->block_start = ins.objectid;
751 em->block_len = ins.offset;
752 em->bdev = root->fs_info->fs_devices->latest_bdev;
753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
754
755 while (1) {
756 spin_lock(&em_tree->lock);
757 ret = add_extent_mapping(em_tree, em);
758 spin_unlock(&em_tree->lock);
759 if (ret != -EEXIST) {
760 free_extent_map(em);
761 break;
762 }
763 btrfs_drop_extent_cache(inode, start,
764 start + ram_size - 1, 0);
765 }
766
767 cur_alloc_size = ins.offset;
768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
769 ram_size, cur_alloc_size, 0);
770 BUG_ON(ret);
771
772 if (root->root_key.objectid ==
773 BTRFS_DATA_RELOC_TREE_OBJECTID) {
774 ret = btrfs_reloc_clone_csums(inode, start,
775 cur_alloc_size);
776 BUG_ON(ret);
777 }
778
779 if (disk_num_bytes < cur_alloc_size)
780 break;
781
782 /* we're not doing compressed IO, don't unlock the first
783 * page (which the caller expects to stay locked), don't
784 * clear any dirty bits and don't set any writeback bits
785 */
786 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
787 start, start + ram_size - 1,
788 locked_page, unlock, 1,
789 1, 0, 0, 0);
790 disk_num_bytes -= cur_alloc_size;
791 num_bytes -= cur_alloc_size;
792 alloc_hint = ins.objectid + ins.offset;
793 start += cur_alloc_size;
794 }
795out:
796 ret = 0;
797 btrfs_end_transaction(trans, root);
798
799 return ret;
800}
801
802/*
803 * work queue call back to started compression on a file and pages
804 */
805static noinline void async_cow_start(struct btrfs_work *work)
806{
807 struct async_cow *async_cow;
808 int num_added = 0;
809 async_cow = container_of(work, struct async_cow, work);
810
811 compress_file_range(async_cow->inode, async_cow->locked_page,
812 async_cow->start, async_cow->end, async_cow,
813 &num_added);
814 if (num_added == 0)
815 async_cow->inode = NULL;
816}
817
818/*
819 * work queue call back to submit previously compressed pages
820 */
821static noinline void async_cow_submit(struct btrfs_work *work)
822{
823 struct async_cow *async_cow;
824 struct btrfs_root *root;
825 unsigned long nr_pages;
826
827 async_cow = container_of(work, struct async_cow, work);
828
829 root = async_cow->root;
830 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
831 PAGE_CACHE_SHIFT;
832
833 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
834
835 if (atomic_read(&root->fs_info->async_delalloc_pages) <
836 5 * 1042 * 1024 &&
837 waitqueue_active(&root->fs_info->async_submit_wait))
838 wake_up(&root->fs_info->async_submit_wait);
839
840 if (async_cow->inode)
841 submit_compressed_extents(async_cow->inode, async_cow);
842}
843
844static noinline void async_cow_free(struct btrfs_work *work)
845{
846 struct async_cow *async_cow;
847 async_cow = container_of(work, struct async_cow, work);
848 kfree(async_cow);
849}
850
851static int cow_file_range_async(struct inode *inode, struct page *locked_page,
852 u64 start, u64 end, int *page_started,
853 unsigned long *nr_written)
854{
855 struct async_cow *async_cow;
856 struct btrfs_root *root = BTRFS_I(inode)->root;
857 unsigned long nr_pages;
858 u64 cur_end;
859 int limit = 10 * 1024 * 1042;
860
861 if (!btrfs_test_opt(root, COMPRESS)) {
862 return cow_file_range(inode, locked_page, start, end,
863 page_started, nr_written, 1);
864 }
865
866 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
867 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
868 while (start < end) {
869 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
870 async_cow->inode = inode;
871 async_cow->root = root;
872 async_cow->locked_page = locked_page;
873 async_cow->start = start;
874
875 if (btrfs_test_flag(inode, NOCOMPRESS))
876 cur_end = end;
877 else
878 cur_end = min(end, start + 512 * 1024 - 1);
879
880 async_cow->end = cur_end;
881 INIT_LIST_HEAD(&async_cow->extents);
882
883 async_cow->work.func = async_cow_start;
884 async_cow->work.ordered_func = async_cow_submit;
885 async_cow->work.ordered_free = async_cow_free;
886 async_cow->work.flags = 0;
887
888 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
889 PAGE_CACHE_SHIFT;
890 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
891
892 btrfs_queue_worker(&root->fs_info->delalloc_workers,
893 &async_cow->work);
894
895 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
896 wait_event(root->fs_info->async_submit_wait,
897 (atomic_read(&root->fs_info->async_delalloc_pages) <
898 limit));
899 }
900
901 while (atomic_read(&root->fs_info->async_submit_draining) &&
902 atomic_read(&root->fs_info->async_delalloc_pages)) {
903 wait_event(root->fs_info->async_submit_wait,
904 (atomic_read(&root->fs_info->async_delalloc_pages) ==
905 0));
906 }
907
908 *nr_written += nr_pages;
909 start = cur_end + 1;
910 }
911 *page_started = 1;
912 return 0;
913}
914
915static noinline int csum_exist_in_range(struct btrfs_root *root,
916 u64 bytenr, u64 num_bytes)
917{
918 int ret;
919 struct btrfs_ordered_sum *sums;
920 LIST_HEAD(list);
921
922 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
923 bytenr + num_bytes - 1, &list);
924 if (ret == 0 && list_empty(&list))
925 return 0;
926
927 while (!list_empty(&list)) {
928 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
929 list_del(&sums->list);
930 kfree(sums);
931 }
932 return 1;
933}
934
935/*
936 * when nowcow writeback call back. This checks for snapshots or COW copies
937 * of the extents that exist in the file, and COWs the file as required.
938 *
939 * If no cow copies or snapshots exist, we write directly to the existing
940 * blocks on disk
941 */
942static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
943 u64 start, u64 end, int *page_started, int force,
944 unsigned long *nr_written)
945{
946 struct btrfs_root *root = BTRFS_I(inode)->root;
947 struct btrfs_trans_handle *trans;
948 struct extent_buffer *leaf;
949 struct btrfs_path *path;
950 struct btrfs_file_extent_item *fi;
951 struct btrfs_key found_key;
952 u64 cow_start;
953 u64 cur_offset;
954 u64 extent_end;
955 u64 disk_bytenr;
956 u64 num_bytes;
957 int extent_type;
958 int ret;
959 int type;
960 int nocow;
961 int check_prev = 1;
962
963 path = btrfs_alloc_path();
964 BUG_ON(!path);
965 trans = btrfs_join_transaction(root, 1);
966 BUG_ON(!trans);
967
968 cow_start = (u64)-1;
969 cur_offset = start;
970 while (1) {
971 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
972 cur_offset, 0);
973 BUG_ON(ret < 0);
974 if (ret > 0 && path->slots[0] > 0 && check_prev) {
975 leaf = path->nodes[0];
976 btrfs_item_key_to_cpu(leaf, &found_key,
977 path->slots[0] - 1);
978 if (found_key.objectid == inode->i_ino &&
979 found_key.type == BTRFS_EXTENT_DATA_KEY)
980 path->slots[0]--;
981 }
982 check_prev = 0;
983next_slot:
984 leaf = path->nodes[0];
985 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
986 ret = btrfs_next_leaf(root, path);
987 if (ret < 0)
988 BUG_ON(1);
989 if (ret > 0)
990 break;
991 leaf = path->nodes[0];
992 }
993
994 nocow = 0;
995 disk_bytenr = 0;
996 num_bytes = 0;
997 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
998
999 if (found_key.objectid > inode->i_ino ||
1000 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1001 found_key.offset > end)
1002 break;
1003
1004 if (found_key.offset > cur_offset) {
1005 extent_end = found_key.offset;
1006 goto out_check;
1007 }
1008
1009 fi = btrfs_item_ptr(leaf, path->slots[0],
1010 struct btrfs_file_extent_item);
1011 extent_type = btrfs_file_extent_type(leaf, fi);
1012
1013 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1014 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1015 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1016 extent_end = found_key.offset +
1017 btrfs_file_extent_num_bytes(leaf, fi);
1018 if (extent_end <= start) {
1019 path->slots[0]++;
1020 goto next_slot;
1021 }
1022 if (disk_bytenr == 0)
1023 goto out_check;
1024 if (btrfs_file_extent_compression(leaf, fi) ||
1025 btrfs_file_extent_encryption(leaf, fi) ||
1026 btrfs_file_extent_other_encoding(leaf, fi))
1027 goto out_check;
1028 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1029 goto out_check;
1030 if (btrfs_extent_readonly(root, disk_bytenr))
1031 goto out_check;
1032 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1033 disk_bytenr))
1034 goto out_check;
1035 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1036 disk_bytenr += cur_offset - found_key.offset;
1037 num_bytes = min(end + 1, extent_end) - cur_offset;
1038 /*
1039 * force cow if csum exists in the range.
1040 * this ensure that csum for a given extent are
1041 * either valid or do not exist.
1042 */
1043 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1044 goto out_check;
1045 nocow = 1;
1046 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1047 extent_end = found_key.offset +
1048 btrfs_file_extent_inline_len(leaf, fi);
1049 extent_end = ALIGN(extent_end, root->sectorsize);
1050 } else {
1051 BUG_ON(1);
1052 }
1053out_check:
1054 if (extent_end <= start) {
1055 path->slots[0]++;
1056 goto next_slot;
1057 }
1058 if (!nocow) {
1059 if (cow_start == (u64)-1)
1060 cow_start = cur_offset;
1061 cur_offset = extent_end;
1062 if (cur_offset > end)
1063 break;
1064 path->slots[0]++;
1065 goto next_slot;
1066 }
1067
1068 btrfs_release_path(root, path);
1069 if (cow_start != (u64)-1) {
1070 ret = cow_file_range(inode, locked_page, cow_start,
1071 found_key.offset - 1, page_started,
1072 nr_written, 1);
1073 BUG_ON(ret);
1074 cow_start = (u64)-1;
1075 }
1076
1077 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1078 struct extent_map *em;
1079 struct extent_map_tree *em_tree;
1080 em_tree = &BTRFS_I(inode)->extent_tree;
1081 em = alloc_extent_map(GFP_NOFS);
1082 em->start = cur_offset;
1083 em->orig_start = em->start;
1084 em->len = num_bytes;
1085 em->block_len = num_bytes;
1086 em->block_start = disk_bytenr;
1087 em->bdev = root->fs_info->fs_devices->latest_bdev;
1088 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1089 while (1) {
1090 spin_lock(&em_tree->lock);
1091 ret = add_extent_mapping(em_tree, em);
1092 spin_unlock(&em_tree->lock);
1093 if (ret != -EEXIST) {
1094 free_extent_map(em);
1095 break;
1096 }
1097 btrfs_drop_extent_cache(inode, em->start,
1098 em->start + em->len - 1, 0);
1099 }
1100 type = BTRFS_ORDERED_PREALLOC;
1101 } else {
1102 type = BTRFS_ORDERED_NOCOW;
1103 }
1104
1105 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1106 num_bytes, num_bytes, type);
1107 BUG_ON(ret);
1108
1109 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1110 cur_offset, cur_offset + num_bytes - 1,
1111 locked_page, 1, 1, 1, 0, 0, 0);
1112 cur_offset = extent_end;
1113 if (cur_offset > end)
1114 break;
1115 }
1116 btrfs_release_path(root, path);
1117
1118 if (cur_offset <= end && cow_start == (u64)-1)
1119 cow_start = cur_offset;
1120 if (cow_start != (u64)-1) {
1121 ret = cow_file_range(inode, locked_page, cow_start, end,
1122 page_started, nr_written, 1);
1123 BUG_ON(ret);
1124 }
1125
1126 ret = btrfs_end_transaction(trans, root);
1127 BUG_ON(ret);
1128 btrfs_free_path(path);
1129 return 0;
1130}
1131
1132/*
1133 * extent_io.c call back to do delayed allocation processing
1134 */
1135static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1136 u64 start, u64 end, int *page_started,
1137 unsigned long *nr_written)
1138{
1139 int ret;
1140
1141 if (btrfs_test_flag(inode, NODATACOW))
1142 ret = run_delalloc_nocow(inode, locked_page, start, end,
1143 page_started, 1, nr_written);
1144 else if (btrfs_test_flag(inode, PREALLOC))
1145 ret = run_delalloc_nocow(inode, locked_page, start, end,
1146 page_started, 0, nr_written);
1147 else
1148 ret = cow_file_range_async(inode, locked_page, start, end,
1149 page_started, nr_written);
1150
1151 return ret;
1152}
1153
1154/*
1155 * extent_io.c set_bit_hook, used to track delayed allocation
1156 * bytes in this file, and to maintain the list of inodes that
1157 * have pending delalloc work to be done.
1158 */
1159static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1160 unsigned long old, unsigned long bits)
1161{
1162 /*
1163 * set_bit and clear bit hooks normally require _irqsave/restore
1164 * but in this case, we are only testeing for the DELALLOC
1165 * bit, which is only set or cleared with irqs on
1166 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root;
1169 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1;
1172 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1174 &root->fs_info->delalloc_inodes);
1175 }
1176 spin_unlock(&root->fs_info->delalloc_lock);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * extent_io.c clear_bit_hook, see set_bit_hook for why
1183 */
1184static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1185 unsigned long old, unsigned long bits)
1186{
1187 /*
1188 * set_bit and clear bit hooks normally require _irqsave/restore
1189 * but in this case, we are only testeing for the DELALLOC
1190 * bit, which is only set or cleared with irqs on
1191 */
1192 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1193 struct btrfs_root *root = BTRFS_I(inode)->root;
1194
1195 spin_lock(&root->fs_info->delalloc_lock);
1196 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1197 printk(KERN_INFO "btrfs warning: delalloc account "
1198 "%llu %llu\n",
1199 (unsigned long long)end - start + 1,
1200 (unsigned long long)
1201 root->fs_info->delalloc_bytes);
1202 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else {
1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1210 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1211 }
1212 spin_unlock(&root->fs_info->delalloc_lock);
1213 }
1214 return 0;
1215}
1216
1217/*
1218 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1219 * we don't create bios that span stripes or chunks
1220 */
1221int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1222 size_t size, struct bio *bio,
1223 unsigned long bio_flags)
1224{
1225 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1226 struct btrfs_mapping_tree *map_tree;
1227 u64 logical = (u64)bio->bi_sector << 9;
1228 u64 length = 0;
1229 u64 map_length;
1230 int ret;
1231
1232 if (bio_flags & EXTENT_BIO_COMPRESSED)
1233 return 0;
1234
1235 length = bio->bi_size;
1236 map_tree = &root->fs_info->mapping_tree;
1237 map_length = length;
1238 ret = btrfs_map_block(map_tree, READ, logical,
1239 &map_length, NULL, 0);
1240
1241 if (map_length < length + size)
1242 return 1;
1243 return 0;
1244}
1245
1246/*
1247 * in order to insert checksums into the metadata in large chunks,
1248 * we wait until bio submission time. All the pages in the bio are
1249 * checksummed and sums are attached onto the ordered extent record.
1250 *
1251 * At IO completion time the cums attached on the ordered extent record
1252 * are inserted into the btree
1253 */
1254static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1255 struct bio *bio, int mirror_num,
1256 unsigned long bio_flags)
1257{
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 int ret = 0;
1260
1261 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1262 BUG_ON(ret);
1263 return 0;
1264}
1265
1266/*
1267 * in order to insert checksums into the metadata in large chunks,
1268 * we wait until bio submission time. All the pages in the bio are
1269 * checksummed and sums are attached onto the ordered extent record.
1270 *
1271 * At IO completion time the cums attached on the ordered extent record
1272 * are inserted into the btree
1273 */
1274static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1275 int mirror_num, unsigned long bio_flags)
1276{
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1279}
1280
1281/*
1282 * extent_io.c submission hook. This does the right thing for csum calculation
1283 * on write, or reading the csums from the tree before a read
1284 */
1285static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1286 int mirror_num, unsigned long bio_flags)
1287{
1288 struct btrfs_root *root = BTRFS_I(inode)->root;
1289 int ret = 0;
1290 int skip_sum;
1291
1292 skip_sum = btrfs_test_flag(inode, NODATASUM);
1293
1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1295 BUG_ON(ret);
1296
1297 if (!(rw & (1 << BIO_RW))) {
1298 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1299 return btrfs_submit_compressed_read(inode, bio,
1300 mirror_num, bio_flags);
1301 } else if (!skip_sum)
1302 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1303 goto mapit;
1304 } else if (!skip_sum) {
1305 /* csum items have already been cloned */
1306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1307 goto mapit;
1308 /* we're doing a write, do the async checksumming */
1309 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1310 inode, rw, bio, mirror_num,
1311 bio_flags, __btrfs_submit_bio_start,
1312 __btrfs_submit_bio_done);
1313 }
1314
1315mapit:
1316 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1317}
1318
1319/*
1320 * given a list of ordered sums record them in the inode. This happens
1321 * at IO completion time based on sums calculated at bio submission time.
1322 */
1323static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list)
1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum;
1329
1330 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) {
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1333 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 }
1336 return 0;
1337}
1338
1339int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1340{
1341 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1342 WARN_ON(1);
1343 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1344 GFP_NOFS);
1345}
1346
1347/* see btrfs_writepage_start_hook for details on why this is required */
1348struct btrfs_writepage_fixup {
1349 struct page *page;
1350 struct btrfs_work work;
1351};
1352
1353static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1354{
1355 struct btrfs_writepage_fixup *fixup;
1356 struct btrfs_ordered_extent *ordered;
1357 struct page *page;
1358 struct inode *inode;
1359 u64 page_start;
1360 u64 page_end;
1361
1362 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1363 page = fixup->page;
1364again:
1365 lock_page(page);
1366 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1367 ClearPageChecked(page);
1368 goto out_page;
1369 }
1370
1371 inode = page->mapping->host;
1372 page_start = page_offset(page);
1373 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1374
1375 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1376
1377 /* already ordered? We're done */
1378 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379 EXTENT_ORDERED, 0)) {
1380 goto out;
1381 }
1382
1383 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1384 if (ordered) {
1385 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1386 page_end, GFP_NOFS);
1387 unlock_page(page);
1388 btrfs_start_ordered_extent(inode, ordered, 1);
1389 goto again;
1390 }
1391
1392 btrfs_set_extent_delalloc(inode, page_start, page_end);
1393 ClearPageChecked(page);
1394out:
1395 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1396out_page:
1397 unlock_page(page);
1398 page_cache_release(page);
1399}
1400
1401/*
1402 * There are a few paths in the higher layers of the kernel that directly
1403 * set the page dirty bit without asking the filesystem if it is a
1404 * good idea. This causes problems because we want to make sure COW
1405 * properly happens and the data=ordered rules are followed.
1406 *
1407 * In our case any range that doesn't have the ORDERED bit set
1408 * hasn't been properly setup for IO. We kick off an async process
1409 * to fix it up. The async helper will wait for ordered extents, set
1410 * the delalloc bit and make it safe to write the page.
1411 */
1412static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413{
1414 struct inode *inode = page->mapping->host;
1415 struct btrfs_writepage_fixup *fixup;
1416 struct btrfs_root *root = BTRFS_I(inode)->root;
1417 int ret;
1418
1419 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420 EXTENT_ORDERED, 0);
1421 if (ret)
1422 return 0;
1423
1424 if (PageChecked(page))
1425 return -EAGAIN;
1426
1427 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1428 if (!fixup)
1429 return -EAGAIN;
1430
1431 SetPageChecked(page);
1432 page_cache_get(page);
1433 fixup->work.func = btrfs_writepage_fixup_worker;
1434 fixup->page = page;
1435 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1436 return -EAGAIN;
1437}
1438
1439static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1440 struct inode *inode, u64 file_pos,
1441 u64 disk_bytenr, u64 disk_num_bytes,
1442 u64 num_bytes, u64 ram_bytes,
1443 u8 compression, u8 encryption,
1444 u16 other_encoding, int extent_type)
1445{
1446 struct btrfs_root *root = BTRFS_I(inode)->root;
1447 struct btrfs_file_extent_item *fi;
1448 struct btrfs_path *path;
1449 struct extent_buffer *leaf;
1450 struct btrfs_key ins;
1451 u64 hint;
1452 int ret;
1453
1454 path = btrfs_alloc_path();
1455 BUG_ON(!path);
1456
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint);
1459 BUG_ON(ret);
1460
1461 ins.objectid = inode->i_ino;
1462 ins.offset = file_pos;
1463 ins.type = BTRFS_EXTENT_DATA_KEY;
1464 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1465 BUG_ON(ret);
1466 leaf = path->nodes[0];
1467 fi = btrfs_item_ptr(leaf, path->slots[0],
1468 struct btrfs_file_extent_item);
1469 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1470 btrfs_set_file_extent_type(leaf, fi, extent_type);
1471 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1472 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1473 btrfs_set_file_extent_offset(leaf, fi, 0);
1474 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1475 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479 btrfs_mark_buffer_dirty(leaf);
1480
1481 inode_add_bytes(inode, num_bytes);
1482 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1483
1484 ins.objectid = disk_bytenr;
1485 ins.offset = disk_num_bytes;
1486 ins.type = BTRFS_EXTENT_ITEM_KEY;
1487 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1488 root->root_key.objectid,
1489 trans->transid, inode->i_ino, &ins);
1490 BUG_ON(ret);
1491
1492 btrfs_free_path(path);
1493 return 0;
1494}
1495
1496/* as ordered data IO finishes, this gets called so we can finish
1497 * an ordered extent if the range of bytes in the file it covers are
1498 * fully written.
1499 */
1500static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1501{
1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 struct btrfs_trans_handle *trans;
1504 struct btrfs_ordered_extent *ordered_extent;
1505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1506 int compressed = 0;
1507 int ret;
1508
1509 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1510 if (!ret)
1511 return 0;
1512
1513 trans = btrfs_join_transaction(root, 1);
1514
1515 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1516 BUG_ON(!ordered_extent);
1517 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1518 goto nocow;
1519
1520 lock_extent(io_tree, ordered_extent->file_offset,
1521 ordered_extent->file_offset + ordered_extent->len - 1,
1522 GFP_NOFS);
1523
1524 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1525 compressed = 1;
1526 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1527 BUG_ON(compressed);
1528 ret = btrfs_mark_extent_written(trans, root, inode,
1529 ordered_extent->file_offset,
1530 ordered_extent->file_offset +
1531 ordered_extent->len);
1532 BUG_ON(ret);
1533 } else {
1534 ret = insert_reserved_file_extent(trans, inode,
1535 ordered_extent->file_offset,
1536 ordered_extent->start,
1537 ordered_extent->disk_len,
1538 ordered_extent->len,
1539 ordered_extent->len,
1540 compressed, 0, 0,
1541 BTRFS_FILE_EXTENT_REG);
1542 BUG_ON(ret);
1543 }
1544 unlock_extent(io_tree, ordered_extent->file_offset,
1545 ordered_extent->file_offset + ordered_extent->len - 1,
1546 GFP_NOFS);
1547nocow:
1548 add_pending_csums(trans, inode, ordered_extent->file_offset,
1549 &ordered_extent->list);
1550
1551 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1552 btrfs_ordered_update_i_size(inode, ordered_extent);
1553 btrfs_update_inode(trans, root, inode);
1554 btrfs_remove_ordered_extent(inode, ordered_extent);
1555 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1556
1557 /* once for us */
1558 btrfs_put_ordered_extent(ordered_extent);
1559 /* once for the tree */
1560 btrfs_put_ordered_extent(ordered_extent);
1561
1562 btrfs_end_transaction(trans, root);
1563 return 0;
1564}
1565
1566static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1567 struct extent_state *state, int uptodate)
1568{
1569 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1570}
1571
1572/*
1573 * When IO fails, either with EIO or csum verification fails, we
1574 * try other mirrors that might have a good copy of the data. This
1575 * io_failure_record is used to record state as we go through all the
1576 * mirrors. If another mirror has good data, the page is set up to date
1577 * and things continue. If a good mirror can't be found, the original
1578 * bio end_io callback is called to indicate things have failed.
1579 */
1580struct io_failure_record {
1581 struct page *page;
1582 u64 start;
1583 u64 len;
1584 u64 logical;
1585 unsigned long bio_flags;
1586 int last_mirror;
1587};
1588
1589static int btrfs_io_failed_hook(struct bio *failed_bio,
1590 struct page *page, u64 start, u64 end,
1591 struct extent_state *state)
1592{
1593 struct io_failure_record *failrec = NULL;
1594 u64 private;
1595 struct extent_map *em;
1596 struct inode *inode = page->mapping->host;
1597 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1599 struct bio *bio;
1600 int num_copies;
1601 int ret;
1602 int rw;
1603 u64 logical;
1604
1605 ret = get_state_private(failure_tree, start, &private);
1606 if (ret) {
1607 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1608 if (!failrec)
1609 return -ENOMEM;
1610 failrec->start = start;
1611 failrec->len = end - start + 1;
1612 failrec->last_mirror = 0;
1613 failrec->bio_flags = 0;
1614
1615 spin_lock(&em_tree->lock);
1616 em = lookup_extent_mapping(em_tree, start, failrec->len);
1617 if (em->start > start || em->start + em->len < start) {
1618 free_extent_map(em);
1619 em = NULL;
1620 }
1621 spin_unlock(&em_tree->lock);
1622
1623 if (!em || IS_ERR(em)) {
1624 kfree(failrec);
1625 return -EIO;
1626 }
1627 logical = start - em->start;
1628 logical = em->block_start + logical;
1629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1630 logical = em->block_start;
1631 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1632 }
1633 failrec->logical = logical;
1634 free_extent_map(em);
1635 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1636 EXTENT_DIRTY, GFP_NOFS);
1637 set_state_private(failure_tree, start,
1638 (u64)(unsigned long)failrec);
1639 } else {
1640 failrec = (struct io_failure_record *)(unsigned long)private;
1641 }
1642 num_copies = btrfs_num_copies(
1643 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1644 failrec->logical, failrec->len);
1645 failrec->last_mirror++;
1646 if (!state) {
1647 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1648 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1649 failrec->start,
1650 EXTENT_LOCKED);
1651 if (state && state->start != failrec->start)
1652 state = NULL;
1653 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1654 }
1655 if (!state || failrec->last_mirror > num_copies) {
1656 set_state_private(failure_tree, failrec->start, 0);
1657 clear_extent_bits(failure_tree, failrec->start,
1658 failrec->start + failrec->len - 1,
1659 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1660 kfree(failrec);
1661 return -EIO;
1662 }
1663 bio = bio_alloc(GFP_NOFS, 1);
1664 bio->bi_private = state;
1665 bio->bi_end_io = failed_bio->bi_end_io;
1666 bio->bi_sector = failrec->logical >> 9;
1667 bio->bi_bdev = failed_bio->bi_bdev;
1668 bio->bi_size = 0;
1669
1670 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1671 if (failed_bio->bi_rw & (1 << BIO_RW))
1672 rw = WRITE;
1673 else
1674 rw = READ;
1675
1676 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1677 failrec->last_mirror,
1678 failrec->bio_flags);
1679 return 0;
1680}
1681
1682/*
1683 * each time an IO finishes, we do a fast check in the IO failure tree
1684 * to see if we need to process or clean up an io_failure_record
1685 */
1686static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1687{
1688 u64 private;
1689 u64 private_failure;
1690 struct io_failure_record *failure;
1691 int ret;
1692
1693 private = 0;
1694 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1695 (u64)-1, 1, EXTENT_DIRTY)) {
1696 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1697 start, &private_failure);
1698 if (ret == 0) {
1699 failure = (struct io_failure_record *)(unsigned long)
1700 private_failure;
1701 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1702 failure->start, 0);
1703 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1704 failure->start,
1705 failure->start + failure->len - 1,
1706 EXTENT_DIRTY | EXTENT_LOCKED,
1707 GFP_NOFS);
1708 kfree(failure);
1709 }
1710 }
1711 return 0;
1712}
1713
1714/*
1715 * when reads are done, we need to check csums to verify the data is correct
1716 * if there's a match, we allow the bio to finish. If not, we go through
1717 * the io_failure_record routines to find good copies
1718 */
1719static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1720 struct extent_state *state)
1721{
1722 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1723 struct inode *inode = page->mapping->host;
1724 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1725 char *kaddr;
1726 u64 private = ~(u32)0;
1727 int ret;
1728 struct btrfs_root *root = BTRFS_I(inode)->root;
1729 u32 csum = ~(u32)0;
1730
1731 if (PageChecked(page)) {
1732 ClearPageChecked(page);
1733 goto good;
1734 }
1735 if (btrfs_test_flag(inode, NODATASUM))
1736 return 0;
1737
1738 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1739 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1740 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1741 GFP_NOFS);
1742 return 0;
1743 }
1744
1745 if (state && state->start == start) {
1746 private = state->private;
1747 ret = 0;
1748 } else {
1749 ret = get_state_private(io_tree, start, &private);
1750 }
1751 kaddr = kmap_atomic(page, KM_USER0);
1752 if (ret)
1753 goto zeroit;
1754
1755 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1756 btrfs_csum_final(csum, (char *)&csum);
1757 if (csum != private)
1758 goto zeroit;
1759
1760 kunmap_atomic(kaddr, KM_USER0);
1761good:
1762 /* if the io failure tree for this inode is non-empty,
1763 * check to see if we've recovered from a failed IO
1764 */
1765 btrfs_clean_io_failures(inode, start);
1766 return 0;
1767
1768zeroit:
1769 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)start, csum,
1772 (unsigned long long)private);
1773 memset(kaddr + offset, 1, end - start + 1);
1774 flush_dcache_page(page);
1775 kunmap_atomic(kaddr, KM_USER0);
1776 if (private == 0)
1777 return 0;
1778 return -EIO;
1779}
1780
1781/*
1782 * This creates an orphan entry for the given inode in case something goes
1783 * wrong in the middle of an unlink/truncate.
1784 */
1785int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1786{
1787 struct btrfs_root *root = BTRFS_I(inode)->root;
1788 int ret = 0;
1789
1790 spin_lock(&root->list_lock);
1791
1792 /* already on the orphan list, we're good */
1793 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1794 spin_unlock(&root->list_lock);
1795 return 0;
1796 }
1797
1798 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1799
1800 spin_unlock(&root->list_lock);
1801
1802 /*
1803 * insert an orphan item to track this unlinked/truncated file
1804 */
1805 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1806
1807 return ret;
1808}
1809
1810/*
1811 * We have done the truncate/delete so we can go ahead and remove the orphan
1812 * item for this particular inode.
1813 */
1814int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1815{
1816 struct btrfs_root *root = BTRFS_I(inode)->root;
1817 int ret = 0;
1818
1819 spin_lock(&root->list_lock);
1820
1821 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1822 spin_unlock(&root->list_lock);
1823 return 0;
1824 }
1825
1826 list_del_init(&BTRFS_I(inode)->i_orphan);
1827 if (!trans) {
1828 spin_unlock(&root->list_lock);
1829 return 0;
1830 }
1831
1832 spin_unlock(&root->list_lock);
1833
1834 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1835
1836 return ret;
1837}
1838
1839/*
1840 * this cleans up any orphans that may be left on the list from the last use
1841 * of this root.
1842 */
1843void btrfs_orphan_cleanup(struct btrfs_root *root)
1844{
1845 struct btrfs_path *path;
1846 struct extent_buffer *leaf;
1847 struct btrfs_item *item;
1848 struct btrfs_key key, found_key;
1849 struct btrfs_trans_handle *trans;
1850 struct inode *inode;
1851 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1852
1853 path = btrfs_alloc_path();
1854 if (!path)
1855 return;
1856 path->reada = -1;
1857
1858 key.objectid = BTRFS_ORPHAN_OBJECTID;
1859 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1860 key.offset = (u64)-1;
1861
1862
1863 while (1) {
1864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1865 if (ret < 0) {
1866 printk(KERN_ERR "Error searching slot for orphan: %d"
1867 "\n", ret);
1868 break;
1869 }
1870
1871 /*
1872 * if ret == 0 means we found what we were searching for, which
1873 * is weird, but possible, so only screw with path if we didnt
1874 * find the key and see if we have stuff that matches
1875 */
1876 if (ret > 0) {
1877 if (path->slots[0] == 0)
1878 break;
1879 path->slots[0]--;
1880 }
1881
1882 /* pull out the item */
1883 leaf = path->nodes[0];
1884 item = btrfs_item_nr(leaf, path->slots[0]);
1885 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1886
1887 /* make sure the item matches what we want */
1888 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1889 break;
1890 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1891 break;
1892
1893 /* release the path since we're done with it */
1894 btrfs_release_path(root, path);
1895
1896 /*
1897 * this is where we are basically btrfs_lookup, without the
1898 * crossing root thing. we store the inode number in the
1899 * offset of the orphan item.
1900 */
1901 inode = btrfs_iget_locked(root->fs_info->sb,
1902 found_key.offset, root);
1903 if (!inode)
1904 break;
1905
1906 if (inode->i_state & I_NEW) {
1907 BTRFS_I(inode)->root = root;
1908
1909 /* have to set the location manually */
1910 BTRFS_I(inode)->location.objectid = inode->i_ino;
1911 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1912 BTRFS_I(inode)->location.offset = 0;
1913
1914 btrfs_read_locked_inode(inode);
1915 unlock_new_inode(inode);
1916 }
1917
1918 /*
1919 * add this inode to the orphan list so btrfs_orphan_del does
1920 * the proper thing when we hit it
1921 */
1922 spin_lock(&root->list_lock);
1923 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1924 spin_unlock(&root->list_lock);
1925
1926 /*
1927 * if this is a bad inode, means we actually succeeded in
1928 * removing the inode, but not the orphan record, which means
1929 * we need to manually delete the orphan since iput will just
1930 * do a destroy_inode
1931 */
1932 if (is_bad_inode(inode)) {
1933 trans = btrfs_start_transaction(root, 1);
1934 btrfs_orphan_del(trans, inode);
1935 btrfs_end_transaction(trans, root);
1936 iput(inode);
1937 continue;
1938 }
1939
1940 /* if we have links, this was a truncate, lets do that */
1941 if (inode->i_nlink) {
1942 nr_truncate++;
1943 btrfs_truncate(inode);
1944 } else {
1945 nr_unlink++;
1946 }
1947
1948 /* this will do delete_inode and everything for us */
1949 iput(inode);
1950 }
1951
1952 if (nr_unlink)
1953 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1954 if (nr_truncate)
1955 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1956
1957 btrfs_free_path(path);
1958}
1959
1960/*
1961 * read an inode from the btree into the in-memory inode
1962 */
1963void btrfs_read_locked_inode(struct inode *inode)
1964{
1965 struct btrfs_path *path;
1966 struct extent_buffer *leaf;
1967 struct btrfs_inode_item *inode_item;
1968 struct btrfs_timespec *tspec;
1969 struct btrfs_root *root = BTRFS_I(inode)->root;
1970 struct btrfs_key location;
1971 u64 alloc_group_block;
1972 u32 rdev;
1973 int ret;
1974
1975 path = btrfs_alloc_path();
1976 BUG_ON(!path);
1977 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1978
1979 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1980 if (ret)
1981 goto make_bad;
1982
1983 leaf = path->nodes[0];
1984 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1985 struct btrfs_inode_item);
1986
1987 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1988 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1989 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1990 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1991 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1992
1993 tspec = btrfs_inode_atime(inode_item);
1994 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1995 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1996
1997 tspec = btrfs_inode_mtime(inode_item);
1998 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1999 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2000
2001 tspec = btrfs_inode_ctime(inode_item);
2002 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2003 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2004
2005 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2006 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2007 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2008 inode->i_generation = BTRFS_I(inode)->generation;
2009 inode->i_rdev = 0;
2010 rdev = btrfs_inode_rdev(leaf, inode_item);
2011
2012 BTRFS_I(inode)->index_cnt = (u64)-1;
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0);
2018 btrfs_free_path(path);
2019 inode_item = NULL;
2020
2021 switch (inode->i_mode & S_IFMT) {
2022 case S_IFREG:
2023 inode->i_mapping->a_ops = &btrfs_aops;
2024 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2025 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2026 inode->i_fop = &btrfs_file_operations;
2027 inode->i_op = &btrfs_file_inode_operations;
2028 break;
2029 case S_IFDIR:
2030 inode->i_fop = &btrfs_dir_file_operations;
2031 if (root == root->fs_info->tree_root)
2032 inode->i_op = &btrfs_dir_ro_inode_operations;
2033 else
2034 inode->i_op = &btrfs_dir_inode_operations;
2035 break;
2036 case S_IFLNK:
2037 inode->i_op = &btrfs_symlink_inode_operations;
2038 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break;
2041 default:
2042 init_special_inode(inode, inode->i_mode, rdev);
2043 break;
2044 }
2045 return;
2046
2047make_bad:
2048 btrfs_free_path(path);
2049 make_bad_inode(inode);
2050}
2051
2052/*
2053 * given a leaf and an inode, copy the inode fields into the leaf
2054 */
2055static void fill_inode_item(struct btrfs_trans_handle *trans,
2056 struct extent_buffer *leaf,
2057 struct btrfs_inode_item *item,
2058 struct inode *inode)
2059{
2060 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2061 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2062 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2063 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2064 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2065
2066 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2067 inode->i_atime.tv_sec);
2068 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2069 inode->i_atime.tv_nsec);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2072 inode->i_mtime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2074 inode->i_mtime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2077 inode->i_ctime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2079 inode->i_ctime.tv_nsec);
2080
2081 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2082 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2083 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2084 btrfs_set_inode_transid(leaf, item, trans->transid);
2085 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2086 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2087 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2088}
2089
2090/*
2091 * copy everything in the in-memory inode into the btree.
2092 */
2093noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root, struct inode *inode)
2095{
2096 struct btrfs_inode_item *inode_item;
2097 struct btrfs_path *path;
2098 struct extent_buffer *leaf;
2099 int ret;
2100
2101 path = btrfs_alloc_path();
2102 BUG_ON(!path);
2103 ret = btrfs_lookup_inode(trans, root, path,
2104 &BTRFS_I(inode)->location, 1);
2105 if (ret) {
2106 if (ret > 0)
2107 ret = -ENOENT;
2108 goto failed;
2109 }
2110
2111 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item);
2114
2115 fill_inode_item(trans, leaf, inode_item, inode);
2116 btrfs_mark_buffer_dirty(leaf);
2117 btrfs_set_inode_last_trans(trans, inode);
2118 ret = 0;
2119failed:
2120 btrfs_free_path(path);
2121 return ret;
2122}
2123
2124
2125/*
2126 * unlink helper that gets used here in inode.c and in the tree logging
2127 * recovery code. It remove a link in a directory with a given name, and
2128 * also drops the back refs in the inode to the directory
2129 */
2130int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2131 struct btrfs_root *root,
2132 struct inode *dir, struct inode *inode,
2133 const char *name, int name_len)
2134{
2135 struct btrfs_path *path;
2136 int ret = 0;
2137 struct extent_buffer *leaf;
2138 struct btrfs_dir_item *di;
2139 struct btrfs_key key;
2140 u64 index;
2141
2142 path = btrfs_alloc_path();
2143 if (!path) {
2144 ret = -ENOMEM;
2145 goto err;
2146 }
2147
2148 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2149 name, name_len, -1);
2150 if (IS_ERR(di)) {
2151 ret = PTR_ERR(di);
2152 goto err;
2153 }
2154 if (!di) {
2155 ret = -ENOENT;
2156 goto err;
2157 }
2158 leaf = path->nodes[0];
2159 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2160 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2161 if (ret)
2162 goto err;
2163 btrfs_release_path(root, path);
2164
2165 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2166 inode->i_ino,
2167 dir->i_ino, &index);
2168 if (ret) {
2169 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2170 "inode %lu parent %lu\n", name_len, name,
2171 inode->i_ino, dir->i_ino);
2172 goto err;
2173 }
2174
2175 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2176 index, name, name_len, -1);
2177 if (IS_ERR(di)) {
2178 ret = PTR_ERR(di);
2179 goto err;
2180 }
2181 if (!di) {
2182 ret = -ENOENT;
2183 goto err;
2184 }
2185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2186 btrfs_release_path(root, path);
2187
2188 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2189 inode, dir->i_ino);
2190 BUG_ON(ret != 0 && ret != -ENOENT);
2191 if (ret != -ENOENT)
2192 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2193
2194 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2195 dir, index);
2196 BUG_ON(ret);
2197err:
2198 btrfs_free_path(path);
2199 if (ret)
2200 goto out;
2201
2202 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2203 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2204 btrfs_update_inode(trans, root, dir);
2205 btrfs_drop_nlink(inode);
2206 ret = btrfs_update_inode(trans, root, inode);
2207 dir->i_sb->s_dirt = 1;
2208out:
2209 return ret;
2210}
2211
2212static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2213{
2214 struct btrfs_root *root;
2215 struct btrfs_trans_handle *trans;
2216 struct inode *inode = dentry->d_inode;
2217 int ret;
2218 unsigned long nr = 0;
2219
2220 root = BTRFS_I(dir)->root;
2221
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1);
2227
2228 btrfs_set_trans_block_group(trans, dir);
2229 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231
2232 if (inode->i_nlink == 0)
2233 ret = btrfs_orphan_add(trans, inode);
2234
2235 nr = trans->blocks_used;
2236
2237 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240 return ret;
2241}
2242
2243static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2244{
2245 struct inode *inode = dentry->d_inode;
2246 int err = 0;
2247 int ret;
2248 struct btrfs_root *root = BTRFS_I(dir)->root;
2249 struct btrfs_trans_handle *trans;
2250 unsigned long nr = 0;
2251
2252 /*
2253 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2254 * the root of a subvolume or snapshot
2255 */
2256 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2257 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2258 return -ENOTEMPTY;
2259 }
2260
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir);
2267
2268 err = btrfs_orphan_add(trans, inode);
2269 if (err)
2270 goto fail_trans;
2271
2272 /* now the directory is empty */
2273 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2274 dentry->d_name.name, dentry->d_name.len);
2275 if (!err)
2276 btrfs_i_size_write(inode, 0);
2277
2278fail_trans:
2279 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr);
2283
2284 if (ret && !err)
2285 err = ret;
2286 return err;
2287}
2288
2289#if 0
/*
 * when truncating bytes in a file, it is possible to avoid reading
 * the leaves that contain only checksum items.  This can be the
 * majority of the IO required to delete a large file, but it must
 * be done carefully.
 *
 * The keys in the level just above the leaves are checked to make sure
 * the lowest key in a given leaf is a csum key, and starts at an offset
 * after the new size.
 *
 * Then the key for the next leaf is checked to make sure it also has
 * a checksum item for the same file.  If it does, we know our target leaf
 * contains only checksum items, and it can be safely freed without reading
 * it.
 *
 * This is just an optimization targeted at large files.  It may do
 * nothing.  It will return 0 unless things went badly.
 *
 * The path is searched with lowest_level == 1, so only path->nodes[1]
 * and path->slots[1] are inspected here.  On every exit the path has
 * lowest_level and keep_locks reset to 0 and is released.
 */
static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct inode *inode, u64 new_size)
{
	struct btrfs_key key;
	int ret;
	int nritems;
	struct btrfs_key found_key;
	struct btrfs_key other_key;
	struct btrfs_leaf_ref *ref;
	u64 leaf_gen;
	u64 leaf_start;

	/* stop the search one level above the leaves */
	path->lowest_level = 1;
	key.objectid = inode->i_ino;
	key.type = BTRFS_CSUM_ITEM_KEY;
	key.offset = new_size;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	/* no level 1 node in the path, there is nothing for us to check */
	if (path->nodes[1] == NULL) {
		ret = 0;
		goto out;
	}
	ret = 0;
	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
	nritems = btrfs_header_nritems(path->nodes[1]);

	if (!nritems)
		goto out;

	/* the slot is past the end of this node, move on to the next node */
	if (path->slots[1] >= nritems)
		goto next_node;

	/* did we find a key greater than anything we want to delete? */
	if (found_key.objectid > inode->i_ino ||
	   (found_key.objectid == inode->i_ino && found_key.type > key.type))
		goto out;

	/* we check the next key in the node to make sure the leaf contains
	 * only checksum items.  This comparison doesn't work if our
	 * leaf is the last one in the node
	 */
	if (path->slots[1] + 1 >= nritems) {
next_node:
		/* search forward from the last key in the node, this
		 * will bring us into the next node in the tree
		 */
		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);

		/* unlikely, but we inc below, so check to be safe */
		if (found_key.offset == (u64)-1)
			goto out;

		/* search_forward needs a path with locks held, do the
		 * search again for the original key.  It is possible
		 * this will race with a balance and return a path that
		 * we could modify, but this drop is just an optimization
		 * and is allowed to miss some leaves.
		 */
		btrfs_release_path(root, path);
		found_key.offset++;

		/* setup a max key for search_forward */
		other_key.offset = (u64)-1;
		other_key.type = key.type;
		other_key.objectid = key.objectid;

		path->keep_locks = 1;
		ret = btrfs_search_forward(root, &found_key, &other_key,
					   path, 0, 0);
		path->keep_locks = 0;
		/* anything other than a csum key for this inode ends the scan */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			goto out;
		}

		/* restart the main search from the key search_forward found */
		key.offset = found_key.offset;
		btrfs_release_path(root, path);
		cond_resched();
		goto again;
	}

	/* we know there's one more slot after us in the tree,
	 * read that key so we can verify it is also a checksum item
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);

	/* this leaf starts with a key from a lower objectid, skip it */
	if (found_key.objectid < inode->i_ino)
		goto next_key;

	/* the first key of this leaf is not a csum key at or past new_size */
	if (found_key.type != key.type || found_key.offset < new_size)
		goto next_key;

	/*
	 * if the key for the next leaf isn't a csum key from this objectid,
	 * we can't be sure there aren't good items inside this leaf.
	 * Bail out
	 */
	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
		goto out;

	leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
	/*
	 * it is safe to delete this leaf, it contains only
	 * csum items from this inode at an offset >= new_size
	 */
	ret = btrfs_del_leaf(trans, root, path, leaf_start);
	BUG_ON(ret);

	/*
	 * for ref_cows roots, leaves from an older generation get an empty
	 * (nritems == 0) leaf ref recorded for them
	 */
	if (root->ref_cows && leaf_gen < trans->transid) {
		ref = btrfs_alloc_leaf_ref(root, 0);
		if (ref) {
			ref->root_gen = root->root_key.offset;
			ref->bytenr = leaf_start;
			ref->owner = 0;
			ref->generation = leaf_gen;
			ref->nritems = 0;

			ret = btrfs_add_leaf_ref(root, ref, 0);
			WARN_ON(ret);
			btrfs_free_leaf_ref(root, ref);
		} else {
			WARN_ON(1);
		}
	}
next_key:
	btrfs_release_path(root, path);

	/* keep going as long as the next leaf starts with one of our csums */
	if (other_key.objectid == inode->i_ino &&
	    other_key.type == key.type && other_key.offset > key.offset) {
		key.offset = other_key.offset;
		cond_resched();
		goto again;
	}
	ret = 0;
out:
	/* fixup any changes we've made to the path */
	path->lowest_level = 0;
	path->keep_locks = 0;
	btrfs_release_path(root, path);
	return ret;
}
2456
2457#endif
2458
2459/*
2460 * this can truncate away extent items, csum items and directory items.
2461 * It starts at a high offset and removes keys until it can't find
2462 * any higher than new_size
2463 *
2464 * csum items that cross the new i_size are truncated to the new size
2465 * as well.
2466 *
2467 * min_type is the minimum key type to truncate down to. If set to 0, this
2468 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2469 */
2470noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2471 struct btrfs_root *root,
2472 struct inode *inode,
2473 u64 new_size, u32 min_type)
2474{
2475 int ret;
2476 struct btrfs_path *path;
2477 struct btrfs_key key;
2478 struct btrfs_key found_key;
2479 u32 found_type;
2480 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0;
2483 u64 extent_num_bytes = 0;
2484 u64 item_end = 0;
2485 u64 root_gen = 0;
2486 u64 root_owner = 0;
2487 int found_extent;
2488 int del_item;
2489 int pending_del_nr = 0;
2490 int pending_del_slot = 0;
2491 int extent_type = -1;
2492 int encoding;
2493 u64 mask = root->sectorsize - 1;
2494
2495 if (root->ref_cows)
2496 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2497 path = btrfs_alloc_path();
2498 path->reada = -1;
2499 BUG_ON(!path);
2500
2501 /* FIXME, add redo link to tree so we don't leak on crash */
2502 key.objectid = inode->i_ino;
2503 key.offset = (u64)-1;
2504 key.type = (u8)-1;
2505
2506 btrfs_init_path(path);
2507
2508search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0)
2511 goto error;
2512
2513 if (ret > 0) {
2514 /* there are no items in the tree for us to truncate, we're
2515 * done
2516 */
2517 if (path->slots[0] == 0) {
2518 ret = 0;
2519 goto error;
2520 }
2521 path->slots[0]--;
2522 }
2523
2524 while (1) {
2525 fi = NULL;
2526 leaf = path->nodes[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 found_type = btrfs_key_type(&found_key);
2529 encoding = 0;
2530
2531 if (found_key.objectid != inode->i_ino)
2532 break;
2533
2534 if (found_type < min_type)
2535 break;
2536
2537 item_end = found_key.offset;
2538 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2539 fi = btrfs_item_ptr(leaf, path->slots[0],
2540 struct btrfs_file_extent_item);
2541 extent_type = btrfs_file_extent_type(leaf, fi);
2542 encoding = btrfs_file_extent_compression(leaf, fi);
2543 encoding |= btrfs_file_extent_encryption(leaf, fi);
2544 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2545
2546 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2547 item_end +=
2548 btrfs_file_extent_num_bytes(leaf, fi);
2549 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2550 item_end += btrfs_file_extent_inline_len(leaf,
2551 fi);
2552 }
2553 item_end--;
2554 }
2555 if (item_end < new_size) {
2556 if (found_type == BTRFS_DIR_ITEM_KEY)
2557 found_type = BTRFS_INODE_ITEM_KEY;
2558 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2559 found_type = BTRFS_EXTENT_DATA_KEY;
2560 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2561 found_type = BTRFS_XATTR_ITEM_KEY;
2562 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2563 found_type = BTRFS_INODE_REF_KEY;
2564 else if (found_type)
2565 found_type--;
2566 else
2567 break;
2568 btrfs_set_key_type(&key, found_type);
2569 goto next;
2570 }
2571 if (found_key.offset >= new_size)
2572 del_item = 1;
2573 else
2574 del_item = 0;
2575 found_extent = 0;
2576
2577 /* FIXME, shrink the extent if the ref count is only 1 */
2578 if (found_type != BTRFS_EXTENT_DATA_KEY)
2579 goto delete;
2580
2581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2582 u64 num_dec;
2583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2584 if (!del_item && !encoding) {
2585 u64 orig_num_bytes =
2586 btrfs_file_extent_num_bytes(leaf, fi);
2587 extent_num_bytes = new_size -
2588 found_key.offset + root->sectorsize - 1;
2589 extent_num_bytes = extent_num_bytes &
2590 ~((u64)root->sectorsize - 1);
2591 btrfs_set_file_extent_num_bytes(leaf, fi,
2592 extent_num_bytes);
2593 num_dec = (orig_num_bytes -
2594 extent_num_bytes);
2595 if (root->ref_cows && extent_start != 0)
2596 inode_sub_bytes(inode, num_dec);
2597 btrfs_mark_buffer_dirty(leaf);
2598 } else {
2599 extent_num_bytes =
2600 btrfs_file_extent_disk_num_bytes(leaf,
2601 fi);
2602 /* FIXME blocksize != 4096 */
2603 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2604 if (extent_start != 0) {
2605 found_extent = 1;
2606 if (root->ref_cows)
2607 inode_sub_bytes(inode, num_dec);
2608 }
2609 root_gen = btrfs_header_generation(leaf);
2610 root_owner = btrfs_header_owner(leaf);
2611 }
2612 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2613 /*
2614 * we can't truncate inline items that have had
2615 * special encodings
2616 */
2617 if (!del_item &&
2618 btrfs_file_extent_compression(leaf, fi) == 0 &&
2619 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2620 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2621 u32 size = new_size - found_key.offset;
2622
2623 if (root->ref_cows) {
2624 inode_sub_bytes(inode, item_end + 1 -
2625 new_size);
2626 }
2627 size =
2628 btrfs_file_extent_calc_inline_size(size);
2629 ret = btrfs_truncate_item(trans, root, path,
2630 size, 1);
2631 BUG_ON(ret);
2632 } else if (root->ref_cows) {
2633 inode_sub_bytes(inode, item_end + 1 -
2634 found_key.offset);
2635 }
2636 }
2637delete:
2638 if (del_item) {
2639 if (!pending_del_nr) {
2640 /* no pending yet, add ourselves */
2641 pending_del_slot = path->slots[0];
2642 pending_del_nr = 1;
2643 } else if (pending_del_nr &&
2644 path->slots[0] + 1 == pending_del_slot) {
2645 /* hop on the pending chunk */
2646 pending_del_nr++;
2647 pending_del_slot = path->slots[0];
2648 } else {
2649 BUG();
2650 }
2651 } else {
2652 break;
2653 }
2654 if (found_extent) {
2655 ret = btrfs_free_extent(trans, root, extent_start,
2656 extent_num_bytes,
2657 leaf->start, root_owner,
2658 root_gen, inode->i_ino, 0);
2659 BUG_ON(ret);
2660 }
2661next:
2662 if (path->slots[0] == 0) {
2663 if (pending_del_nr)
2664 goto del_pending;
2665 btrfs_release_path(root, path);
2666 goto search_again;
2667 }
2668
2669 path->slots[0]--;
2670 if (pending_del_nr &&
2671 path->slots[0] + 1 != pending_del_slot) {
2672 struct btrfs_key debug;
2673del_pending:
2674 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2675 pending_del_slot);
2676 ret = btrfs_del_items(trans, root, path,
2677 pending_del_slot,
2678 pending_del_nr);
2679 BUG_ON(ret);
2680 pending_del_nr = 0;
2681 btrfs_release_path(root, path);
2682 goto search_again;
2683 }
2684 }
2685 ret = 0;
2686error:
2687 if (pending_del_nr) {
2688 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2689 pending_del_nr);
2690 }
2691 btrfs_free_path(path);
2692 inode->i_sb->s_dirt = 1;
2693 return ret;
2694}
2695
2696/*
2697 * taken from block_truncate_page, but does cow as it zeros out
2698 * any bytes left in the last page in the file.
2699 */
2700static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2701{
2702 struct inode *inode = mapping->host;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2705 struct btrfs_ordered_extent *ordered;
2706 char *kaddr;
2707 u32 blocksize = root->sectorsize;
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 struct page *page;
2711 int ret = 0;
2712 u64 page_start;
2713 u64 page_end;
2714
2715 if ((offset & (blocksize - 1)) == 0)
2716 goto out;
2717
2718 ret = -ENOMEM;
2719again:
2720 page = grab_cache_page(mapping, index);
2721 if (!page)
2722 goto out;
2723
2724 page_start = page_offset(page);
2725 page_end = page_start + PAGE_CACHE_SIZE - 1;
2726
2727 if (!PageUptodate(page)) {
2728 ret = btrfs_readpage(NULL, page);
2729 lock_page(page);
2730 if (page->mapping != mapping) {
2731 unlock_page(page);
2732 page_cache_release(page);
2733 goto again;
2734 }
2735 if (!PageUptodate(page)) {
2736 ret = -EIO;
2737 goto out_unlock;
2738 }
2739 }
2740 wait_on_page_writeback(page);
2741
2742 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2743 set_page_extent_mapped(page);
2744
2745 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2746 if (ordered) {
2747 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 unlock_page(page);
2749 page_cache_release(page);
2750 btrfs_start_ordered_extent(inode, ordered, 1);
2751 btrfs_put_ordered_extent(ordered);
2752 goto again;
2753 }
2754
2755 btrfs_set_extent_delalloc(inode, page_start, page_end);
2756 ret = 0;
2757 if (offset != PAGE_CACHE_SIZE) {
2758 kaddr = kmap(page);
2759 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2760 flush_dcache_page(page);
2761 kunmap(page);
2762 }
2763 ClearPageChecked(page);
2764 set_page_dirty(page);
2765 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2766
2767out_unlock:
2768 unlock_page(page);
2769 page_cache_release(page);
2770out:
2771 return ret;
2772}
2773
2774int btrfs_cont_expand(struct inode *inode, loff_t size)
2775{
2776 struct btrfs_trans_handle *trans;
2777 struct btrfs_root *root = BTRFS_I(inode)->root;
2778 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2779 struct extent_map *em;
2780 u64 mask = root->sectorsize - 1;
2781 u64 hole_start = (inode->i_size + mask) & ~mask;
2782 u64 block_end = (size + mask) & ~mask;
2783 u64 last_byte;
2784 u64 cur_offset;
2785 u64 hole_size;
2786 int err;
2787
2788 if (size <= hole_start)
2789 return 0;
2790
2791 err = btrfs_check_free_space(root, 1, 0);
2792 if (err)
2793 return err;
2794
2795 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2796
2797 while (1) {
2798 struct btrfs_ordered_extent *ordered;
2799 btrfs_wait_ordered_range(inode, hole_start,
2800 block_end - hole_start);
2801 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2802 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2803 if (!ordered)
2804 break;
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 btrfs_put_ordered_extent(ordered);
2807 }
2808
2809 trans = btrfs_start_transaction(root, 1);
2810 btrfs_set_trans_block_group(trans, inode);
2811
2812 cur_offset = hole_start;
2813 while (1) {
2814 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2815 block_end - cur_offset, 0);
2816 BUG_ON(IS_ERR(em) || !em);
2817 last_byte = min(extent_map_end(em), block_end);
2818 last_byte = (last_byte + mask) & ~mask;
2819 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2820 u64 hint_byte = 0;
2821 hole_size = last_byte - cur_offset;
2822 err = btrfs_drop_extents(trans, root, inode,
2823 cur_offset,
2824 cur_offset + hole_size,
2825 cur_offset, &hint_byte);
2826 if (err)
2827 break;
2828 err = btrfs_insert_file_extent(trans, root,
2829 inode->i_ino, cur_offset, 0,
2830 0, hole_size, 0, hole_size,
2831 0, 0, 0);
2832 btrfs_drop_extent_cache(inode, hole_start,
2833 last_byte - 1, 0);
2834 }
2835 free_extent_map(em);
2836 cur_offset = last_byte;
2837 if (err || cur_offset >= block_end)
2838 break;
2839 }
2840
2841 btrfs_end_transaction(trans, root);
2842 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2843 return err;
2844}
2845
2846static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2847{
2848 struct inode *inode = dentry->d_inode;
2849 int err;
2850
2851 err = inode_change_ok(inode, attr);
2852 if (err)
2853 return err;
2854
2855 if (S_ISREG(inode->i_mode) &&
2856 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2857 err = btrfs_cont_expand(inode, attr->ia_size);
2858 if (err)
2859 return err;
2860 }
2861
2862 err = inode_setattr(inode, attr);
2863
2864 if (!err && ((attr->ia_valid & ATTR_MODE)))
2865 err = btrfs_acl_chmod(inode);
2866 return err;
2867}
2868
2869void btrfs_delete_inode(struct inode *inode)
2870{
2871 struct btrfs_trans_handle *trans;
2872 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 unsigned long nr;
2874 int ret;
2875
2876 truncate_inode_pages(&inode->i_data, 0);
2877 if (is_bad_inode(inode)) {
2878 btrfs_orphan_del(NULL, inode);
2879 goto no_delete;
2880 }
2881 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2882
2883 btrfs_i_size_write(inode, 0);
2884 trans = btrfs_join_transaction(root, 1);
2885
2886 btrfs_set_trans_block_group(trans, inode);
2887 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2888 if (ret) {
2889 btrfs_orphan_del(NULL, inode);
2890 goto no_delete_lock;
2891 }
2892
2893 btrfs_orphan_del(trans, inode);
2894
2895 nr = trans->blocks_used;
2896 clear_inode(inode);
2897
2898 btrfs_end_transaction(trans, root);
2899 btrfs_btree_balance_dirty(root, nr);
2900 return;
2901
2902no_delete_lock:
2903 nr = trans->blocks_used;
2904 btrfs_end_transaction(trans, root);
2905 btrfs_btree_balance_dirty(root, nr);
2906no_delete:
2907 clear_inode(inode);
2908}
2909
2910/*
2911 * this returns the key found in the dir entry in the location pointer.
2912 * If no dir entries were found, location->objectid is 0.
2913 */
2914static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2915 struct btrfs_key *location)
2916{
2917 const char *name = dentry->d_name.name;
2918 int namelen = dentry->d_name.len;
2919 struct btrfs_dir_item *di;
2920 struct btrfs_path *path;
2921 struct btrfs_root *root = BTRFS_I(dir)->root;
2922 int ret = 0;
2923
2924 path = btrfs_alloc_path();
2925 BUG_ON(!path);
2926
2927 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2928 namelen, 0);
2929 if (IS_ERR(di))
2930 ret = PTR_ERR(di);
2931
2932 if (!di || IS_ERR(di))
2933 goto out_err;
2934
2935 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2936out:
2937 btrfs_free_path(path);
2938 return ret;
2939out_err:
2940 location->objectid = 0;
2941 goto out;
2942}
2943
2944/*
2945 * when we hit a tree root in a directory, the btrfs part of the inode
2946 * needs to be changed to reflect the root directory of the tree root. This
2947 * is kind of like crossing a mount point.
2948 */
2949static int fixup_tree_root_location(struct btrfs_root *root,
2950 struct btrfs_key *location,
2951 struct btrfs_root **sub_root,
2952 struct dentry *dentry)
2953{
2954 struct btrfs_root_item *ri;
2955
2956 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2957 return 0;
2958 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2959 return 0;
2960
2961 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2962 dentry->d_name.name,
2963 dentry->d_name.len);
2964 if (IS_ERR(*sub_root))
2965 return PTR_ERR(*sub_root);
2966
2967 ri = &(*sub_root)->root_item;
2968 location->objectid = btrfs_root_dirid(ri);
2969 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2970 location->offset = 0;
2971
2972 return 0;
2973}
2974
2975static noinline void init_btrfs_i(struct inode *inode)
2976{
2977 struct btrfs_inode *bi = BTRFS_I(inode);
2978
2979 bi->i_acl = NULL;
2980 bi->i_default_acl = NULL;
2981
2982 bi->generation = 0;
2983 bi->sequence = 0;
2984 bi->last_trans = 0;
2985 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0;
2987 bi->disk_i_size = 0;
2988 bi->flags = 0;
2989 bi->index_cnt = (u64)-1;
2990 bi->log_dirty_trans = 0;
2991 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2992 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2993 inode->i_mapping, GFP_NOFS);
2994 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2995 inode->i_mapping, GFP_NOFS);
2996 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2997 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2998 mutex_init(&BTRFS_I(inode)->extent_mutex);
2999 mutex_init(&BTRFS_I(inode)->log_mutex);
3000}
3001
3002static int btrfs_init_locked_inode(struct inode *inode, void *p)
3003{
3004 struct btrfs_iget_args *args = p;
3005 inode->i_ino = args->ino;
3006 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root;
3008 return 0;
3009}
3010
3011static int btrfs_find_actor(struct inode *inode, void *opaque)
3012{
3013 struct btrfs_iget_args *args = opaque;
3014 return args->ino == inode->i_ino &&
3015 args->root == BTRFS_I(inode)->root;
3016}
3017
3018struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3019 struct btrfs_root *root, int wait)
3020{
3021 struct inode *inode;
3022 struct btrfs_iget_args args;
3023 args.ino = objectid;
3024 args.root = root;
3025
3026 if (wait) {
3027 inode = ilookup5(s, objectid, btrfs_find_actor,
3028 (void *)&args);
3029 } else {
3030 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3031 (void *)&args);
3032 }
3033 return inode;
3034}
3035
3036struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3037 struct btrfs_root *root)
3038{
3039 struct inode *inode;
3040 struct btrfs_iget_args args;
3041 args.ino = objectid;
3042 args.root = root;
3043
3044 inode = iget5_locked(s, objectid, btrfs_find_actor,
3045 btrfs_init_locked_inode,
3046 (void *)&args);
3047 return inode;
3048}
3049
3050/* Get an inode object given its location and corresponding root.
3051 * Returns in *is_new if the inode was read from disk
3052 */
3053struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3054 struct btrfs_root *root, int *is_new)
3055{
3056 struct inode *inode;
3057
3058 inode = btrfs_iget_locked(s, location->objectid, root);
3059 if (!inode)
3060 return ERR_PTR(-EACCES);
3061
3062 if (inode->i_state & I_NEW) {
3063 BTRFS_I(inode)->root = root;
3064 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3065 btrfs_read_locked_inode(inode);
3066 unlock_new_inode(inode);
3067 if (is_new)
3068 *is_new = 1;
3069 } else {
3070 if (is_new)
3071 *is_new = 0;
3072 }
3073
3074 return inode;
3075}
3076
3077struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3078{
3079 struct inode *inode;
3080 struct btrfs_inode *bi = BTRFS_I(dir);
3081 struct btrfs_root *root = bi->root;
3082 struct btrfs_root *sub_root = root;
3083 struct btrfs_key location;
3084 int ret, new;
3085
3086 if (dentry->d_name.len > BTRFS_NAME_LEN)
3087 return ERR_PTR(-ENAMETOOLONG);
3088
3089 ret = btrfs_inode_by_name(dir, dentry, &location);
3090
3091 if (ret < 0)
3092 return ERR_PTR(ret);
3093
3094 inode = NULL;
3095 if (location.objectid) {
3096 ret = fixup_tree_root_location(root, &location, &sub_root,
3097 dentry);
3098 if (ret < 0)
3099 return ERR_PTR(ret);
3100 if (ret > 0)
3101 return ERR_PTR(-ENOENT);
3102 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3103 if (IS_ERR(inode))
3104 return ERR_CAST(inode);
3105 }
3106 return inode;
3107}
3108
3109static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3110 struct nameidata *nd)
3111{
3112 struct inode *inode;
3113
3114 if (dentry->d_name.len > BTRFS_NAME_LEN)
3115 return ERR_PTR(-ENAMETOOLONG);
3116
3117 inode = btrfs_lookup_dentry(dir, dentry);
3118 if (IS_ERR(inode))
3119 return ERR_CAST(inode);
3120
3121 return d_splice_alias(inode, dentry);
3122}
3123
3124static unsigned char btrfs_filetype_table[] = {
3125 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3126};
3127
3128static int btrfs_real_readdir(struct file *filp, void *dirent,
3129 filldir_t filldir)
3130{
3131 struct inode *inode = filp->f_dentry->d_inode;
3132 struct btrfs_root *root = BTRFS_I(inode)->root;
3133 struct btrfs_item *item;
3134 struct btrfs_dir_item *di;
3135 struct btrfs_key key;
3136 struct btrfs_key found_key;
3137 struct btrfs_path *path;
3138 int ret;
3139 u32 nritems;
3140 struct extent_buffer *leaf;
3141 int slot;
3142 int advance;
3143 unsigned char d_type;
3144 int over = 0;
3145 u32 di_cur;
3146 u32 di_total;
3147 u32 di_len;
3148 int key_type = BTRFS_DIR_INDEX_KEY;
3149 char tmp_name[32];
3150 char *name_ptr;
3151 int name_len;
3152
3153 /* FIXME, use a real flag for deciding about the key type */
3154 if (root->fs_info->tree_root == root)
3155 key_type = BTRFS_DIR_ITEM_KEY;
3156
3157 /* special case for "." */
3158 if (filp->f_pos == 0) {
3159 over = filldir(dirent, ".", 1,
3160 1, inode->i_ino,
3161 DT_DIR);
3162 if (over)
3163 return 0;
3164 filp->f_pos = 1;
3165 }
3166 /* special case for .., just use the back ref */
3167 if (filp->f_pos == 1) {
3168 u64 pino = parent_ino(filp->f_path.dentry);
3169 over = filldir(dirent, "..", 2,
3170 2, pino, DT_DIR);
3171 if (over)
3172 return 0;
3173 filp->f_pos = 2;
3174 }
3175 path = btrfs_alloc_path();
3176 path->reada = 2;
3177
3178 btrfs_set_key_type(&key, key_type);
3179 key.offset = filp->f_pos;
3180 key.objectid = inode->i_ino;
3181
3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3183 if (ret < 0)
3184 goto err;
3185 advance = 0;
3186
3187 while (1) {
3188 leaf = path->nodes[0];
3189 nritems = btrfs_header_nritems(leaf);
3190 slot = path->slots[0];
3191 if (advance || slot >= nritems) {
3192 if (slot >= nritems - 1) {
3193 ret = btrfs_next_leaf(root, path);
3194 if (ret)
3195 break;
3196 leaf = path->nodes[0];
3197 nritems = btrfs_header_nritems(leaf);
3198 slot = path->slots[0];
3199 } else {
3200 slot++;
3201 path->slots[0]++;
3202 }
3203 }
3204
3205 advance = 1;
3206 item = btrfs_item_nr(leaf, slot);
3207 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3208
3209 if (found_key.objectid != key.objectid)
3210 break;
3211 if (btrfs_key_type(&found_key) != key_type)
3212 break;
3213 if (found_key.offset < filp->f_pos)
3214 continue;
3215
3216 filp->f_pos = found_key.offset;
3217
3218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3219 di_cur = 0;
3220 di_total = btrfs_item_size(leaf, item);
3221
3222 while (di_cur < di_total) {
3223 struct btrfs_key location;
3224
3225 name_len = btrfs_dir_name_len(leaf, di);
3226 if (name_len <= sizeof(tmp_name)) {
3227 name_ptr = tmp_name;
3228 } else {
3229 name_ptr = kmalloc(name_len, GFP_NOFS);
3230 if (!name_ptr) {
3231 ret = -ENOMEM;
3232 goto err;
3233 }
3234 }
3235 read_extent_buffer(leaf, name_ptr,
3236 (unsigned long)(di + 1), name_len);
3237
3238 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3239 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3240
3241 /* is this a reference to our own snapshot? If so
3242 * skip it
3243 */
3244 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3245 location.objectid == root->root_key.objectid) {
3246 over = 0;
3247 goto skip;
3248 }
3249 over = filldir(dirent, name_ptr, name_len,
3250 found_key.offset, location.objectid,
3251 d_type);
3252
3253skip:
3254 if (name_ptr != tmp_name)
3255 kfree(name_ptr);
3256
3257 if (over)
3258 goto nopos;
3259 di_len = btrfs_dir_name_len(leaf, di) +
3260 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3261 di_cur += di_len;
3262 di = (struct btrfs_dir_item *)((char *)di + di_len);
3263 }
3264 }
3265
3266 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3269 else
3270 filp->f_pos++;
3271nopos:
3272 ret = 0;
3273err:
3274 btrfs_free_path(path);
3275 return ret;
3276}
3277
3278int btrfs_write_inode(struct inode *inode, int wait)
3279{
3280 struct btrfs_root *root = BTRFS_I(inode)->root;
3281 struct btrfs_trans_handle *trans;
3282 int ret = 0;
3283
3284 if (root->fs_info->btree_inode == inode)
3285 return 0;
3286
3287 if (wait) {
3288 trans = btrfs_join_transaction(root, 1);
3289 btrfs_set_trans_block_group(trans, inode);
3290 ret = btrfs_commit_transaction(trans, root);
3291 }
3292 return ret;
3293}
3294
3295/*
3296 * This is somewhat expensive, updating the tree every time the
3297 * inode changes. But, it is most likely to find the inode in cache.
3298 * FIXME, needs more benchmarking...there are no reasons other than performance
3299 * to keep or drop this code.
3300 */
3301void btrfs_dirty_inode(struct inode *inode)
3302{
3303 struct btrfs_root *root = BTRFS_I(inode)->root;
3304 struct btrfs_trans_handle *trans;
3305
3306 trans = btrfs_join_transaction(root, 1);
3307 btrfs_set_trans_block_group(trans, inode);
3308 btrfs_update_inode(trans, root, inode);
3309 btrfs_end_transaction(trans, root);
3310}
3311
3312/*
3313 * find the highest existing sequence number in a directory
3314 * and then set the in-memory index_cnt variable to reflect
3315 * free sequence numbers
3316 */
3317static int btrfs_set_inode_index_count(struct inode *inode)
3318{
3319 struct btrfs_root *root = BTRFS_I(inode)->root;
3320 struct btrfs_key key, found_key;
3321 struct btrfs_path *path;
3322 struct extent_buffer *leaf;
3323 int ret;
3324
3325 key.objectid = inode->i_ino;
3326 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3327 key.offset = (u64)-1;
3328
3329 path = btrfs_alloc_path();
3330 if (!path)
3331 return -ENOMEM;
3332
3333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3334 if (ret < 0)
3335 goto out;
3336 /* FIXME: we should be able to handle this */
3337 if (ret == 0)
3338 goto out;
3339 ret = 0;
3340
3341 /*
3342 * MAGIC NUMBER EXPLANATION:
3343 * since we search a directory based on f_pos we have to start at 2
3344 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3345 * else has to start at 2
3346 */
3347 if (path->slots[0] == 0) {
3348 BTRFS_I(inode)->index_cnt = 2;
3349 goto out;
3350 }
3351
3352 path->slots[0]--;
3353
3354 leaf = path->nodes[0];
3355 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3356
3357 if (found_key.objectid != inode->i_ino ||
3358 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3359 BTRFS_I(inode)->index_cnt = 2;
3360 goto out;
3361 }
3362
3363 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3364out:
3365 btrfs_free_path(path);
3366 return ret;
3367}
3368
3369/*
3370 * helper to find a free sequence number in a given directory. This current
3371 * code is very simple, later versions will do smarter things in the btree
3372 */
3373int btrfs_set_inode_index(struct inode *dir, u64 *index)
3374{
3375 int ret = 0;
3376
3377 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3378 ret = btrfs_set_inode_index_count(dir);
3379 if (ret)
3380 return ret;
3381 }
3382
3383 *index = BTRFS_I(dir)->index_cnt;
3384 BTRFS_I(dir)->index_cnt++;
3385
3386 return ret;
3387}
3388
3389static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3390 struct btrfs_root *root,
3391 struct inode *dir,
3392 const char *name, int name_len,
3393 u64 ref_objectid, u64 objectid,
3394 u64 alloc_hint, int mode, u64 *index)
3395{
3396 struct inode *inode;
3397 struct btrfs_inode_item *inode_item;
3398 struct btrfs_key *location;
3399 struct btrfs_path *path;
3400 struct btrfs_inode_ref *ref;
3401 struct btrfs_key key[2];
3402 u32 sizes[2];
3403 unsigned long ptr;
3404 int ret;
3405 int owner;
3406
3407 path = btrfs_alloc_path();
3408 BUG_ON(!path);
3409
3410 inode = new_inode(root->fs_info->sb);
3411 if (!inode)
3412 return ERR_PTR(-ENOMEM);
3413
3414 if (dir) {
3415 ret = btrfs_set_inode_index(dir, index);
3416 if (ret)
3417 return ERR_PTR(ret);
3418 }
3419 /*
3420 * index_cnt is ignored for everything but a dir,
3421 * btrfs_get_inode_index_count has an explanation for the magic
3422 * number
3423 */
3424 init_btrfs_i(inode);
3425 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid;
3428
3429 if (mode & S_IFDIR)
3430 owner = 0;
3431 else
3432 owner = 1;
3433 BTRFS_I(inode)->block_group =
3434 btrfs_find_block_group(root, 0, alloc_hint, owner);
3435 if ((mode & S_IFREG)) {
3436 if (btrfs_test_opt(root, NODATASUM))
3437 btrfs_set_flag(inode, NODATASUM);
3438 if (btrfs_test_opt(root, NODATACOW))
3439 btrfs_set_flag(inode, NODATACOW);
3440 }
3441
3442 key[0].objectid = objectid;
3443 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3444 key[0].offset = 0;
3445
3446 key[1].objectid = objectid;
3447 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3448 key[1].offset = ref_objectid;
3449
3450 sizes[0] = sizeof(struct btrfs_inode_item);
3451 sizes[1] = name_len + sizeof(*ref);
3452
3453 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3454 if (ret != 0)
3455 goto fail;
3456
3457 if (objectid > root->highest_inode)
3458 root->highest_inode = objectid;
3459
3460 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid();
3462 inode->i_mode = mode;
3463 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0);
3465 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3466 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3467 struct btrfs_inode_item);
3468 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3469
3470 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3471 struct btrfs_inode_ref);
3472 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3473 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3474 ptr = (unsigned long)(ref + 1);
3475 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3476
3477 btrfs_mark_buffer_dirty(path->nodes[0]);
3478 btrfs_free_path(path);
3479
3480 location = &BTRFS_I(inode)->location;
3481 location->objectid = objectid;
3482 location->offset = 0;
3483 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3484
3485 insert_inode_hash(inode);
3486 return inode;
3487fail:
3488 if (dir)
3489 BTRFS_I(dir)->index_cnt--;
3490 btrfs_free_path(path);
3491 return ERR_PTR(ret);
3492}
3493
3494static inline u8 btrfs_inode_type(struct inode *inode)
3495{
3496 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3497}
3498
3499/*
3500 * utility function to add 'inode' into 'parent_inode' with
3501 * a give name and a given sequence number.
3502 * if 'add_backref' is true, also insert a backref from the
3503 * inode to the parent directory.
3504 */
3505int btrfs_add_link(struct btrfs_trans_handle *trans,
3506 struct inode *parent_inode, struct inode *inode,
3507 const char *name, int name_len, int add_backref, u64 index)
3508{
3509 int ret;
3510 struct btrfs_key key;
3511 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3512
3513 key.objectid = inode->i_ino;
3514 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3515 key.offset = 0;
3516
3517 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3518 parent_inode->i_ino,
3519 &key, btrfs_inode_type(inode),
3520 index);
3521 if (ret == 0) {
3522 if (add_backref) {
3523 ret = btrfs_insert_inode_ref(trans, root,
3524 name, name_len,
3525 inode->i_ino,
3526 parent_inode->i_ino,
3527 index);
3528 }
3529 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3530 name_len * 2);
3531 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3532 ret = btrfs_update_inode(trans, root, parent_inode);
3533 }
3534 return ret;
3535}
3536
3537static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3538 struct dentry *dentry, struct inode *inode,
3539 int backref, u64 index)
3540{
3541 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3542 inode, dentry->d_name.name,
3543 dentry->d_name.len, backref, index);
3544 if (!err) {
3545 d_instantiate(dentry, inode);
3546 return 0;
3547 }
3548 if (err > 0)
3549 err = -EEXIST;
3550 return err;
3551}
3552
3553static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3554 int mode, dev_t rdev)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct inode *inode = NULL;
3559 int err;
3560 int drop_inode = 0;
3561 u64 objectid;
3562 unsigned long nr = 0;
3563 u64 index = 0;
3564
3565 if (!new_valid_dev(rdev))
3566 return -EINVAL;
3567
3568 err = btrfs_check_free_space(root, 1, 0);
3569 if (err)
3570 goto fail;
3571
3572 trans = btrfs_start_transaction(root, 1);
3573 btrfs_set_trans_block_group(trans, dir);
3574
3575 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3576 if (err) {
3577 err = -ENOSPC;
3578 goto out_unlock;
3579 }
3580
3581 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3582 dentry->d_name.len,
3583 dentry->d_parent->d_inode->i_ino, objectid,
3584 BTRFS_I(dir)->block_group, mode, &index);
3585 err = PTR_ERR(inode);
3586 if (IS_ERR(inode))
3587 goto out_unlock;
3588
3589 err = btrfs_init_acl(inode, dir);
3590 if (err) {
3591 drop_inode = 1;
3592 goto out_unlock;
3593 }
3594
3595 btrfs_set_trans_block_group(trans, inode);
3596 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3597 if (err)
3598 drop_inode = 1;
3599 else {
3600 inode->i_op = &btrfs_special_inode_operations;
3601 init_special_inode(inode, inode->i_mode, rdev);
3602 btrfs_update_inode(trans, root, inode);
3603 }
3604 dir->i_sb->s_dirt = 1;
3605 btrfs_update_inode_block_group(trans, inode);
3606 btrfs_update_inode_block_group(trans, dir);
3607out_unlock:
3608 nr = trans->blocks_used;
3609 btrfs_end_transaction_throttle(trans, root);
3610fail:
3611 if (drop_inode) {
3612 inode_dec_link_count(inode);
3613 iput(inode);
3614 }
3615 btrfs_btree_balance_dirty(root, nr);
3616 return err;
3617}
3618
3619static int btrfs_create(struct inode *dir, struct dentry *dentry,
3620 int mode, struct nameidata *nd)
3621{
3622 struct btrfs_trans_handle *trans;
3623 struct btrfs_root *root = BTRFS_I(dir)->root;
3624 struct inode *inode = NULL;
3625 int err;
3626 int drop_inode = 0;
3627 unsigned long nr = 0;
3628 u64 objectid;
3629 u64 index = 0;
3630
3631 err = btrfs_check_free_space(root, 1, 0);
3632 if (err)
3633 goto fail;
3634 trans = btrfs_start_transaction(root, 1);
3635 btrfs_set_trans_block_group(trans, dir);
3636
3637 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3638 if (err) {
3639 err = -ENOSPC;
3640 goto out_unlock;
3641 }
3642
3643 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3644 dentry->d_name.len,
3645 dentry->d_parent->d_inode->i_ino,
3646 objectid, BTRFS_I(dir)->block_group, mode,
3647 &index);
3648 err = PTR_ERR(inode);
3649 if (IS_ERR(inode))
3650 goto out_unlock;
3651
3652 err = btrfs_init_acl(inode, dir);
3653 if (err) {
3654 drop_inode = 1;
3655 goto out_unlock;
3656 }
3657
3658 btrfs_set_trans_block_group(trans, inode);
3659 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3660 if (err)
3661 drop_inode = 1;
3662 else {
3663 inode->i_mapping->a_ops = &btrfs_aops;
3664 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3665 inode->i_fop = &btrfs_file_operations;
3666 inode->i_op = &btrfs_file_inode_operations;
3667 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3668 }
3669 dir->i_sb->s_dirt = 1;
3670 btrfs_update_inode_block_group(trans, inode);
3671 btrfs_update_inode_block_group(trans, dir);
3672out_unlock:
3673 nr = trans->blocks_used;
3674 btrfs_end_transaction_throttle(trans, root);
3675fail:
3676 if (drop_inode) {
3677 inode_dec_link_count(inode);
3678 iput(inode);
3679 }
3680 btrfs_btree_balance_dirty(root, nr);
3681 return err;
3682}
3683
3684static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3685 struct dentry *dentry)
3686{
3687 struct btrfs_trans_handle *trans;
3688 struct btrfs_root *root = BTRFS_I(dir)->root;
3689 struct inode *inode = old_dentry->d_inode;
3690 u64 index;
3691 unsigned long nr = 0;
3692 int err;
3693 int drop_inode = 0;
3694
3695 if (inode->i_nlink == 0)
3696 return -ENOENT;
3697
3698 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0);
3700 if (err)
3701 goto fail;
3702 err = btrfs_set_inode_index(dir, &index);
3703 if (err)
3704 goto fail;
3705
3706 trans = btrfs_start_transaction(root, 1);
3707
3708 btrfs_set_trans_block_group(trans, dir);
3709 atomic_inc(&inode->i_count);
3710
3711 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3712
3713 if (err)
3714 drop_inode = 1;
3715
3716 dir->i_sb->s_dirt = 1;
3717 btrfs_update_inode_block_group(trans, dir);
3718 err = btrfs_update_inode(trans, root, inode);
3719
3720 if (err)
3721 drop_inode = 1;
3722
3723 nr = trans->blocks_used;
3724 btrfs_end_transaction_throttle(trans, root);
3725fail:
3726 if (drop_inode) {
3727 inode_dec_link_count(inode);
3728 iput(inode);
3729 }
3730 btrfs_btree_balance_dirty(root, nr);
3731 return err;
3732}
3733
3734static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3735{
3736 struct inode *inode = NULL;
3737 struct btrfs_trans_handle *trans;
3738 struct btrfs_root *root = BTRFS_I(dir)->root;
3739 int err = 0;
3740 int drop_on_err = 0;
3741 u64 objectid = 0;
3742 u64 index = 0;
3743 unsigned long nr = 1;
3744
3745 err = btrfs_check_free_space(root, 1, 0);
3746 if (err)
3747 goto out_unlock;
3748
3749 trans = btrfs_start_transaction(root, 1);
3750 btrfs_set_trans_block_group(trans, dir);
3751
3752 if (IS_ERR(trans)) {
3753 err = PTR_ERR(trans);
3754 goto out_unlock;
3755 }
3756
3757 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3758 if (err) {
3759 err = -ENOSPC;
3760 goto out_unlock;
3761 }
3762
3763 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3764 dentry->d_name.len,
3765 dentry->d_parent->d_inode->i_ino, objectid,
3766 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3767 &index);
3768 if (IS_ERR(inode)) {
3769 err = PTR_ERR(inode);
3770 goto out_fail;
3771 }
3772
3773 drop_on_err = 1;
3774
3775 err = btrfs_init_acl(inode, dir);
3776 if (err)
3777 goto out_fail;
3778
3779 inode->i_op = &btrfs_dir_inode_operations;
3780 inode->i_fop = &btrfs_dir_file_operations;
3781 btrfs_set_trans_block_group(trans, inode);
3782
3783 btrfs_i_size_write(inode, 0);
3784 err = btrfs_update_inode(trans, root, inode);
3785 if (err)
3786 goto out_fail;
3787
3788 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3789 inode, dentry->d_name.name,
3790 dentry->d_name.len, 0, index);
3791 if (err)
3792 goto out_fail;
3793
3794 d_instantiate(dentry, inode);
3795 drop_on_err = 0;
3796 dir->i_sb->s_dirt = 1;
3797 btrfs_update_inode_block_group(trans, inode);
3798 btrfs_update_inode_block_group(trans, dir);
3799
3800out_fail:
3801 nr = trans->blocks_used;
3802 btrfs_end_transaction_throttle(trans, root);
3803
3804out_unlock:
3805 if (drop_on_err)
3806 iput(inode);
3807 btrfs_btree_balance_dirty(root, nr);
3808 return err;
3809}
3810
3811/* helper for btfs_get_extent. Given an existing extent in the tree,
3812 * and an extent that you want to insert, deal with overlap and insert
3813 * the new extent into the tree.
3814 */
3815static int merge_extent_mapping(struct extent_map_tree *em_tree,
3816 struct extent_map *existing,
3817 struct extent_map *em,
3818 u64 map_start, u64 map_len)
3819{
3820 u64 start_diff;
3821
3822 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3823 start_diff = map_start - em->start;
3824 em->start = map_start;
3825 em->len = map_len;
3826 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3827 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3828 em->block_start += start_diff;
3829 em->block_len -= start_diff;
3830 }
3831 return add_extent_mapping(em_tree, em);
3832}
3833
3834static noinline int uncompress_inline(struct btrfs_path *path,
3835 struct inode *inode, struct page *page,
3836 size_t pg_offset, u64 extent_offset,
3837 struct btrfs_file_extent_item *item)
3838{
3839 int ret;
3840 struct extent_buffer *leaf = path->nodes[0];
3841 char *tmp;
3842 size_t max_size;
3843 unsigned long inline_size;
3844 unsigned long ptr;
3845
3846 WARN_ON(pg_offset != 0);
3847 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3848 inline_size = btrfs_file_extent_inline_item_len(leaf,
3849 btrfs_item_nr(leaf, path->slots[0]));
3850 tmp = kmalloc(inline_size, GFP_NOFS);
3851 ptr = btrfs_file_extent_inline_start(item);
3852
3853 read_extent_buffer(leaf, tmp, ptr, inline_size);
3854
3855 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3856 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3857 inline_size, max_size);
3858 if (ret) {
3859 char *kaddr = kmap_atomic(page, KM_USER0);
3860 unsigned long copy_size = min_t(u64,
3861 PAGE_CACHE_SIZE - pg_offset,
3862 max_size - extent_offset);
3863 memset(kaddr + pg_offset, 0, copy_size);
3864 kunmap_atomic(kaddr, KM_USER0);
3865 }
3866 kfree(tmp);
3867 return 0;
3868}
3869
3870/*
3871 * a bit scary, this does extent mapping from logical file offset to the disk.
3872 * the ugly parts come from merging extents from the disk with the in-ram
3873 * representation. This gets more complex because of the data=ordered code,
3874 * where the in-ram extents might be locked pending data=ordered completion.
3875 *
3876 * This also copies inline extents directly into the page.
3877 */
3878
3879struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3880 size_t pg_offset, u64 start, u64 len,
3881 int create)
3882{
3883 int ret;
3884 int err = 0;
3885 u64 bytenr;
3886 u64 extent_start = 0;
3887 u64 extent_end = 0;
3888 u64 objectid = inode->i_ino;
3889 u32 found_type;
3890 struct btrfs_path *path = NULL;
3891 struct btrfs_root *root = BTRFS_I(inode)->root;
3892 struct btrfs_file_extent_item *item;
3893 struct extent_buffer *leaf;
3894 struct btrfs_key found_key;
3895 struct extent_map *em = NULL;
3896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3897 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3898 struct btrfs_trans_handle *trans = NULL;
3899 int compressed;
3900
3901again:
3902 spin_lock(&em_tree->lock);
3903 em = lookup_extent_mapping(em_tree, start, len);
3904 if (em)
3905 em->bdev = root->fs_info->fs_devices->latest_bdev;
3906 spin_unlock(&em_tree->lock);
3907
3908 if (em) {
3909 if (em->start > start || em->start + em->len <= start)
3910 free_extent_map(em);
3911 else if (em->block_start == EXTENT_MAP_INLINE && page)
3912 free_extent_map(em);
3913 else
3914 goto out;
3915 }
3916 em = alloc_extent_map(GFP_NOFS);
3917 if (!em) {
3918 err = -ENOMEM;
3919 goto out;
3920 }
3921 em->bdev = root->fs_info->fs_devices->latest_bdev;
3922 em->start = EXTENT_MAP_HOLE;
3923 em->orig_start = EXTENT_MAP_HOLE;
3924 em->len = (u64)-1;
3925 em->block_len = (u64)-1;
3926
3927 if (!path) {
3928 path = btrfs_alloc_path();
3929 BUG_ON(!path);
3930 }
3931
3932 ret = btrfs_lookup_file_extent(trans, root, path,
3933 objectid, start, trans != NULL);
3934 if (ret < 0) {
3935 err = ret;
3936 goto out;
3937 }
3938
3939 if (ret != 0) {
3940 if (path->slots[0] == 0)
3941 goto not_found;
3942 path->slots[0]--;
3943 }
3944
3945 leaf = path->nodes[0];
3946 item = btrfs_item_ptr(leaf, path->slots[0],
3947 struct btrfs_file_extent_item);
3948 /* are we inside the extent that was found? */
3949 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3950 found_type = btrfs_key_type(&found_key);
3951 if (found_key.objectid != objectid ||
3952 found_type != BTRFS_EXTENT_DATA_KEY) {
3953 goto not_found;
3954 }
3955
3956 found_type = btrfs_file_extent_type(leaf, item);
3957 extent_start = found_key.offset;
3958 compressed = btrfs_file_extent_compression(leaf, item);
3959 if (found_type == BTRFS_FILE_EXTENT_REG ||
3960 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3961 extent_end = extent_start +
3962 btrfs_file_extent_num_bytes(leaf, item);
3963 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3964 size_t size;
3965 size = btrfs_file_extent_inline_len(leaf, item);
3966 extent_end = (extent_start + size + root->sectorsize - 1) &
3967 ~((u64)root->sectorsize - 1);
3968 }
3969
3970 if (start >= extent_end) {
3971 path->slots[0]++;
3972 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3973 ret = btrfs_next_leaf(root, path);
3974 if (ret < 0) {
3975 err = ret;
3976 goto out;
3977 }
3978 if (ret > 0)
3979 goto not_found;
3980 leaf = path->nodes[0];
3981 }
3982 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983 if (found_key.objectid != objectid ||
3984 found_key.type != BTRFS_EXTENT_DATA_KEY)
3985 goto not_found;
3986 if (start + len <= found_key.offset)
3987 goto not_found;
3988 em->start = start;
3989 em->len = found_key.offset - start;
3990 goto not_found_em;
3991 }
3992
3993 if (found_type == BTRFS_FILE_EXTENT_REG ||
3994 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3995 em->start = extent_start;
3996 em->len = extent_end - extent_start;
3997 em->orig_start = extent_start -
3998 btrfs_file_extent_offset(leaf, item);
3999 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4000 if (bytenr == 0) {
4001 em->block_start = EXTENT_MAP_HOLE;
4002 goto insert;
4003 }
4004 if (compressed) {
4005 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4006 em->block_start = bytenr;
4007 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4008 item);
4009 } else {
4010 bytenr += btrfs_file_extent_offset(leaf, item);
4011 em->block_start = bytenr;
4012 em->block_len = em->len;
4013 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4014 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4015 }
4016 goto insert;
4017 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4018 unsigned long ptr;
4019 char *map;
4020 size_t size;
4021 size_t extent_offset;
4022 size_t copy_size;
4023
4024 em->block_start = EXTENT_MAP_INLINE;
4025 if (!page || create) {
4026 em->start = extent_start;
4027 em->len = extent_end - extent_start;
4028 goto out;
4029 }
4030
4031 size = btrfs_file_extent_inline_len(leaf, item);
4032 extent_offset = page_offset(page) + pg_offset - extent_start;
4033 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4034 size - extent_offset);
4035 em->start = extent_start + extent_offset;
4036 em->len = (copy_size + root->sectorsize - 1) &
4037 ~((u64)root->sectorsize - 1);
4038 em->orig_start = EXTENT_MAP_INLINE;
4039 if (compressed)
4040 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4041 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4042 if (create == 0 && !PageUptodate(page)) {
4043 if (btrfs_file_extent_compression(leaf, item) ==
4044 BTRFS_COMPRESS_ZLIB) {
4045 ret = uncompress_inline(path, inode, page,
4046 pg_offset,
4047 extent_offset, item);
4048 BUG_ON(ret);
4049 } else {
4050 map = kmap(page);
4051 read_extent_buffer(leaf, map + pg_offset, ptr,
4052 copy_size);
4053 kunmap(page);
4054 }
4055 flush_dcache_page(page);
4056 } else if (create && PageUptodate(page)) {
4057 if (!trans) {
4058 kunmap(page);
4059 free_extent_map(em);
4060 em = NULL;
4061 btrfs_release_path(root, path);
4062 trans = btrfs_join_transaction(root, 1);
4063 goto again;
4064 }
4065 map = kmap(page);
4066 write_extent_buffer(leaf, map + pg_offset, ptr,
4067 copy_size);
4068 kunmap(page);
4069 btrfs_mark_buffer_dirty(leaf);
4070 }
4071 set_extent_uptodate(io_tree, em->start,
4072 extent_map_end(em) - 1, GFP_NOFS);
4073 goto insert;
4074 } else {
4075 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4076 WARN_ON(1);
4077 }
4078not_found:
4079 em->start = start;
4080 em->len = len;
4081not_found_em:
4082 em->block_start = EXTENT_MAP_HOLE;
4083 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4084insert:
4085 btrfs_release_path(root, path);
4086 if (em->start > start || extent_map_end(em) <= start) {
4087 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4088 "[%llu %llu]\n", (unsigned long long)em->start,
4089 (unsigned long long)em->len,
4090 (unsigned long long)start,
4091 (unsigned long long)len);
4092 err = -EIO;
4093 goto out;
4094 }
4095
4096 err = 0;
4097 spin_lock(&em_tree->lock);
4098 ret = add_extent_mapping(em_tree, em);
4099 /* it is possible that someone inserted the extent into the tree
4100 * while we had the lock dropped. It is also possible that
4101 * an overlapping map exists in the tree
4102 */
4103 if (ret == -EEXIST) {
4104 struct extent_map *existing;
4105
4106 ret = 0;
4107
4108 existing = lookup_extent_mapping(em_tree, start, len);
4109 if (existing && (existing->start > start ||
4110 existing->start + existing->len <= start)) {
4111 free_extent_map(existing);
4112 existing = NULL;
4113 }
4114 if (!existing) {
4115 existing = lookup_extent_mapping(em_tree, em->start,
4116 em->len);
4117 if (existing) {
4118 err = merge_extent_mapping(em_tree, existing,
4119 em, start,
4120 root->sectorsize);
4121 free_extent_map(existing);
4122 if (err) {
4123 free_extent_map(em);
4124 em = NULL;
4125 }
4126 } else {
4127 err = -EIO;
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 free_extent_map(em);
4133 em = existing;
4134 err = 0;
4135 }
4136 }
4137 spin_unlock(&em_tree->lock);
4138out:
4139 if (path)
4140 btrfs_free_path(path);
4141 if (trans) {
4142 ret = btrfs_end_transaction(trans, root);
4143 if (!err)
4144 err = ret;
4145 }
4146 if (err) {
4147 free_extent_map(em);
4148 WARN_ON(1);
4149 return ERR_PTR(err);
4150 }
4151 return em;
4152}
4153
4154static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4155 const struct iovec *iov, loff_t offset,
4156 unsigned long nr_segs)
4157{
4158 return -EINVAL;
4159}
4160
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4162{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent);
4164}
4165
4166int btrfs_readpage(struct file *file, struct page *page)
4167{
4168 struct extent_io_tree *tree;
4169 tree = &BTRFS_I(page->mapping->host)->io_tree;
4170 return extent_read_full_page(tree, page, btrfs_get_extent);
4171}
4172
4173static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4174{
4175 struct extent_io_tree *tree;
4176
4177
4178 if (current->flags & PF_MEMALLOC) {
4179 redirty_page_for_writepage(wbc, page);
4180 unlock_page(page);
4181 return 0;
4182 }
4183 tree = &BTRFS_I(page->mapping->host)->io_tree;
4184 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4185}
4186
4187int btrfs_writepages(struct address_space *mapping,
4188 struct writeback_control *wbc)
4189{
4190 struct extent_io_tree *tree;
4191
4192 tree = &BTRFS_I(mapping->host)->io_tree;
4193 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4194}
4195
4196static int
4197btrfs_readpages(struct file *file, struct address_space *mapping,
4198 struct list_head *pages, unsigned nr_pages)
4199{
4200 struct extent_io_tree *tree;
4201 tree = &BTRFS_I(mapping->host)->io_tree;
4202 return extent_readpages(tree, mapping, pages, nr_pages,
4203 btrfs_get_extent);
4204}
4205static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4206{
4207 struct extent_io_tree *tree;
4208 struct extent_map_tree *map;
4209 int ret;
4210
4211 tree = &BTRFS_I(page->mapping->host)->io_tree;
4212 map = &BTRFS_I(page->mapping->host)->extent_tree;
4213 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4214 if (ret == 1) {
4215 ClearPagePrivate(page);
4216 set_page_private(page, 0);
4217 page_cache_release(page);
4218 }
4219 return ret;
4220}
4221
4222static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{
4224 if (PageWriteback(page) || PageDirty(page))
4225 return 0;
4226 return __btrfs_releasepage(page, gfp_flags);
4227}
4228
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4230{
4231 struct extent_io_tree *tree;
4232 struct btrfs_ordered_extent *ordered;
4233 u64 page_start = page_offset(page);
4234 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4235
4236 wait_on_page_writeback(page);
4237 tree = &BTRFS_I(page->mapping->host)->io_tree;
4238 if (offset) {
4239 btrfs_releasepage(page, GFP_NOFS);
4240 return;
4241 }
4242
4243 lock_extent(tree, page_start, page_end, GFP_NOFS);
4244 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4245 page_offset(page));
4246 if (ordered) {
4247 /*
4248 * IO on this page will never be started, so we need
4249 * to account for any ordered extents now
4250 */
4251 clear_extent_bit(tree, page_start, page_end,
4252 EXTENT_DIRTY | EXTENT_DELALLOC |
4253 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4254 btrfs_finish_ordered_io(page->mapping->host,
4255 page_start, page_end);
4256 btrfs_put_ordered_extent(ordered);
4257 lock_extent(tree, page_start, page_end, GFP_NOFS);
4258 }
4259 clear_extent_bit(tree, page_start, page_end,
4260 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261 EXTENT_ORDERED,
4262 1, 1, GFP_NOFS);
4263 __btrfs_releasepage(page, GFP_NOFS);
4264
4265 ClearPageChecked(page);
4266 if (PagePrivate(page)) {
4267 ClearPagePrivate(page);
4268 set_page_private(page, 0);
4269 page_cache_release(page);
4270 }
4271}
4272
4273/*
4274 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4275 * called from a page fault handler when a page is first dirtied. Hence we must
4276 * be careful to check for EOF conditions here. We set the page up correctly
4277 * for a written page which means we get ENOSPC checking when writing into
4278 * holes and correct delalloc and unwritten extent mapping on filesystems that
4279 * support these features.
4280 *
4281 * We are not allowed to take the i_mutex here so we have to play games to
4282 * protect against truncate races as the page could now be beyond EOF. Because
4283 * vmtruncate() writes the inode size before removing pages, once we have the
4284 * page lock we can determine safely if the page is beyond EOF. If it is not
4285 * beyond EOF, then the page is guaranteed safe against truncation until we
4286 * unlock the page.
4287 */
4288int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4289{
4290 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4291 struct btrfs_root *root = BTRFS_I(inode)->root;
4292 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4293 struct btrfs_ordered_extent *ordered;
4294 char *kaddr;
4295 unsigned long zero_start;
4296 loff_t size;
4297 int ret;
4298 u64 page_start;
4299 u64 page_end;
4300
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4302 if (ret)
4303 goto out;
4304
4305 ret = -EINVAL;
4306again:
4307 lock_page(page);
4308 size = i_size_read(inode);
4309 page_start = page_offset(page);
4310 page_end = page_start + PAGE_CACHE_SIZE - 1;
4311
4312 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) {
4314 /* page got truncated out from underneath us */
4315 goto out_unlock;
4316 }
4317 wait_on_page_writeback(page);
4318
4319 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4320 set_page_extent_mapped(page);
4321
4322 /*
4323 * we can't set the delalloc bits if there are pending ordered
4324 * extents. Drop our locks and wait for them to finish
4325 */
4326 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4327 if (ordered) {
4328 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4329 unlock_page(page);
4330 btrfs_start_ordered_extent(inode, ordered, 1);
4331 btrfs_put_ordered_extent(ordered);
4332 goto again;
4333 }
4334
4335 btrfs_set_extent_delalloc(inode, page_start, page_end);
4336 ret = 0;
4337
4338 /* page is wholly or partially inside EOF */
4339 if (page_start + PAGE_CACHE_SIZE > size)
4340 zero_start = size & ~PAGE_CACHE_MASK;
4341 else
4342 zero_start = PAGE_CACHE_SIZE;
4343
4344 if (zero_start != PAGE_CACHE_SIZE) {
4345 kaddr = kmap(page);
4346 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4347 flush_dcache_page(page);
4348 kunmap(page);
4349 }
4350 ClearPageChecked(page);
4351 set_page_dirty(page);
4352 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4353
4354out_unlock:
4355 unlock_page(page);
4356out:
4357 return ret;
4358}
4359
4360static void btrfs_truncate(struct inode *inode)
4361{
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363 int ret;
4364 struct btrfs_trans_handle *trans;
4365 unsigned long nr;
4366 u64 mask = root->sectorsize - 1;
4367
4368 if (!S_ISREG(inode->i_mode))
4369 return;
4370 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4371 return;
4372
4373 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4374 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4375
4376 trans = btrfs_start_transaction(root, 1);
4377 btrfs_set_trans_block_group(trans, inode);
4378 btrfs_i_size_write(inode, inode->i_size);
4379
4380 ret = btrfs_orphan_add(trans, inode);
4381 if (ret)
4382 goto out;
4383 /* FIXME, add redo link to tree so we don't leak on crash */
4384 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4385 BTRFS_EXTENT_DATA_KEY);
4386 btrfs_update_inode(trans, root, inode);
4387
4388 ret = btrfs_orphan_del(trans, inode);
4389 BUG_ON(ret);
4390
4391out:
4392 nr = trans->blocks_used;
4393 ret = btrfs_end_transaction_throttle(trans, root);
4394 BUG_ON(ret);
4395 btrfs_btree_balance_dirty(root, nr);
4396}
4397
4398/*
4399 * create a new subvolume directory/inode (helper for the ioctl).
4400 */
4401int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4402 struct btrfs_root *new_root, struct dentry *dentry,
4403 u64 new_dirid, u64 alloc_hint)
4404{
4405 struct inode *inode;
4406 int error;
4407 u64 index = 0;
4408
4409 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4410 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4411 if (IS_ERR(inode))
4412 return PTR_ERR(inode);
4413 inode->i_op = &btrfs_dir_inode_operations;
4414 inode->i_fop = &btrfs_dir_file_operations;
4415
4416 inode->i_nlink = 1;
4417 btrfs_i_size_write(inode, 0);
4418
4419 error = btrfs_update_inode(trans, new_root, inode);
4420 if (error)
4421 return error;
4422
4423 d_instantiate(dentry, inode);
4424 return 0;
4425}
4426
4427/* helper function for file defrag and space balancing. This
4428 * forces readahead on a given range of bytes in an inode
4429 */
4430unsigned long btrfs_force_ra(struct address_space *mapping,
4431 struct file_ra_state *ra, struct file *file,
4432 pgoff_t offset, pgoff_t last_index)
4433{
4434 pgoff_t req_size = last_index - offset + 1;
4435
4436 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4437 return offset + req_size;
4438}
4439
4440struct inode *btrfs_alloc_inode(struct super_block *sb)
4441{
4442 struct btrfs_inode *ei;
4443
4444 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4445 if (!ei)
4446 return NULL;
4447 ei->last_trans = 0;
4448 ei->logged_trans = 0;
4449 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4450 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4451 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4452 INIT_LIST_HEAD(&ei->i_orphan);
4453 return &ei->vfs_inode;
4454}
4455
4456void btrfs_destroy_inode(struct inode *inode)
4457{
4458 struct btrfs_ordered_extent *ordered;
4459 WARN_ON(!list_empty(&inode->i_dentry));
4460 WARN_ON(inode->i_data.nrpages);
4461
4462 if (BTRFS_I(inode)->i_acl &&
4463 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4464 posix_acl_release(BTRFS_I(inode)->i_acl);
4465 if (BTRFS_I(inode)->i_default_acl &&
4466 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4467 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4468
4469 spin_lock(&BTRFS_I(inode)->root->list_lock);
4470 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4471 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4472 " list\n", inode->i_ino);
4473 dump_stack();
4474 }
4475 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4476
4477 while (1) {
4478 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4479 if (!ordered)
4480 break;
4481 else {
4482 printk(KERN_ERR "btrfs found ordered "
4483 "extent %llu %llu on inode cleanup\n",
4484 (unsigned long long)ordered->file_offset,
4485 (unsigned long long)ordered->len);
4486 btrfs_remove_ordered_extent(inode, ordered);
4487 btrfs_put_ordered_extent(ordered);
4488 btrfs_put_ordered_extent(ordered);
4489 }
4490 }
4491 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4492 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4493}
4494
4495static void init_once(void *foo)
4496{
4497 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4498
4499 inode_init_once(&ei->vfs_inode);
4500}
4501
4502void btrfs_destroy_cachep(void)
4503{
4504 if (btrfs_inode_cachep)
4505 kmem_cache_destroy(btrfs_inode_cachep);
4506 if (btrfs_trans_handle_cachep)
4507 kmem_cache_destroy(btrfs_trans_handle_cachep);
4508 if (btrfs_transaction_cachep)
4509 kmem_cache_destroy(btrfs_transaction_cachep);
4510 if (btrfs_bit_radix_cachep)
4511 kmem_cache_destroy(btrfs_bit_radix_cachep);
4512 if (btrfs_path_cachep)
4513 kmem_cache_destroy(btrfs_path_cachep);
4514}
4515
4516struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4517 unsigned long extra_flags,
4518 void (*ctor)(void *))
4519{
4520 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4521 SLAB_MEM_SPREAD | extra_flags), ctor);
4522}
4523
4524int btrfs_init_cachep(void)
4525{
4526 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4527 sizeof(struct btrfs_inode),
4528 0, init_once);
4529 if (!btrfs_inode_cachep)
4530 goto fail;
4531 btrfs_trans_handle_cachep =
4532 btrfs_cache_create("btrfs_trans_handle_cache",
4533 sizeof(struct btrfs_trans_handle),
4534 0, NULL);
4535 if (!btrfs_trans_handle_cachep)
4536 goto fail;
4537 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4538 sizeof(struct btrfs_transaction),
4539 0, NULL);
4540 if (!btrfs_transaction_cachep)
4541 goto fail;
4542 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4543 sizeof(struct btrfs_path),
4544 0, NULL);
4545 if (!btrfs_path_cachep)
4546 goto fail;
4547 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4548 SLAB_DESTROY_BY_RCU, NULL);
4549 if (!btrfs_bit_radix_cachep)
4550 goto fail;
4551 return 0;
4552fail:
4553 btrfs_destroy_cachep();
4554 return -ENOMEM;
4555}
4556
4557static int btrfs_getattr(struct vfsmount *mnt,
4558 struct dentry *dentry, struct kstat *stat)
4559{
4560 struct inode *inode = dentry->d_inode;
4561 generic_fillattr(inode, stat);
4562 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4563 stat->blksize = PAGE_CACHE_SIZE;
4564 stat->blocks = (inode_get_bytes(inode) +
4565 BTRFS_I(inode)->delalloc_bytes) >> 9;
4566 return 0;
4567}
4568
4569static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4570 struct inode *new_dir, struct dentry *new_dentry)
4571{
4572 struct btrfs_trans_handle *trans;
4573 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4574 struct inode *new_inode = new_dentry->d_inode;
4575 struct inode *old_inode = old_dentry->d_inode;
4576 struct timespec ctime = CURRENT_TIME;
4577 u64 index = 0;
4578 int ret;
4579
4580 /* we're not allowed to rename between subvolumes */
4581 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4582 BTRFS_I(new_dir)->root->root_key.objectid)
4583 return -EXDEV;
4584
4585 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4586 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4587 return -ENOTEMPTY;
4588 }
4589
4590 /* to rename a snapshot or subvolume, we need to juggle the
4591 * backrefs. This isn't coded yet
4592 */
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV;
4595
4596 ret = btrfs_check_free_space(root, 1, 0);
4597 if (ret)
4598 goto out_unlock;
4599
4600 trans = btrfs_start_transaction(root, 1);
4601
4602 btrfs_set_trans_block_group(trans, new_dir);
4603
4604 btrfs_inc_nlink(old_dentry->d_inode);
4605 old_dir->i_ctime = old_dir->i_mtime = ctime;
4606 new_dir->i_ctime = new_dir->i_mtime = ctime;
4607 old_inode->i_ctime = ctime;
4608
4609 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4610 old_dentry->d_name.name,
4611 old_dentry->d_name.len);
4612 if (ret)
4613 goto out_fail;
4614
4615 if (new_inode) {
4616 new_inode->i_ctime = CURRENT_TIME;
4617 ret = btrfs_unlink_inode(trans, root, new_dir,
4618 new_dentry->d_inode,
4619 new_dentry->d_name.name,
4620 new_dentry->d_name.len);
4621 if (ret)
4622 goto out_fail;
4623 if (new_inode->i_nlink == 0) {
4624 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4625 if (ret)
4626 goto out_fail;
4627 }
4628
4629 }
4630 ret = btrfs_set_inode_index(new_dir, &index);
4631 if (ret)
4632 goto out_fail;
4633
4634 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4635 old_inode, new_dentry->d_name.name,
4636 new_dentry->d_name.len, 1, index);
4637 if (ret)
4638 goto out_fail;
4639
4640out_fail:
4641 btrfs_end_transaction_throttle(trans, root);
4642out_unlock:
4643 return ret;
4644}
4645
4646/*
4647 * some fairly slow code that needs optimization. This walks the list
4648 * of all the inodes with pending delalloc and forces them to disk.
4649 */
4650int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4651{
4652 struct list_head *head = &root->fs_info->delalloc_inodes;
4653 struct btrfs_inode *binode;
4654 struct inode *inode;
4655
4656 if (root->fs_info->sb->s_flags & MS_RDONLY)
4657 return -EROFS;
4658
4659 spin_lock(&root->fs_info->delalloc_lock);
4660 while (!list_empty(head)) {
4661 binode = list_entry(head->next, struct btrfs_inode,
4662 delalloc_inodes);
4663 inode = igrab(&binode->vfs_inode);
4664 if (!inode)
4665 list_del_init(&binode->delalloc_inodes);
4666 spin_unlock(&root->fs_info->delalloc_lock);
4667 if (inode) {
4668 filemap_flush(inode->i_mapping);
4669 iput(inode);
4670 }
4671 cond_resched();
4672 spin_lock(&root->fs_info->delalloc_lock);
4673 }
4674 spin_unlock(&root->fs_info->delalloc_lock);
4675
4676 /* the filemap_flush will queue IO into the worker threads, but
4677 * we have to make sure the IO is actually started and that
4678 * ordered extents get created before we return
4679 */
4680 atomic_inc(&root->fs_info->async_submit_draining);
4681 while (atomic_read(&root->fs_info->nr_async_submits) ||
4682 atomic_read(&root->fs_info->async_delalloc_pages)) {
4683 wait_event(root->fs_info->async_submit_wait,
4684 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4685 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4686 }
4687 atomic_dec(&root->fs_info->async_submit_draining);
4688 return 0;
4689}
4690
4691static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4692 const char *symname)
4693{
4694 struct btrfs_trans_handle *trans;
4695 struct btrfs_root *root = BTRFS_I(dir)->root;
4696 struct btrfs_path *path;
4697 struct btrfs_key key;
4698 struct inode *inode = NULL;
4699 int err;
4700 int drop_inode = 0;
4701 u64 objectid;
4702 u64 index = 0 ;
4703 int name_len;
4704 int datasize;
4705 unsigned long ptr;
4706 struct btrfs_file_extent_item *ei;
4707 struct extent_buffer *leaf;
4708 unsigned long nr = 0;
4709
4710 name_len = strlen(symname) + 1;
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG;
4713
4714 err = btrfs_check_free_space(root, 1, 0);
4715 if (err)
4716 goto out_fail;
4717
4718 trans = btrfs_start_transaction(root, 1);
4719 btrfs_set_trans_block_group(trans, dir);
4720
4721 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4722 if (err) {
4723 err = -ENOSPC;
4724 goto out_unlock;
4725 }
4726
4727 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4728 dentry->d_name.len,
4729 dentry->d_parent->d_inode->i_ino, objectid,
4730 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4731 &index);
4732 err = PTR_ERR(inode);
4733 if (IS_ERR(inode))
4734 goto out_unlock;
4735
4736 err = btrfs_init_acl(inode, dir);
4737 if (err) {
4738 drop_inode = 1;
4739 goto out_unlock;
4740 }
4741
4742 btrfs_set_trans_block_group(trans, inode);
4743 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4744 if (err)
4745 drop_inode = 1;
4746 else {
4747 inode->i_mapping->a_ops = &btrfs_aops;
4748 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4749 inode->i_fop = &btrfs_file_operations;
4750 inode->i_op = &btrfs_file_inode_operations;
4751 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4752 }
4753 dir->i_sb->s_dirt = 1;
4754 btrfs_update_inode_block_group(trans, inode);
4755 btrfs_update_inode_block_group(trans, dir);
4756 if (drop_inode)
4757 goto out_unlock;
4758
4759 path = btrfs_alloc_path();
4760 BUG_ON(!path);
4761 key.objectid = inode->i_ino;
4762 key.offset = 0;
4763 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4764 datasize = btrfs_file_extent_calc_inline_size(name_len);
4765 err = btrfs_insert_empty_item(trans, root, path, &key,
4766 datasize);
4767 if (err) {
4768 drop_inode = 1;
4769 goto out_unlock;
4770 }
4771 leaf = path->nodes[0];
4772 ei = btrfs_item_ptr(leaf, path->slots[0],
4773 struct btrfs_file_extent_item);
4774 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4775 btrfs_set_file_extent_type(leaf, ei,
4776 BTRFS_FILE_EXTENT_INLINE);
4777 btrfs_set_file_extent_encryption(leaf, ei, 0);
4778 btrfs_set_file_extent_compression(leaf, ei, 0);
4779 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4780 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4781
4782 ptr = btrfs_file_extent_inline_start(ei);
4783 write_extent_buffer(leaf, symname, ptr, name_len);
4784 btrfs_mark_buffer_dirty(leaf);
4785 btrfs_free_path(path);
4786
4787 inode->i_op = &btrfs_symlink_inode_operations;
4788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4790 inode_set_bytes(inode, name_len);
4791 btrfs_i_size_write(inode, name_len - 1);
4792 err = btrfs_update_inode(trans, root, inode);
4793 if (err)
4794 drop_inode = 1;
4795
4796out_unlock:
4797 nr = trans->blocks_used;
4798 btrfs_end_transaction_throttle(trans, root);
4799out_fail:
4800 if (drop_inode) {
4801 inode_dec_link_count(inode);
4802 iput(inode);
4803 }
4804 btrfs_btree_balance_dirty(root, nr);
4805 return err;
4806}
4807
4808static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4809 u64 alloc_hint, int mode)
4810{
4811 struct btrfs_trans_handle *trans;
4812 struct btrfs_root *root = BTRFS_I(inode)->root;
4813 struct btrfs_key ins;
4814 u64 alloc_size;
4815 u64 cur_offset = start;
4816 u64 num_bytes = end - start;
4817 int ret = 0;
4818
4819 trans = btrfs_join_transaction(root, 1);
4820 BUG_ON(!trans);
4821 btrfs_set_trans_block_group(trans, inode);
4822
4823 while (num_bytes > 0) {
4824 alloc_size = min(num_bytes, root->fs_info->max_extent);
4825 ret = btrfs_reserve_extent(trans, root, alloc_size,
4826 root->sectorsize, 0, alloc_hint,
4827 (u64)-1, &ins, 1);
4828 if (ret) {
4829 WARN_ON(1);
4830 goto out;
4831 }
4832 ret = insert_reserved_file_extent(trans, inode,
4833 cur_offset, ins.objectid,
4834 ins.offset, ins.offset,
4835 ins.offset, 0, 0, 0,
4836 BTRFS_FILE_EXTENT_PREALLOC);
4837 BUG_ON(ret);
4838 num_bytes -= ins.offset;
4839 cur_offset += ins.offset;
4840 alloc_hint = ins.objectid + ins.offset;
4841 }
4842out:
4843 if (cur_offset > start) {
4844 inode->i_ctime = CURRENT_TIME;
4845 btrfs_set_flag(inode, PREALLOC);
4846 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4847 cur_offset > i_size_read(inode))
4848 btrfs_i_size_write(inode, cur_offset);
4849 ret = btrfs_update_inode(trans, root, inode);
4850 BUG_ON(ret);
4851 }
4852
4853 btrfs_end_transaction(trans, root);
4854 return ret;
4855}
4856
4857static long btrfs_fallocate(struct inode *inode, int mode,
4858 loff_t offset, loff_t len)
4859{
4860 u64 cur_offset;
4861 u64 last_byte;
4862 u64 alloc_start;
4863 u64 alloc_end;
4864 u64 alloc_hint = 0;
4865 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4866 struct extent_map *em;
4867 int ret;
4868
4869 alloc_start = offset & ~mask;
4870 alloc_end = (offset + len + mask) & ~mask;
4871
4872 mutex_lock(&inode->i_mutex);
4873 if (alloc_start > inode->i_size) {
4874 ret = btrfs_cont_expand(inode, alloc_start);
4875 if (ret)
4876 goto out;
4877 }
4878
4879 while (1) {
4880 struct btrfs_ordered_extent *ordered;
4881 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4882 alloc_end - 1, GFP_NOFS);
4883 ordered = btrfs_lookup_first_ordered_extent(inode,
4884 alloc_end - 1);
4885 if (ordered &&
4886 ordered->file_offset + ordered->len > alloc_start &&
4887 ordered->file_offset < alloc_end) {
4888 btrfs_put_ordered_extent(ordered);
4889 unlock_extent(&BTRFS_I(inode)->io_tree,
4890 alloc_start, alloc_end - 1, GFP_NOFS);
4891 btrfs_wait_ordered_range(inode, alloc_start,
4892 alloc_end - alloc_start);
4893 } else {
4894 if (ordered)
4895 btrfs_put_ordered_extent(ordered);
4896 break;
4897 }
4898 }
4899
4900 cur_offset = alloc_start;
4901 while (1) {
4902 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4903 alloc_end - cur_offset, 0);
4904 BUG_ON(IS_ERR(em) || !em);
4905 last_byte = min(extent_map_end(em), alloc_end);
4906 last_byte = (last_byte + mask) & ~mask;
4907 if (em->block_start == EXTENT_MAP_HOLE) {
4908 ret = prealloc_file_range(inode, cur_offset,
4909 last_byte, alloc_hint, mode);
4910 if (ret < 0) {
4911 free_extent_map(em);
4912 break;
4913 }
4914 }
4915 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4916 alloc_hint = em->block_start;
4917 free_extent_map(em);
4918
4919 cur_offset = last_byte;
4920 if (cur_offset >= alloc_end) {
4921 ret = 0;
4922 break;
4923 }
4924 }
4925 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4926 GFP_NOFS);
4927out:
4928 mutex_unlock(&inode->i_mutex);
4929 return ret;
4930}
4931
/* set_page_dirty hook: defers entirely to __set_page_dirty_nobuffers(). */
static int btrfs_set_page_dirty(struct page *page)
{
	int was_set = __set_page_dirty_nobuffers(page);

	return was_set;
}
4936
4937static int btrfs_permission(struct inode *inode, int mask)
4938{
4939 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4940 return -EACCES;
4941 return generic_permission(inode, mask, btrfs_check_acl);
4942}
4943
4944static struct inode_operations btrfs_dir_inode_operations = {
4945 .getattr = btrfs_getattr,
4946 .lookup = btrfs_lookup,
4947 .create = btrfs_create,
4948 .unlink = btrfs_unlink,
4949 .link = btrfs_link,
4950 .mkdir = btrfs_mkdir,
4951 .rmdir = btrfs_rmdir,
4952 .rename = btrfs_rename,
4953 .symlink = btrfs_symlink,
4954 .setattr = btrfs_setattr,
4955 .mknod = btrfs_mknod,
4956 .setxattr = btrfs_setxattr,
4957 .getxattr = btrfs_getxattr,
4958 .listxattr = btrfs_listxattr,
4959 .removexattr = btrfs_removexattr,
4960 .permission = btrfs_permission,
4961};
4962static struct inode_operations btrfs_dir_ro_inode_operations = {
4963 .lookup = btrfs_lookup,
4964 .permission = btrfs_permission,
4965};
4966static struct file_operations btrfs_dir_file_operations = {
4967 .llseek = generic_file_llseek,
4968 .read = generic_read_dir,
4969 .readdir = btrfs_real_readdir,
4970 .unlocked_ioctl = btrfs_ioctl,
4971#ifdef CONFIG_COMPAT
4972 .compat_ioctl = btrfs_ioctl,
4973#endif
4974 .release = btrfs_release_file,
4975 .fsync = btrfs_sync_file,
4976};
4977
4978static struct extent_io_ops btrfs_extent_io_ops = {
4979 .fill_delalloc = run_delalloc_range,
4980 .submit_bio_hook = btrfs_submit_bio_hook,
4981 .merge_bio_hook = btrfs_merge_bio_hook,
4982 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4983 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4984 .writepage_start_hook = btrfs_writepage_start_hook,
4985 .readpage_io_failed_hook = btrfs_io_failed_hook,
4986 .set_bit_hook = btrfs_set_bit_hook,
4987 .clear_bit_hook = btrfs_clear_bit_hook,
4988};
4989
4990static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage,
5000 .set_page_dirty = btrfs_set_page_dirty,
5001};
5002
5003static struct address_space_operations btrfs_symlink_aops = {
5004 .readpage = btrfs_readpage,
5005 .writepage = btrfs_writepage,
5006 .invalidatepage = btrfs_invalidatepage,
5007 .releasepage = btrfs_releasepage,
5008};
5009
5010static struct inode_operations btrfs_file_inode_operations = {
5011 .truncate = btrfs_truncate,
5012 .getattr = btrfs_getattr,
5013 .setattr = btrfs_setattr,
5014 .setxattr = btrfs_setxattr,
5015 .getxattr = btrfs_getxattr,
5016 .listxattr = btrfs_listxattr,
5017 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate,
5020};
5021static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr,
5023 .setattr = btrfs_setattr,
5024 .permission = btrfs_permission,
5025 .setxattr = btrfs_setxattr,
5026 .getxattr = btrfs_getxattr,
5027 .listxattr = btrfs_listxattr,
5028 .removexattr = btrfs_removexattr,
5029};
5030static struct inode_operations btrfs_symlink_inode_operations = {
5031 .readlink = generic_readlink,
5032 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link,
5034 .permission = btrfs_permission,
5035};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create in fs/namei.c() */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283 * this VFS fuzz.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311 * to see if is references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072
26
/*
 * Argument block for the ioctls below that are declared with
 * struct btrfs_ioctl_vol_args.
 *
 * fd:   file descriptor; the snapshot-create handler in ioctl.c passes it
 *       to fget() to find the file whose root is snapshotted.
 * name: name or path string.  The array is one byte larger than
 *       BTRFS_PATH_NAME_MAX and the handlers in ioctl.c store a '\0' at
 *       index BTRFS_PATH_NAME_MAX before using it.
 */
27struct btrfs_ioctl_vol_args {
28 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
/*
 * Argument block for BTRFS_IOC_CLONE_RANGE; the handler in ioctl.c forwards
 * the fields unchanged to btrfs_ioctl_clone().
 *
 * src_fd:      file descriptor of the file to clone from
 * src_offset:  start of the range in the source file
 * src_length:  length of the range; btrfs_ioctl_clone() treats 0 as
 *              "up to the end of the source file"
 * dest_offset: start of the range in the destination file
 */
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but, anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_isize if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one.
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
/*
 * Bit numbers for btrfs_ordered_extent->flags (used with test_bit() and
 * friends):
 *
 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
 * It is used to make sure metadata is inserted into the tree only once
 * per extent.
 *
 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
 * rbtree, just before waking any waiters.  It is used to indicate the
 * IO is done and any metadata is inserted into the tree.
 */

/* set when all the pages are written */
#define BTRFS_ORDERED_IO_DONE 0

/* set when removed from the tree */
#define BTRFS_ORDERED_COMPLETE 1

/* set when we want to write in place */
#define BTRFS_ORDERED_NOCOW 2

/* writing a compressed extent */
#define BTRFS_ORDERED_COMPRESSED 3

/* set when writing to prealloced extent */
#define BTRFS_ORDERED_PREALLOC 4
74
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int tyep);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 };
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
/* dump a single leaf to the kernel log */
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);

/* dump the block 't' and everything below it to the kernel log */
void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
/*
 * one entry of the extents[] array that trails struct btrfs_leaf_ref
 */
struct btrfs_extent_info {
	/* bytenr and num_bytes find the extent in the extent allocation tree */
	u64 bytenr;
	u64 num_bytes;

	/* objectid and offset find the back reference for the file */
	u64 objectid;
	u64 offset;
};
30
/*
 * a cached leaf reference.  These live in a struct btrfs_leaf_ref_tree,
 * indexed by bytenr, and carry a variable number of extent records at
 * the end (see btrfs_leaf_ref_size()).
 */
struct btrfs_leaf_ref {
	/* links this ref into tree->root */
	struct rb_node rb_node;
	/* the tree this ref was inserted into, set by btrfs_add_leaf_ref */
	struct btrfs_leaf_ref_tree *tree;
	/* 1 while linked into 'tree'; cleared with xchg on removal */
	int in_tree;
	/* reference count, bumped by lookups and by insertion into a tree */
	atomic_t usage;

	u64 root_gen;
	/* key used for rb-tree insertion and lookup */
	u64 bytenr;
	u64 owner;
	u64 generation;
	/* NOTE(review): presumably the number of entries in extents[] - confirm */
	int nritems;

	/* links this ref onto tree->list */
	struct list_head list;
	struct btrfs_extent_info extents[];
};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
/*
 * NOTE(review): btrfs_leaf_ref_tree_init is already defined as a static
 * inline earlier in this header, so the declaration below looks redundant.
 */
void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
/* allocation and release of leaf refs */
struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
					    int nr_extents);
void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
/* lookup by bytenr; a found ref comes back with its usage count bumped */
struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
					     u64 bytenr);
/* insertion into / removal from the per-root or shared ref tree */
int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
		       int shared);
int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
			   int shared);
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
#if 0 /* this will get used when snapshot deletion is implemented */
/*
 * delete the root ref item keyed by (root_id, type, ref_id) from
 * 'tree_root'.  Both the lookup and the delete BUG on any non-zero result.
 *
 * NOTE(review): the btrfs_alloc_path() result is not checked here; that
 * needs fixing before this code is enabled.
 */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
		       struct btrfs_root *tree_root,
		       u64 root_id, u8 type, u64 ref_id)
{
	struct btrfs_key key;
	int ret;
	struct btrfs_path *path;

	path = btrfs_alloc_path();

	key.objectid = root_id;
	key.type = type;
	key.offset = ref_id;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	BUG_ON(ret);

	ret = btrfs_del_item(trans, tree_root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return ret;
}
#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
/* this is some deeply nasty code.  ctree.h has a different
 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
 *
 * The end result is that anyone who #includes ctree.h gets a
 * declaration for the btrfs_set_foo functions and btrfs_foo functions
 *
 * This file declares the macros and then #includes ctree.h, which results
 * in cpp creating the function here based on the template below.
 *
 * These setget functions do all the extent_buffer related mapping
 * required to efficiently read and write specific fields in the extent
 * buffers.  Every pointer to metadata items in btrfs is really just
 * an unsigned long offset into the extent buffer which has been
 * cast to a specific type.  This gives us all the gcc type checking.
 *
 * The extent buffer api is used to do all the kmapping and page
 * spanning work required to get extent buffers in highmem and have
 * a metadata blocksize different from the page size.
 *
 * The macro starts with a simple function prototype declaration so that
 * sparse won't complain about it being static.
 *
 * Both generated functions work the same way:
 *  - fast path: if the extent buffer already has a mapping (eb->map_token)
 *    that covers the whole member, access it through eb->kaddr
 *  - otherwise map the range with map_extent_buffer() and access the
 *    member through that mapping; the mapping is dropped again only if the
 *    buffer had no map_token on entry
 *  - if the mapping fails, fall back to read_eb_member/write_eb_member
 *
 * Values are converted between cpu and little endian byte order on the way
 * in and out.
 */

#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);	\
u##bits btrfs_##name(struct extent_buffer *eb,				\
				   type *s)				\
{									\
	unsigned long part_offset = (unsigned long)s;			\
	unsigned long offset = part_offset + offsetof(type, member);	\
	type *p;							\
	/* ugly, but we want the fast path here */			\
	if (eb->map_token && offset >= eb->map_start &&			\
	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
	    eb->map_len) {						\
		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
		return le##bits##_to_cpu(p->member);			\
	}								\
	{								\
		int err;						\
		char *map_token;					\
		char *kaddr;						\
		int unmap_on_exit = (eb->map_token == NULL);		\
		unsigned long map_start;				\
		unsigned long map_len;					\
		u##bits res;						\
		err = map_extent_buffer(eb, offset,			\
				sizeof(((type *)0)->member),		\
				&map_token, &kaddr,			\
				&map_start, &map_len, KM_USER1);	\
		if (err) {						\
			__le##bits leres;				\
			read_eb_member(eb, s, type, member, &leres);	\
			return le##bits##_to_cpu(leres);		\
		}							\
		p = (type *)(kaddr + part_offset - map_start);		\
		res = le##bits##_to_cpu(p->member);			\
		if (unmap_on_exit)					\
			unmap_extent_buffer(eb, map_token, KM_USER1);	\
		return res;						\
	}								\
}									\
void btrfs_set_##name(struct extent_buffer *eb,				\
				    type *s, u##bits val)		\
{									\
	unsigned long part_offset = (unsigned long)s;			\
	unsigned long offset = part_offset + offsetof(type, member);	\
	type *p;							\
	/* ugly, but we want the fast path here */			\
	if (eb->map_token && offset >= eb->map_start &&			\
	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
	    eb->map_len) {						\
		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
		p->member = cpu_to_le##bits(val);			\
		return;							\
	}								\
	{								\
		int err;						\
		char *map_token;					\
		char *kaddr;						\
		int unmap_on_exit = (eb->map_token == NULL);		\
		unsigned long map_start;				\
		unsigned long map_len;					\
		err = map_extent_buffer(eb, offset,			\
				sizeof(((type *)0)->member),		\
				&map_token, &kaddr,			\
				&map_start, &map_len, KM_USER1);	\
		if (err) {						\
			__le##bits val2;				\
			val2 = cpu_to_le##bits(val);			\
			write_eb_member(eb, s, type, member, &val2);	\
			return;						\
		}							\
		p = (type *)(kaddr + part_offset - map_start);		\
		p->member = cpu_to_le##bits(val);			\
		if (unmap_on_exit)					\
			unmap_extent_buffer(eb, map_token, KM_USER1);	\
	}								\
}
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..b4c101d9322c
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,720 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "xattr.h"
49#include "volumes.h"
50#include "version.h"
51#include "export.h"
52#include "compression.h"
53
54#define BTRFS_SUPER_MAGIC 0x9123683E
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
/*
 * one token per mount option recognised by the match table that follows;
 * Opt_err is the value carried by the table's terminating entry
 */
enum {
	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
	Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
};
72
/*
 * mount option patterns handed to match_token().  "%s" options carry a
 * string argument and "%d" an integer one; the NULL pattern ends the table.
 */
static match_table_t tokens = {
	{Opt_degraded, "degraded"},
	{Opt_subvol, "subvol=%s"},
	{Opt_device, "device=%s"},
	{Opt_nodatasum, "nodatasum"},
	{Opt_nodatacow, "nodatacow"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_max_extent, "max_extent=%s"},
	{Opt_max_inline, "max_inline=%s"},
	{Opt_alloc_start, "alloc_start=%s"},
	{Opt_thread_pool, "thread_pool=%d"},
	{Opt_compress, "compress"},
	{Opt_ssd, "ssd"},
	{Opt_noacl, "noacl"},
	{Opt_err, NULL},
};
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoul(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
105 case 'm':
106 mult *= 1024;
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatacsum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options will be parsed on much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
318 printk("btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
323 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
324 tree_root->fs_info->fs_root);
325 bi = BTRFS_I(inode);
326 bi->location.objectid = inode->i_ino;
327 bi->location.offset = 0;
328 bi->root = tree_root->fs_info->fs_root;
329
330 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
331
332 if (!inode) {
333 err = -ENOMEM;
334 goto fail_close;
335 }
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = 0;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
600 &btrfs_fs_type, &fs_devices);
601 break;
602 }
603out:
604 kfree(vol);
605 return ret;
606}
607
608static void btrfs_write_super_lockfs(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_lock(&root->fs_info->transaction_kthread_mutex);
612 mutex_lock(&root->fs_info->cleaner_mutex);
613}
614
615static void btrfs_unlockfs(struct super_block *sb)
616{
617 struct btrfs_root *root = btrfs_sb(sb);
618 mutex_unlock(&root->fs_info->cleaner_mutex);
619 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
620}
621
622static struct super_operations btrfs_super_ops = {
623 .delete_inode = btrfs_delete_inode,
624 .put_super = btrfs_put_super,
625 .write_super = btrfs_write_super,
626 .sync_fs = btrfs_sync_fs,
627 .show_options = generic_show_options,
628 .write_inode = btrfs_write_inode,
629 .dirty_inode = btrfs_dirty_inode,
630 .alloc_inode = btrfs_alloc_inode,
631 .destroy_inode = btrfs_destroy_inode,
632 .statfs = btrfs_statfs,
633 .remount_fs = btrfs_remount,
634 .write_super_lockfs = btrfs_write_super_lockfs,
635 .unlockfs = btrfs_unlockfs,
636};
637
638static const struct file_operations btrfs_ctl_fops = {
639 .unlocked_ioctl = btrfs_control_ioctl,
640 .compat_ioctl = btrfs_control_ioctl,
641 .owner = THIS_MODULE,
642};
643
644static struct miscdevice btrfs_misc = {
645 .minor = MISC_DYNAMIC_MINOR,
646 .name = "btrfs-control",
647 .fops = &btrfs_ctl_fops
648};
649
650static int btrfs_interface_init(void)
651{
652 return misc_register(&btrfs_misc);
653}
654
655static void btrfs_interface_exit(void)
656{
657 if (misc_deregister(&btrfs_misc) < 0)
658 printk(KERN_INFO "misc_deregister failed for control device");
659}
660
661static int __init init_btrfs_fs(void)
662{
663 int err;
664
665 err = btrfs_init_sysfs();
666 if (err)
667 return err;
668
669 err = btrfs_init_cachep();
670 if (err)
671 goto free_sysfs;
672
673 err = extent_io_init();
674 if (err)
675 goto free_cachep;
676
677 err = extent_map_init();
678 if (err)
679 goto free_extent_io;
680
681 err = btrfs_interface_init();
682 if (err)
683 goto free_extent_map;
684
685 err = register_filesystem(&btrfs_fs_type);
686 if (err)
687 goto unregister_ioctl;
688
689 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
690 return 0;
691
692unregister_ioctl:
693 btrfs_interface_exit();
694free_extent_map:
695 extent_map_exit();
696free_extent_io:
697 extent_io_exit();
698free_cachep:
699 btrfs_destroy_cachep();
700free_sysfs:
701 btrfs_exit_sysfs();
702 return err;
703}
704
/*
 * Module unload.  Tears down everything init_btrfs_fs() set up, and also
 * calls btrfs_cleanup_fs_uuids() and btrfs_zlib_exit(), which have no
 * matching call in init_btrfs_fs().
 *
 * NOTE(review): the teardown order here is not the reverse of the setup
 * order in init_btrfs_fs() (the caches go first, the filesystem type is
 * unregistered later) -- confirm this is intentional.
 */
705static void __exit exit_btrfs_fs(void)
706{
707 btrfs_destroy_cachep();
708 extent_map_exit();
709 extent_io_exit();
710 btrfs_interface_exit();
711 unregister_filesystem(&btrfs_fs_type);
712 btrfs_exit_sysfs();
713 btrfs_cleanup_fs_uuids();
714 btrfs_zlib_exit();
715}
716
717module_init(init_btrfs_fs)
718module_exit(exit_btrfs_fs)
719
720MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
/* unregister the "btrfs" kset created by btrfs_init_sysfs() */
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
31#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
83
84/*
85 * this does all the record keeping required to make sure that a reference
86 * counted root is properly recorded in a given transaction. This is required
87 * to make sure the old root from before we joined the transaction is deleted
88 * when the transaction commits
89 */
90noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
91{
92 struct btrfs_dirty_root *dirty;
93 u64 running_trans_id = root->fs_info->running_transaction->transid;
94 if (root->ref_cows && root->last_trans < running_trans_id) {
95 WARN_ON(root == root->fs_info->extent_root);
96 if (root->root_item.refs != 0) {
97 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
98 (unsigned long)root->root_key.objectid,
99 BTRFS_ROOT_TRANS_TAG);
100
101 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
102 BUG_ON(!dirty);
103 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
104 BUG_ON(!dirty->root);
105 dirty->latest_root = root;
106 INIT_LIST_HEAD(&dirty->list);
107
108 root->commit_root = btrfs_root_node(root);
109
110 memcpy(dirty->root, root, sizeof(*root));
111 spin_lock_init(&dirty->root->node_lock);
112 spin_lock_init(&dirty->root->list_lock);
113 mutex_init(&dirty->root->objectid_mutex);
114 mutex_init(&dirty->root->log_mutex);
115 INIT_LIST_HEAD(&dirty->root->dead_list);
116 dirty->root->node = root->commit_root;
117 dirty->root->commit_root = NULL;
118
119 spin_lock(&root->list_lock);
120 list_add(&dirty->root->dead_list, &root->dead_list);
121 spin_unlock(&root->list_lock);
122
123 root->dirty_root = dirty;
124 } else {
125 WARN_ON(1);
126 }
127 root->last_trans = running_trans_id;
128 }
129 return 0;
130}
131
132/* wait for commit against the current transaction to become unblocked
133 * when this is done, it is safe to start a new transaction, but the current
134 * transaction might not be fully on disk.
135 */
136static void wait_current_trans(struct btrfs_root *root)
137{
138 struct btrfs_transaction *cur_trans;
139
140 cur_trans = root->fs_info->running_transaction;
141 if (cur_trans && cur_trans->blocked) {
142 DEFINE_WAIT(wait);
143 cur_trans->use_count++;
144 while (1) {
145 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
146 TASK_UNINTERRUPTIBLE);
147 if (cur_trans->blocked) {
148 mutex_unlock(&root->fs_info->trans_mutex);
149 schedule();
150 mutex_lock(&root->fs_info->trans_mutex);
151 finish_wait(&root->fs_info->transaction_wait,
152 &wait);
153 } else {
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 break;
157 }
158 }
159 put_transaction(cur_trans);
160 }
161}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
/*
 * Start a transaction handle with wait mode 1: start_transaction() waits
 * for a blocked running transaction first, unless
 * fs_info->open_ioctl_trans is set or log recovery is in progress.
 */
190struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
191 int num_blocks)
192{
193 return start_transaction(root, num_blocks, 1);
194}
/*
 * Start a transaction handle with wait mode 0: start_transaction() joins
 * the running transaction without waiting for it to become unblocked.
 */
195struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
196 int num_blocks)
197{
198 return start_transaction(root, num_blocks, 0);
199}
200
/*
 * Start a transaction handle with wait mode 2: start_transaction() waits
 * for a blocked running transaction regardless of
 * fs_info->open_ioctl_trans (but not during log recovery).
 */
201struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
202 int num_blocks)
203{
204 return start_transaction(r, num_blocks, 2);
205}
206
207/* wait for a transaction commit to be fully complete */
208static noinline int wait_for_commit(struct btrfs_root *root,
209 struct btrfs_transaction *commit)
210{
211 DEFINE_WAIT(wait);
212 mutex_lock(&root->fs_info->trans_mutex);
213 while (!commit->commit_done) {
214 prepare_to_wait(&commit->commit_wait, &wait,
215 TASK_UNINTERRUPTIBLE);
216 if (commit->commit_done)
217 break;
218 mutex_unlock(&root->fs_info->trans_mutex);
219 schedule();
220 mutex_lock(&root->fs_info->trans_mutex);
221 }
222 mutex_unlock(&root->fs_info->trans_mutex);
223 finish_wait(&commit->commit_wait, &wait);
224 return 0;
225}
226
227/*
228 * rate limit against the drop_snapshot code. This helps to slow down new
229 * operations if the drop_snapshot code isn't able to keep up.
230 */
231static void throttle_on_drops(struct btrfs_root *root)
232{
233 struct btrfs_fs_info *info = root->fs_info;
/* number of completed wait rounds; bounds the retries below */
234 int harder_count = 0;
235
236harder:
/* only throttle while info->throttles is non-zero */
237 if (atomic_read(&info->throttles)) {
238 DEFINE_WAIT(wait);
239 int thr;
/* sample throttle_gen; the wait loop ends once it has changed */
240 thr = atomic_read(&info->throttle_gen);
241
242 do {
243 prepare_to_wait(&info->transaction_throttle,
244 &wait, TASK_UNINTERRUPTIBLE);
/* throttles dropped to zero: stop waiting */
245 if (!atomic_read(&info->throttles)) {
246 finish_wait(&info->transaction_throttle, &wait);
247 break;
248 }
249 schedule();
250 finish_wait(&info->transaction_throttle, &wait);
251 } while (thr == atomic_read(&info->throttle_gen));
252 harder_count++;
253
/*
 * The larger total_ref_cache_size is, the more wait rounds are allowed:
 * above 1MB up to 2 rounds in total, above 5MB up to 10, above 10MB
 * up to 20.
 */
254 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
255 harder_count < 2)
256 goto harder;
257
258 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
259 harder_count < 10)
260 goto harder;
261
262 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
263 harder_count < 20)
264 goto harder;
265 }
266}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
279 struct btrfs_root *root, int throttle)
280{
281 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info;
283
284 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction;
286 WARN_ON(cur_trans != trans->transaction);
287 WARN_ON(cur_trans->num_writers < 1);
288 cur_trans->num_writers--;
289
290 if (waitqueue_active(&cur_trans->writer_wait))
291 wake_up(&cur_trans->writer_wait);
292 put_transaction(cur_trans);
293 mutex_unlock(&info->trans_mutex);
294 memset(trans, 0, sizeof(*trans));
295 kmem_cache_free(btrfs_trans_handle_cachep, trans);
296
297 if (throttle)
298 throttle_on_drops(root);
299
300 return 0;
301}
302
/* end a transaction handle without throttling against snapshot drops */
303int btrfs_end_transaction(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root)
305{
306 return __btrfs_end_transaction(trans, root, 0);
307}
308
/* end a transaction handle, then throttle against snapshot drops */
309int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
310 struct btrfs_root *root)
311{
312 return __btrfs_end_transaction(trans, root, 1);
313}
314
315/*
316 * when btree blocks are allocated, they have some corresponding bits set for
317 * them in one of two extent_io trees. This is used to make sure all of
318 * those extents are on disk for transaction or log commit
319 */
/*
 * Two passes over the EXTENT_DIRTY ranges of @dirty_pages, looking the
 * pages up in the btree inode's mapping:
 *   pass 1 starts writeback on every page found in a marked range;
 *   pass 2 clears the dirty bits range by range, rewrites pages that are
 *          dirty again and waits for writeback on each page.
 * Returns 0, or the most recent non-zero write_one_page() error.
 */
320int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
321 struct extent_io_tree *dirty_pages)
322{
323 int ret;
324 int err = 0;
325 int werr = 0;
326 struct page *page;
327 struct inode *btree_inode = root->fs_info->btree_inode;
328 u64 start = 0;
329 u64 end;
330 unsigned long index;
331
/* pass 1: the search resumes from wherever the previous range ended */
332 while (1) {
333 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
334 EXTENT_DIRTY);
335 if (ret)
336 break;
337 while (start <= end) {
338 cond_resched();
339
/* advance start to the next page boundary before any 'continue' */
340 index = start >> PAGE_CACHE_SHIFT;
341 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
342 page = find_get_page(btree_inode->i_mapping, index);
343 if (!page)
344 continue;
345
346 btree_lock_page_hook(page);
/* page no longer belongs to a mapping: skip it */
347 if (!page->mapping) {
348 unlock_page(page);
349 page_cache_release(page);
350 continue;
351 }
352
/*
 * Under writeback: if it was dirtied again, wait so it can be written
 * once more; if it is clean, the write in flight is enough.
 */
353 if (PageWriteback(page)) {
354 if (PageDirty(page))
355 wait_on_page_writeback(page);
356 else {
357 unlock_page(page);
358 page_cache_release(page);
359 continue;
360 }
361 }
/* presumably write_one_page() unlocks the page -- TODO confirm */
362 err = write_one_page(page, 0);
363 if (err)
364 werr = err;
365 page_cache_release(page);
366 }
367 }
/*
 * pass 2: always search from offset 0; progress comes from clearing the
 * dirty bits of each range as it is found
 */
368 while (1) {
369 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
370 EXTENT_DIRTY);
371 if (ret)
372 break;
373
374 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
375 while (start <= end) {
376 index = start >> PAGE_CACHE_SHIFT;
377 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
378 page = find_get_page(btree_inode->i_mapping, index);
379 if (!page)
380 continue;
/* the page is only locked here when it is found dirty */
381 if (PageDirty(page)) {
382 btree_lock_page_hook(page);
383 wait_on_page_writeback(page);
384 err = write_one_page(page, 0);
385 if (err)
386 werr = err;
387 }
388 wait_on_page_writeback(page);
389 page_cache_release(page);
390 cond_resched();
391 }
392 }
/*
 * NOTE(review): err is only ever set from write_one_page(), and every
 * non-zero value was already copied into werr above, so this check adds
 * no new information.
 */
393 if (err)
394 werr = err;
395 return werr;
396}
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
410/*
411 * this is used to update the root pointer in the tree of tree roots.
412 *
413 * But, in the case of the extent allocation tree, updating the root
414 * pointer may allocate blocks which may change the root of the extent
415 * allocation tree.
416 *
417 * So, this loops and repeats and makes sure the cowonly root didn't
418 * change while the root pointer was being updated in the metadata.
419 */
420static int update_cowonly_root(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 int ret;
424 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root;
426
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root);
430
431 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
433 if (old_root_bytenr == root->node->start)
434 break;
435 btrfs_set_root_bytenr(&root->root_item,
436 root->node->start);
437 btrfs_set_root_level(&root->root_item,
438 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid);
440
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key,
445 &root->root_item);
446 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root);
449 }
450 return 0;
451}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
503/*
504 * at transaction commit time we need to schedule the old roots for
505 * deletion via btrfs_drop_snapshot. This runs through all the
506 * reference counted roots that were modified in the current
507 * transaction and puts them into the drop list
508 */
509static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
510 struct radix_tree_root *radix,
511 struct list_head *list)
512{
513 struct btrfs_dirty_root *dirty;
514 struct btrfs_root *gang[8];
515 struct btrfs_root *root;
516 int i;
517 int ret;
518 int err = 0;
519 u32 refs;
520
/*
 * Every lookup starts at index 0; the loop makes progress because the
 * tag of each root is cleared as soon as it is picked up.
 */
521 while (1) {
522 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
523 ARRAY_SIZE(gang),
524 BTRFS_ROOT_TRANS_TAG);
525 if (ret == 0)
526 break;
527 for (i = 0; i < ret; i++) {
528 root = gang[i];
529 radix_tree_tag_clear(radix,
530 (unsigned long)root->root_key.objectid,
531 BTRFS_ROOT_TRANS_TAG);
532
533 BUG_ON(!root->ref_tree);
534 dirty = root->dirty_root;
535
536 btrfs_free_log(trans, root);
537 btrfs_free_reloc_root(trans, root);
538
/*
 * The root node is still the commit root: the dirty-root copy is not
 * needed, so unlink and free it and just refresh the root item.
 */
539 if (root->commit_root == root->node) {
540 WARN_ON(root->node->start !=
541 btrfs_root_bytenr(&root->root_item));
542
543 free_extent_buffer(root->commit_root);
544 root->commit_root = NULL;
545 root->dirty_root = NULL;
546
547 spin_lock(&root->list_lock);
548 list_del_init(&dirty->root->dead_list);
549 spin_unlock(&root->list_lock);
550
551 kfree(dirty->root);
552 kfree(dirty);
553
554 /* make sure to update the root on disk
555 * so we get any updates to the block used
556 * counts
557 */
/*
 * NOTE(review): err is not checked before 'continue'; a later iteration
 * can overwrite it, so a failure here may never reach the caller.
 */
558 err = btrfs_update_root(trans,
559 root->fs_info->tree_root,
560 &root->root_key,
561 &root->root_item);
562 continue;
563 }
564
/*
 * The root changed: reset the drop state, re-key the root item at the
 * current generation and insert it as a new item in the tree root.
 */
565 memset(&root->root_item.drop_progress, 0,
566 sizeof(struct btrfs_disk_key));
567 root->root_item.drop_level = 0;
568 root->commit_root = NULL;
569 root->dirty_root = NULL;
570 root->root_key.offset = root->fs_info->generation;
571 btrfs_set_root_bytenr(&root->root_item,
572 root->node->start);
573 btrfs_set_root_level(&root->root_item,
574 btrfs_header_level(root->node));
575 btrfs_set_root_generation(&root->root_item,
576 root->root_key.offset);
577
578 err = btrfs_insert_root(trans, root->fs_info->tree_root,
579 &root->root_key,
580 &root->root_item);
/*
 * NOTE(review): this break only leaves the for loop; the enclosing
 * while (1) performs another lookup and keeps going.
 */
581 if (err)
582 break;
583
/* drop one reference from the old copy of the root and write it back */
584 refs = btrfs_root_refs(&dirty->root->root_item);
585 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
586 err = btrfs_update_root(trans, root->fs_info->tree_root,
587 &dirty->root->root_key,
588 &dirty->root->root_item);
589
590 BUG_ON(err);
/* that was the last reference: queue the old root on the caller's list */
591 if (refs == 1) {
592 list_add(&dirty->list, list);
593 } else {
594 WARN_ON(1);
595 free_extent_buffer(dirty->root->node);
596 kfree(dirty->root);
597 kfree(dirty);
598 }
599 }
600 }
601 return err;
602}
603
604/*
605 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
606 * otherwise every leaf in the btree is read and defragged.
607 */
608int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
609{
610 struct btrfs_fs_info *info = root->fs_info;
611 int ret;
612 struct btrfs_trans_handle *trans;
613 unsigned long nr;
614
615 smp_mb();
616 if (root->defrag_running)
617 return 0;
618 trans = btrfs_start_transaction(root, 1);
619 while (1) {
620 root->defrag_running = 1;
621 ret = btrfs_defrag_leaves(trans, root, cacheonly);
622 nr = trans->blocks_used;
623 btrfs_end_transaction(trans, root);
624 btrfs_btree_balance_dirty(info->tree_root, nr);
625 cond_resched();
626
627 trans = btrfs_start_transaction(root, 1);
628 if (root->fs_info->closing || ret != -EAGAIN)
629 break;
630 }
631 root->defrag_running = 0;
632 smp_mb();
633 btrfs_end_transaction(trans, root);
634 return 0;
635}
636
637/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them
640 */
641static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
642 struct list_head *list)
643{
644 struct btrfs_dirty_root *dirty;
645 struct btrfs_trans_handle *trans;
646 unsigned long nr;
647 u64 num_bytes;
648 u64 bytes_used;
649 u64 max_useless;
650 int ret = 0;
651 int err;
652
653 while (!list_empty(list)) {
654 struct btrfs_root *root;
655
/* entries are taken from the tail of the list */
656 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
657 list_del_init(&dirty->list);
658
/* remember the bytes used before the drop to compute what was freed */
659 num_bytes = btrfs_root_used(&dirty->root->root_item);
660 root = dirty->latest_root;
/* throttles stays raised while this root is being dropped */
661 atomic_inc(&root->fs_info->throttles);
662
/*
 * Drop the snapshot in steps.  On -EAGAIN the progress is saved in the
 * root item and the transaction ended before retrying.  On any other
 * result the loop exits with drop_mutex still held and the transaction
 * still open; both are released further down.
 */
663 while (1) {
664 trans = btrfs_start_transaction(tree_root, 1);
665 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN)
668 break;
669 mutex_unlock(&root->fs_info->drop_mutex);
670
671 err = btrfs_update_root(trans,
672 tree_root,
673 &dirty->root->root_key,
674 &dirty->root->root_item);
/*
 * NOTE(review): the value stored in ret here is overwritten by the
 * btrfs_end_transaction() assignment just below, so a btrfs_update_root()
 * failure is silently lost.
 */
675 if (err)
676 ret = err;
677 nr = trans->blocks_used;
678 ret = btrfs_end_transaction(trans, tree_root);
679 BUG_ON(ret);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
682 cond_resched();
683 }
684 BUG_ON(ret);
685 atomic_dec(&root->fs_info->throttles);
686 wake_up(&root->fs_info->transaction_throttle);
687
/* num_bytes becomes the amount freed by the drop */
688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) {
691 btrfs_record_root_in_trans(root);
692 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes);
694 }
695
696 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
697 if (ret) {
698 BUG();
699 break;
700 }
701 mutex_unlock(&root->fs_info->drop_mutex);
702
/*
 * max_useless is one less than the key offset of the oldest root still
 * on dead_list, or of the live root when the list is empty; it is the
 * bound passed to btrfs_remove_leaf_refs() below.
 */
703 spin_lock(&root->list_lock);
704 list_del_init(&dirty->root->dead_list);
705 if (!list_empty(&root->dead_list)) {
706 struct btrfs_root *oldest;
707 oldest = list_entry(root->dead_list.prev,
708 struct btrfs_root, dead_list);
709 max_useless = oldest->root_key.offset - 1;
710 } else {
711 max_useless = root->root_key.offset - 1;
712 }
713 spin_unlock(&root->list_lock);
714
715 nr = trans->blocks_used;
716 ret = btrfs_end_transaction(trans, tree_root);
717 BUG_ON(ret);
718
719 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
720 BUG_ON(ret);
721
/* the dropped root copy and its tracking struct are no longer needed */
722 free_extent_buffer(dirty->root->node);
723 kfree(dirty->root);
724 kfree(dirty);
725
726 btrfs_btree_balance_dirty(tree_root, nr);
727 cond_resched();
728 }
729 return ret;
730}
731
732/*
733 * new snapshots need to be created at a very specific time in the
734 * transaction commit. This does the actual creation
735 */
736static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info,
738 struct btrfs_pending_snapshot *pending)
739{
740 struct btrfs_key key;
741 struct btrfs_root_item *new_root_item;
742 struct btrfs_root *tree_root = fs_info->tree_root;
743 struct btrfs_root *root = pending->root;
744 struct extent_buffer *tmp;
745 struct extent_buffer *old;
746 int ret;
747 u64 objectid;
748
749 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
750 if (!new_root_item) {
751 ret = -ENOMEM;
752 goto fail;
753 }
754 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
755 if (ret)
756 goto fail;
757
758 btrfs_record_root_in_trans(root);
759 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
760 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
761
762 key.objectid = objectid;
763 key.offset = trans->transid;
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765
766 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
768
769 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old);
771 free_extent_buffer(old);
772
773 btrfs_set_root_bytenr(new_root_item, tmp->start);
774 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
775 btrfs_set_root_generation(new_root_item, trans->transid);
776 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
777 new_root_item);
778 btrfs_tree_unlock(tmp);
779 free_extent_buffer(tmp);
780 if (ret)
781 goto fail;
782
783 key.offset = (u64)-1;
784 memcpy(&pending->root_key, &key, sizeof(key));
785fail:
786 kfree(new_root_item);
787 return ret;
788}
789
790static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
791 struct btrfs_pending_snapshot *pending)
792{
793 int ret;
794 int namelen;
795 u64 index = 0;
796 struct btrfs_trans_handle *trans;
797 struct inode *parent_inode;
798 struct inode *inode;
799 struct btrfs_root *parent_root;
800
801 parent_inode = pending->dentry->d_parent->d_inode;
802 parent_root = BTRFS_I(parent_inode)->root;
803 trans = btrfs_join_transaction(parent_root, 1);
804
805 /*
806 * insert the directory item
807 */
808 namelen = strlen(pending->name);
809 ret = btrfs_set_inode_index(parent_inode, &index);
810 ret = btrfs_insert_dir_item(trans, parent_root,
811 pending->name, namelen,
812 parent_inode->i_ino,
813 &pending->root_key, BTRFS_FT_DIR, index);
814
815 if (ret)
816 goto fail;
817
818 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
819 ret = btrfs_update_inode(trans, parent_root, parent_inode);
820 BUG_ON(ret);
821
822 /* add the backref first */
823 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
824 pending->root_key.objectid,
825 BTRFS_ROOT_BACKREF_KEY,
826 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name,
828 namelen);
829
830 BUG_ON(ret);
831
832 /* now add the forward ref */
833 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
834 parent_root->root_key.objectid,
835 BTRFS_ROOT_REF_KEY,
836 pending->root_key.objectid,
837 parent_inode->i_ino, index, pending->name,
838 namelen);
839
840 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
841 d_instantiate(pending->dentry, inode);
842fail:
843 btrfs_end_transaction(trans, fs_info->fs_root);
844 return ret;
845}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
976 * in the tree of tree roots has to point to the most up to date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
998 /* add_dirty_roots gets rid of all the tree log roots, it is now
999 * safe to free the root of tree log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots we have scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
/*
 * State of one transaction, shared by all handles that joined it.
 */
struct btrfs_transaction {
	/* id of this transaction; stored as the super block generation and
	 * as last_trans_committed by btrfs_commit_transaction()
	 */
	u64 transid;
	/* the committer waits until this is no longer greater than 1 */
	unsigned long num_writers;
	/* the committer repeats its wait loop whenever this changed while
	 * it slept
	 */
	unsigned long num_joined;
	/* set to 1 by the task that starts committing; later callers of
	 * btrfs_commit_transaction() wait for that commit instead
	 */
	int in_commit;
	/* reference count: raised before waiting on the transaction and
	 * dropped through put_transaction()
	 */
	int use_count;
	/* set to 1 at the end of btrfs_commit_transaction() */
	int commit_done;
	/* set to 1 when the commit starts and cleared again before the
	 * transaction is written out
	 */
	int blocked;
	/* links the transaction into fs_info->trans_list */
	struct list_head list;
	/* NOTE(review): presumably the extents dirtied by this transaction;
	 * not used in the code visible here - confirm
	 */
	struct extent_io_tree dirty_pages;
	/* NOTE(review): not used in the code visible here; units unknown */
	unsigned long start_time;
	/* the committer sleeps here while waiting for other writers */
	wait_queue_head_t writer_wait;
	/* woken once commit_done has been set */
	wait_queue_head_t commit_wait;
	/* list of struct btrfs_pending_snapshot to create during commit */
	struct list_head pending_snapshots;
};
38
/*
 * Per-caller handle on a running transaction, as returned by
 * btrfs_start_transaction() / btrfs_join_transaction().
 */
struct btrfs_trans_handle {
	/* transaction id; used for example as the generation of blocks and
	 * root items created through this handle
	 */
	u64 transid;
	/* NOTE(review): not used in the code visible here */
	unsigned long blocks_reserved;
	/* read by callers before ending the handle and passed on to
	 * btrfs_btree_balance_dirty()
	 */
	unsigned long blocks_used;
	/* the transaction this handle belongs to */
	struct btrfs_transaction *transaction;
	/* copied from / to an inode by btrfs_set_trans_block_group() and
	 * btrfs_update_inode_block_group()
	 */
	u64 block_group;
	/* NOTE(review): not used in the code visible here */
	u64 alloc_exclude_start;
	u64 alloc_exclude_nr;
};
48
/*
 * A snapshot that has been requested and is created during the next
 * transaction commit.
 */
struct btrfs_pending_snapshot {
	/* dentry to instantiate; its d_parent gives the parent directory */
	struct dentry *dentry;
	/* the root being snapshotted */
	struct btrfs_root *root;
	/* name of the directory entry; kfree()d once the snapshot is done */
	char *name;
	/* key of the new root, filled in by create_pending_snapshot() */
	struct btrfs_key root_key;
	/* links the request into btrfs_transaction::pending_snapshots */
	struct list_head list;
};
56
/*
 * One entry on a list of roots that are waiting to be dropped by
 * drop_dirty_roots().
 */
struct btrfs_dirty_root {
	/* links the entry into the list being processed */
	struct list_head list;
	/* the root to drop; freed together with this entry */
	struct btrfs_root *root;
	/* the root whose used-bytes accounting and dead list are updated
	 * once 'root' has been dropped
	 */
	struct btrfs_root *latest_root;
};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
/* starting, joining and ending transaction handles */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
						   int num_blocks);
/* steps used while committing a transaction */
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root);
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root);

/* dead root / snapshot cleanup, defrag and the commit entry point */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * stages for the tree walking:
 *
 * LOG_WALK_PIN_ONLY (0) only pins down the blocks we find.
 * LOG_WALK_REPLAY_INODES (1) makes sure that all the inodes we find in
 * the log are created in the subvolume.
 * LOG_WALK_REPLAY_ALL (2), the last stage, deals with directories and
 * links and extents and all the other fun semantics.
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2
49
/* forward declarations of static helpers of this file */
static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct inode *inode,
			     int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish
182 */
183static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root)
185{
186 int ret;
187 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190 BUG_ON(ret);
191 }
192 if (!root->log_root) {
193 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret);
195 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex);
199 return 0;
200}
201
202/*
203 * returns 0 if there was a log transaction running and we were able
204 * to join, or returns -ENOENT if there were not transactions
205 * in progress
206 */
207static int join_running_log_trans(struct btrfs_root *root)
208{
209 int ret = -ENOENT;
210
211 smp_mb();
212 if (!root->log_root)
213 return -ENOENT;
214
215 mutex_lock(&root->fs_info->tree_log_mutex);
216 if (root->log_root) {
217 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers);
219 root->fs_info->tree_log_batch++;
220 }
221 mutex_unlock(&root->fs_info->tree_log_mutex);
222 return ret;
223}
224
225/*
226 * indicate we're done making changes to the log tree
227 * and wake up anyone waiting to do a sync
228 */
229static int end_log_trans(struct btrfs_root *root)
230{
231 atomic_dec(&root->fs_info->tree_log_writers);
232 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait))
234 wake_up(&root->fs_info->tree_log_wait);
235 return 0;
236}
237
238
/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in
	 * (one of the LOG_WALK_* values)
	 */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it.  gen is what process_one_buffer() hands to
	 * btrfs_buffer_uptodate() for that check.
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};
283
284/*
285 * process_func used to pin down extents, write them or wait on them
286 */
287static int process_one_buffer(struct btrfs_root *log,
288 struct extent_buffer *eb,
289 struct walk_control *wc, u64 gen)
290{
291 if (wc->pin) {
292 mutex_lock(&log->fs_info->pinned_mutex);
293 btrfs_update_pinned_extents(log->fs_info->extent_root,
294 eb->start, eb->len, 1);
295 mutex_unlock(&log->fs_info->pinned_mutex);
296 }
297
298 if (btrfs_buffer_uptodate(eb, gen)) {
299 if (wc->write)
300 btrfs_write_tree_block(eb);
301 if (wc->wait)
302 btrfs_wait_tree_block_writeback(eb);
303 }
304 return 0;
305}
306
307/*
308 * Item overwrite used by replay and tree logging. eb, slot and key all refer
309 * to the src data we are copying out.
310 *
311 * root is the tree we are copying into, and path is a scratch
312 * path for use in this function (it should be released on entry and
313 * will be released on exit).
314 *
315 * If the key is already in the destination tree the existing item is
316 * overwritten. If the existing item isn't big enough, it is extended.
317 * If it is too large, it is truncated.
318 *
319 * If the key isn't in the destination yet, a new item is inserted.
320 */
321static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322 struct btrfs_root *root,
323 struct btrfs_path *path,
324 struct extent_buffer *eb, int slot,
325 struct btrfs_key *key)
326{
327 int ret;
328 u32 item_size;
329 u64 saved_i_size = 0;
330 int save_old_i_size = 0;
331 unsigned long src_ptr;
332 unsigned long dst_ptr;
333 int overwrite_root = 0;
334
335 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336 overwrite_root = 1;
337
338 item_size = btrfs_item_size_nr(eb, slot);
339 src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341 /* look for the key in the destination tree */
342 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343 if (ret == 0) {
344 char *src_copy;
345 char *dst_copy;
346 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347 path->slots[0]);
348 if (dst_size != item_size)
349 goto insert;
350
351 if (item_size == 0) {
352 btrfs_release_path(root, path);
353 return 0;
354 }
355 dst_copy = kmalloc(item_size, GFP_NOFS);
356 src_copy = kmalloc(item_size, GFP_NOFS);
357
358 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362 item_size);
363 ret = memcmp(dst_copy, src_copy, item_size);
364
365 kfree(dst_copy);
366 kfree(src_copy);
367 /*
368 * they have the same contents, just return, this saves
369 * us from cowing blocks in the destination tree and doing
370 * extra writes that may not have been done by a previous
371 * sync
372 */
373 if (ret == 0) {
374 btrfs_release_path(root, path);
375 return 0;
376 }
377
378 }
379insert:
380 btrfs_release_path(root, path);
381 /* try to insert the key into the destination tree */
382 ret = btrfs_insert_empty_item(trans, root, path,
383 key, item_size);
384
385 /* make sure any existing item is the correct size */
386 if (ret == -EEXIST) {
387 u32 found_size;
388 found_size = btrfs_item_size_nr(path->nodes[0],
389 path->slots[0]);
390 if (found_size > item_size) {
391 btrfs_truncate_item(trans, root, path, item_size, 1);
392 } else if (found_size < item_size) {
393 ret = btrfs_extend_item(trans, root, path,
394 item_size - found_size);
395 BUG_ON(ret);
396 }
397 } else if (ret) {
398 BUG();
399 }
400 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401 path->slots[0]);
402
403 /* don't overwrite an existing inode if the generation number
404 * was logged as zero. This is done when the tree logging code
405 * is just logging an inode to make sure it exists after recovery.
406 *
407 * Also, don't overwrite i_size on directories during replay.
408 * log replay inserts and removes directory items based on the
409 * state of the tree found in the subvolume, and i_size is modified
410 * as it goes
411 */
412 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413 struct btrfs_inode_item *src_item;
414 struct btrfs_inode_item *dst_item;
415
416 src_item = (struct btrfs_inode_item *)src_ptr;
417 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419 if (btrfs_inode_generation(eb, src_item) == 0)
420 goto no_copy;
421
422 if (overwrite_root &&
423 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425 save_old_i_size = 1;
426 saved_i_size = btrfs_inode_size(path->nodes[0],
427 dst_item);
428 }
429 }
430
431 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432 src_ptr, item_size);
433
434 if (save_old_i_size) {
435 struct btrfs_inode_item *dst_item;
436 dst_item = (struct btrfs_inode_item *)dst_ptr;
437 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438 }
439
440 /* make sure the generation is filled in */
441 if (key->type == BTRFS_INODE_ITEM_KEY) {
442 struct btrfs_inode_item *dst_item;
443 dst_item = (struct btrfs_inode_item *)dst_ptr;
444 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445 btrfs_set_inode_generation(path->nodes[0], dst_item,
446 trans->transid);
447 }
448 }
449no_copy:
450 btrfs_mark_buffer_dirty(path->nodes[0]);
451 btrfs_release_path(root, path);
452 return 0;
453}
454
455/*
456 * simple helper to read an inode off the disk from a given root
457 * This can only be called for subvolume roots and not for the log
458 */
459static noinline struct inode *read_one_inode(struct btrfs_root *root,
460 u64 objectid)
461{
462 struct inode *inode;
463 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464 if (inode->i_state & I_NEW) {
465 BTRFS_I(inode)->root = root;
466 BTRFS_I(inode)->location.objectid = objectid;
467 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468 BTRFS_I(inode)->location.offset = 0;
469 btrfs_read_locked_inode(inode);
470 unlock_new_inode(inode);
471
472 }
473 if (is_bad_inode(inode)) {
474 iput(inode);
475 inode = NULL;
476 }
477 return inode;
478}
479
480/* replays a single extent in 'eb' at 'slot' with 'key' into the
481 * subvolume 'root'. path is released on entry and should be released
482 * on exit.
483 *
484 * extents in the log tree have not been allocated out of the extent
485 * tree yet. So, this completes the allocation, taking a reference
486 * as required if the extent already exists or creating a new extent
487 * if it isn't in the extent allocation tree yet.
488 *
489 * The extent is inserted into the file, dropping any existing extents
490 * from the file that overlap the new one.
491 */
492static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493 struct btrfs_root *root,
494 struct btrfs_path *path,
495 struct extent_buffer *eb, int slot,
496 struct btrfs_key *key)
497{
498 int found_type;
499 u64 mask = root->sectorsize - 1;
500 u64 extent_end;
501 u64 alloc_hint;
502 u64 start = key->offset;
503 u64 saved_nbytes;
504 struct btrfs_file_extent_item *item;
505 struct inode *inode = NULL;
506 unsigned long size;
507 int ret = 0;
508
509 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510 found_type = btrfs_file_extent_type(eb, item);
511
512 if (found_type == BTRFS_FILE_EXTENT_REG ||
513 found_type == BTRFS_FILE_EXTENT_PREALLOC)
514 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516 size = btrfs_file_extent_inline_len(eb, item);
517 extent_end = (start + size + mask) & ~mask;
518 } else {
519 ret = 0;
520 goto out;
521 }
522
523 inode = read_one_inode(root, key->objectid);
524 if (!inode) {
525 ret = -EIO;
526 goto out;
527 }
528
529 /*
530 * first check to see if we already have this extent in the
531 * file. This must be done before the btrfs_drop_extents run
532 * so we don't try to drop this extent.
533 */
534 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535 start, 0);
536
537 if (ret == 0 &&
538 (found_type == BTRFS_FILE_EXTENT_REG ||
539 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540 struct btrfs_file_extent_item cmp1;
541 struct btrfs_file_extent_item cmp2;
542 struct btrfs_file_extent_item *existing;
543 struct extent_buffer *leaf;
544
545 leaf = path->nodes[0];
546 existing = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_file_extent_item);
548
549 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550 sizeof(cmp1));
551 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552 sizeof(cmp2));
553
554 /*
555 * we already have a pointer to this exact extent,
556 * we don't have to do anything
557 */
558 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559 btrfs_release_path(root, path);
560 goto out;
561 }
562 }
563 btrfs_release_path(root, path);
564
565 saved_nbytes = inode_get_bytes(inode);
566 /* drop any overlapping extents */
567 ret = btrfs_drop_extents(trans, root, inode,
568 start, extent_end, start, &alloc_hint);
569 BUG_ON(ret);
570
571 if (found_type == BTRFS_FILE_EXTENT_REG ||
572 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573 unsigned long dest_offset;
574 struct btrfs_key ins;
575
576 ret = btrfs_insert_empty_item(trans, root, path, key,
577 sizeof(*item));
578 BUG_ON(ret);
579 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580 path->slots[0]);
581 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582 (unsigned long)item, sizeof(*item));
583
584 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588 if (ins.objectid > 0) {
589 u64 csum_start;
590 u64 csum_end;
591 LIST_HEAD(ordered_sums);
592 /*
593 * is this extent already allocated in the extent
594 * allocation tree? If so, just add a reference
595 */
596 ret = btrfs_lookup_extent(root, ins.objectid,
597 ins.offset);
598 if (ret == 0) {
599 ret = btrfs_inc_extent_ref(trans, root,
600 ins.objectid, ins.offset,
601 path->nodes[0]->start,
602 root->root_key.objectid,
603 trans->transid, key->objectid);
604 } else {
605 /*
606 * insert the extent pointer in the extent
607 * allocation tree
608 */
609 ret = btrfs_alloc_logged_extent(trans, root,
610 path->nodes[0]->start,
611 root->root_key.objectid,
612 trans->transid, key->objectid,
613 &ins);
614 BUG_ON(ret);
615 }
616 btrfs_release_path(root, path);
617
618 if (btrfs_file_extent_compression(eb, item)) {
619 csum_start = ins.objectid;
620 csum_end = csum_start + ins.offset;
621 } else {
622 csum_start = ins.objectid +
623 btrfs_file_extent_offset(eb, item);
624 csum_end = csum_start +
625 btrfs_file_extent_num_bytes(eb, item);
626 }
627
628 ret = btrfs_lookup_csums_range(root->log_root,
629 csum_start, csum_end - 1,
630 &ordered_sums);
631 BUG_ON(ret);
632 while (!list_empty(&ordered_sums)) {
633 struct btrfs_ordered_sum *sums;
634 sums = list_entry(ordered_sums.next,
635 struct btrfs_ordered_sum,
636 list);
637 ret = btrfs_csum_file_blocks(trans,
638 root->fs_info->csum_root,
639 sums);
640 BUG_ON(ret);
641 list_del(&sums->list);
642 kfree(sums);
643 }
644 } else {
645 btrfs_release_path(root, path);
646 }
647 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648 /* inline extents are easy, we just overwrite them */
649 ret = overwrite_item(trans, root, path, eb, slot, key);
650 BUG_ON(ret);
651 }
652
653 inode_set_bytes(inode, saved_nbytes);
654 btrfs_update_inode(trans, root, inode);
655out:
656 if (inode)
657 iput(inode);
658 return ret;
659}
660
661/*
662 * when cleaning up conflicts between the directory names in the
663 * subvolume, directory names in the log and directory names in the
664 * inode back references, we may have to unlink inodes from directories.
665 *
666 * This is a helper function to do the unlink of a specific directory
667 * item
668 */
669static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670 struct btrfs_root *root,
671 struct btrfs_path *path,
672 struct inode *dir,
673 struct btrfs_dir_item *di)
674{
675 struct inode *inode;
676 char *name;
677 int name_len;
678 struct extent_buffer *leaf;
679 struct btrfs_key location;
680 int ret;
681
682 leaf = path->nodes[0];
683
684 btrfs_dir_item_key_to_cpu(leaf, di, &location);
685 name_len = btrfs_dir_name_len(leaf, di);
686 name = kmalloc(name_len, GFP_NOFS);
687 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688 btrfs_release_path(root, path);
689
690 inode = read_one_inode(root, location.objectid);
691 BUG_ON(!inode);
692
693 ret = link_to_fixup_dir(trans, root, path, location.objectid);
694 BUG_ON(ret);
695 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696 BUG_ON(ret);
697 kfree(name);
698
699 iput(inode);
700 return ret;
701}
702
703/*
704 * helper function to see if a given name and sequence number found
705 * in an inode back reference are already in a directory and correctly
706 * point to this inode
707 */
708static noinline int inode_in_dir(struct btrfs_root *root,
709 struct btrfs_path *path,
710 u64 dirid, u64 objectid, u64 index,
711 const char *name, int name_len)
712{
713 struct btrfs_dir_item *di;
714 struct btrfs_key location;
715 int match = 0;
716
717 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718 index, name, name_len, 0);
719 if (di && !IS_ERR(di)) {
720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721 if (location.objectid != objectid)
722 goto out;
723 } else
724 goto out;
725 btrfs_release_path(root, path);
726
727 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728 if (di && !IS_ERR(di)) {
729 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730 if (location.objectid != objectid)
731 goto out;
732 } else
733 goto out;
734 match = 1;
735out:
736 btrfs_release_path(root, path);
737 return match;
738}
739
740/*
741 * helper function to check a log tree for a named back reference in
742 * an inode. This is used to decide if a back reference that is
743 * found in the subvolume conflicts with what we find in the log.
744 *
745 * inode backreferences may have multiple refs in a single item,
746 * during replay we process one reference at a time, and we don't
747 * want to delete valid links to a file from the subvolume if that
748 * link is also in the log.
749 */
750static noinline int backref_in_log(struct btrfs_root *log,
751 struct btrfs_key *key,
752 char *name, int namelen)
753{
754 struct btrfs_path *path;
755 struct btrfs_inode_ref *ref;
756 unsigned long ptr;
757 unsigned long ptr_end;
758 unsigned long name_ptr;
759 int found_name_len;
760 int item_size;
761 int ret;
762 int match = 0;
763
764 path = btrfs_alloc_path();
765 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766 if (ret != 0)
767 goto out;
768
769 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771 ptr_end = ptr + item_size;
772 while (ptr < ptr_end) {
773 ref = (struct btrfs_inode_ref *)ptr;
774 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775 if (found_name_len == namelen) {
776 name_ptr = (unsigned long)(ref + 1);
777 ret = memcmp_extent_buffer(path->nodes[0], name,
778 name_ptr, namelen);
779 if (ret == 0) {
780 match = 1;
781 goto out;
782 }
783 }
784 ptr = (unsigned long)(ref + 1) + found_name_len;
785 }
786out:
787 btrfs_free_path(path);
788 return match;
789}
790
791
792/*
793 * replay one inode back reference item found in the log tree.
794 * eb, slot and key refer to the buffer and key found in the log tree.
795 * root is the destination we are replaying into, and path is for temp
796 * use by this function. (it should be released on return).
797 */
798static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root,
800 struct btrfs_root *log,
801 struct btrfs_path *path,
802 struct extent_buffer *eb, int slot,
803 struct btrfs_key *key)
804{
805 struct inode *dir;
806 int ret;
807 struct btrfs_key location;
808 struct btrfs_inode_ref *ref;
809 struct btrfs_dir_item *di;
810 struct inode *inode;
811 char *name;
812 int namelen;
813 unsigned long ref_ptr;
814 unsigned long ref_end;
815
816 location.objectid = key->objectid;
817 location.type = BTRFS_INODE_ITEM_KEY;
818 location.offset = 0;
819
820 /*
821 * it is possible that we didn't log all the parent directories
822 * for a given inode. If we don't find the dir, just don't
823 * copy the back ref in. The link count fixup code will take
824 * care of the rest
825 */
826 dir = read_one_inode(root, key->offset);
827 if (!dir)
828 return -ENOENT;
829
830 inode = read_one_inode(root, key->objectid);
831 BUG_ON(!dir);
832
833 ref_ptr = btrfs_item_ptr_offset(eb, slot);
834 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836again:
837 ref = (struct btrfs_inode_ref *)ref_ptr;
838
839 namelen = btrfs_inode_ref_name_len(eb, ref);
840 name = kmalloc(namelen, GFP_NOFS);
841 BUG_ON(!name);
842
843 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845 /* if we already have a perfect match, we're done */
846 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847 btrfs_inode_ref_index(eb, ref),
848 name, namelen)) {
849 goto out;
850 }
851
852 /*
853 * look for a conflicting back reference in the metadata.
854 * if we find one we have to unlink that name of the file
855 * before we add our new link. Later on, we overwrite any
856 * existing back reference, and we don't want to create
857 * dangling pointers in the directory.
858 */
859conflict_again:
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr;
866 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0];
868
869 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done
871 */
872 if (key->objectid == key->offset)
873 goto out_nowrite;
874
875 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay
877 * otherwise they must be unlinked as a conflict
878 */
879 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881 while (ptr < ptr_end) {
882 victim_ref = (struct btrfs_inode_ref *)ptr;
883 victim_name_len = btrfs_inode_ref_name_len(leaf,
884 victim_ref);
885 victim_name = kmalloc(victim_name_len, GFP_NOFS);
886 BUG_ON(!victim_name);
887
888 read_extent_buffer(leaf, victim_name,
889 (unsigned long)(victim_ref + 1),
890 victim_name_len);
891
892 if (!backref_in_log(log, key, victim_name,
893 victim_name_len)) {
894 btrfs_inc_nlink(inode);
895 btrfs_release_path(root, path);
896 ret = btrfs_unlink_inode(trans, root, dir,
897 inode, victim_name,
898 victim_name_len);
899 kfree(victim_name);
900 btrfs_release_path(root, path);
901 goto conflict_again;
902 }
903 kfree(victim_name);
904 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905 }
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909
910 /* look for a conflicting sequence number */
911 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912 btrfs_inode_ref_index(eb, ref),
913 name, namelen, 0);
914 if (di && !IS_ERR(di)) {
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 }
918 btrfs_release_path(root, path);
919
920
921 /* look for a conflicting name */
922 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923 name, namelen, 0);
924 if (di && !IS_ERR(di)) {
925 ret = drop_one_dir_item(trans, root, path, dir, di);
926 BUG_ON(ret);
927 }
928 btrfs_release_path(root, path);
929
930 /* insert our name */
931 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932 btrfs_inode_ref_index(eb, ref));
933 BUG_ON(ret);
934
935 btrfs_update_inode(trans, root, inode);
936
937out:
938 ref_ptr = (unsigned long)(ref + 1) + namelen;
939 kfree(name);
940 if (ref_ptr < ref_end)
941 goto again;
942
943 /* finally write the back reference in the inode */
944 ret = overwrite_item(trans, root, path, eb, slot, key);
945 BUG_ON(ret);
946
947out_nowrite:
948 btrfs_release_path(root, path);
949 iput(dir);
950 iput(inode);
951 return 0;
952}
953
954/*
955 * There are a few corners where the link count of the file can't
956 * be properly maintained during replay. So, instead of adding
957 * lots of complexity to the log code, we just scan the backrefs
958 * for any file that has been through replay.
959 *
960 * The scan will update the link count on the inode to reflect the
961 * number of back refs found. If it goes down to zero, the iput
962 * will free the inode.
963 */
964static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965 struct btrfs_root *root,
966 struct inode *inode)
967{
968 struct btrfs_path *path;
969 int ret;
970 struct btrfs_key key;
971 u64 nlink = 0;
972 unsigned long ptr;
973 unsigned long ptr_end;
974 int name_len;
975
976 key.objectid = inode->i_ino;
977 key.type = BTRFS_INODE_REF_KEY;
978 key.offset = (u64)-1;
979
980 path = btrfs_alloc_path();
981
982 while (1) {
983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984 if (ret < 0)
985 break;
986 if (ret > 0) {
987 if (path->slots[0] == 0)
988 break;
989 path->slots[0]--;
990 }
991 btrfs_item_key_to_cpu(path->nodes[0], &key,
992 path->slots[0]);
993 if (key.objectid != inode->i_ino ||
994 key.type != BTRFS_INODE_REF_KEY)
995 break;
996 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998 path->slots[0]);
999 while (ptr < ptr_end) {
1000 struct btrfs_inode_ref *ref;
1001
1002 ref = (struct btrfs_inode_ref *)ptr;
1003 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004 ref);
1005 ptr = (unsigned long)(ref + 1) + name_len;
1006 nlink++;
1007 }
1008
1009 if (key.offset == 0)
1010 break;
1011 key.offset--;
1012 btrfs_release_path(root, path);
1013 }
1014 btrfs_free_path(path);
1015 if (nlink != inode->i_nlink) {
1016 inode->i_nlink = nlink;
1017 btrfs_update_inode(trans, root, inode);
1018 }
1019 BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021 return 0;
1022}
1023
1024static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025 struct btrfs_root *root,
1026 struct btrfs_path *path)
1027{
1028 int ret;
1029 struct btrfs_key key;
1030 struct inode *inode;
1031
1032 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033 key.type = BTRFS_ORPHAN_ITEM_KEY;
1034 key.offset = (u64)-1;
1035 while (1) {
1036 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037 if (ret < 0)
1038 break;
1039
1040 if (ret == 1) {
1041 if (path->slots[0] == 0)
1042 break;
1043 path->slots[0]--;
1044 }
1045
1046 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048 key.type != BTRFS_ORPHAN_ITEM_KEY)
1049 break;
1050
1051 ret = btrfs_del_item(trans, root, path);
1052 BUG_ON(ret);
1053
1054 btrfs_release_path(root, path);
1055 inode = read_one_inode(root, key.offset);
1056 BUG_ON(!inode);
1057
1058 ret = fixup_inode_link_count(trans, root, inode);
1059 BUG_ON(ret);
1060
1061 iput(inode);
1062
1063 if (key.offset == 0)
1064 break;
1065 key.offset--;
1066 }
1067 btrfs_release_path(root, path);
1068 return 0;
1069}
1070
1071
1072/*
1073 * record a given inode in the fixup dir so we can check its link
1074 * count when replay is done. The link count is incremented here
1075 * so the inode won't go away until we check it
1076 */
1077static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path,
1080 u64 objectid)
1081{
1082 struct btrfs_key key;
1083 int ret = 0;
1084 struct inode *inode;
1085
1086 inode = read_one_inode(root, objectid);
1087 BUG_ON(!inode);
1088
1089 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091 key.offset = objectid;
1092
1093 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095 btrfs_release_path(root, path);
1096 if (ret == 0) {
1097 btrfs_inc_nlink(inode);
1098 btrfs_update_inode(trans, root, inode);
1099 } else if (ret == -EEXIST) {
1100 ret = 0;
1101 } else {
1102 BUG();
1103 }
1104 iput(inode);
1105
1106 return ret;
1107}
1108
1109/*
1110 * when replaying the log for a directory, we only insert names
1111 * for inodes that actually exist. This means an fsync on a directory
1112 * does not implicitly fsync all the new files in it
1113 */
1114static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115 struct btrfs_root *root,
1116 struct btrfs_path *path,
1117 u64 dirid, u64 index,
1118 char *name, int name_len, u8 type,
1119 struct btrfs_key *location)
1120{
1121 struct inode *inode;
1122 struct inode *dir;
1123 int ret;
1124
1125 inode = read_one_inode(root, location->objectid);
1126 if (!inode)
1127 return -ENOENT;
1128
1129 dir = read_one_inode(root, dirid);
1130 if (!dir) {
1131 iput(inode);
1132 return -EIO;
1133 }
1134 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136 /* FIXME, put inode into FIXUP list */
1137
1138 iput(inode);
1139 iput(dir);
1140 return ret;
1141}
1142
1143/*
1144 * take a single entry in a log directory item and replay it into
1145 * the subvolume.
1146 *
1147 * if a conflicting item exists in the subdirectory already,
1148 * the inode it points to is unlinked and put into the link count
1149 * fix up tree.
1150 *
1151 * If a name from the log points to a file or directory that does
1152 * not exist in the FS, it is skipped. fsyncs on directories
1153 * do not force down inodes inside that directory, just changes to the
1154 * names or unlinks in a directory.
1155 */
1156static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157 struct btrfs_root *root,
1158 struct btrfs_path *path,
1159 struct extent_buffer *eb,
1160 struct btrfs_dir_item *di,
1161 struct btrfs_key *key)
1162{
1163 char *name;
1164 int name_len;
1165 struct btrfs_dir_item *dst_di;
1166 struct btrfs_key found_key;
1167 struct btrfs_key log_key;
1168 struct inode *dir;
1169 u8 log_type;
1170 int exists;
1171 int ret;
1172
1173 dir = read_one_inode(root, key->objectid);
1174 BUG_ON(!dir);
1175
1176 name_len = btrfs_dir_name_len(eb, di);
1177 name = kmalloc(name_len, GFP_NOFS);
1178 log_type = btrfs_dir_type(eb, di);
1179 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180 name_len);
1181
1182 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184 if (exists == 0)
1185 exists = 1;
1186 else
1187 exists = 0;
1188 btrfs_release_path(root, path);
1189
1190 if (key->type == BTRFS_DIR_ITEM_KEY) {
1191 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192 name, name_len, 1);
1193 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195 key->objectid,
1196 key->offset, name,
1197 name_len, 1);
1198 } else {
1199 BUG();
1200 }
1201 if (!dst_di || IS_ERR(dst_di)) {
1202 /* we need a sequence number to insert, so we only
1203 * do inserts for the BTRFS_DIR_INDEX_KEY types
1204 */
1205 if (key->type != BTRFS_DIR_INDEX_KEY)
1206 goto out;
1207 goto insert;
1208 }
1209
1210 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211 /* the existing item matches the logged item */
1212 if (found_key.objectid == log_key.objectid &&
1213 found_key.type == log_key.type &&
1214 found_key.offset == log_key.offset &&
1215 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216 goto out;
1217 }
1218
1219 /*
1220 * don't drop the conflicting directory entry if the inode
1221 * for the new entry doesn't exist
1222 */
1223 if (!exists)
1224 goto out;
1225
1226 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227 BUG_ON(ret);
1228
1229 if (key->type == BTRFS_DIR_INDEX_KEY)
1230 goto insert;
1231out:
1232 btrfs_release_path(root, path);
1233 kfree(name);
1234 iput(dir);
1235 return 0;
1236
1237insert:
1238 btrfs_release_path(root, path);
1239 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240 name, name_len, log_type, &log_key);
1241
1242 if (ret && ret != -ENOENT)
1243 BUG();
1244 goto out;
1245}
1246
1247/*
1248 * find all the names in a directory item and reconcile them into
1249 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1250 * one name in a directory item, but the same code gets used for
1251 * both directory index types
1252 */
1253static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root,
1255 struct btrfs_path *path,
1256 struct extent_buffer *eb, int slot,
1257 struct btrfs_key *key)
1258{
1259 int ret;
1260 u32 item_size = btrfs_item_size_nr(eb, slot);
1261 struct btrfs_dir_item *di;
1262 int name_len;
1263 unsigned long ptr;
1264 unsigned long ptr_end;
1265
1266 ptr = btrfs_item_ptr_offset(eb, slot);
1267 ptr_end = ptr + item_size;
1268 while (ptr < ptr_end) {
1269 di = (struct btrfs_dir_item *)ptr;
1270 name_len = btrfs_dir_name_len(eb, di);
1271 ret = replay_one_name(trans, root, path, eb, di, key);
1272 BUG_ON(ret);
1273 ptr = (unsigned long)(di + 1);
1274 ptr += name_len;
1275 }
1276 return 0;
1277}
1278
1279/*
1280 * directory replay has two parts. There are the standard directory
1281 * items in the log copied from the subvolume, and range items
1282 * created in the log while the subvolume was logged.
1283 *
1284 * The range items tell us which parts of the key space the log
1285 * is authoritative for. During replay, if a key in the subvolume
1286 * directory is in a logged range item, but not actually in the log
1287 * that means it was deleted from the directory before the fsync
1288 * and should be removed.
1289 */
1290static noinline int find_dir_range(struct btrfs_root *root,
1291 struct btrfs_path *path,
1292 u64 dirid, int key_type,
1293 u64 *start_ret, u64 *end_ret)
1294{
1295 struct btrfs_key key;
1296 u64 found_end;
1297 struct btrfs_dir_log_item *item;
1298 int ret;
1299 int nritems;
1300
1301 if (*start_ret == (u64)-1)
1302 return 1;
1303
1304 key.objectid = dirid;
1305 key.type = key_type;
1306 key.offset = *start_ret;
1307
1308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309 if (ret < 0)
1310 goto out;
1311 if (ret > 0) {
1312 if (path->slots[0] == 0)
1313 goto out;
1314 path->slots[0]--;
1315 }
1316 if (ret != 0)
1317 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319 if (key.type != key_type || key.objectid != dirid) {
1320 ret = 1;
1321 goto next;
1322 }
1323 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324 struct btrfs_dir_log_item);
1325 found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327 if (*start_ret >= key.offset && *start_ret <= found_end) {
1328 ret = 0;
1329 *start_ret = key.offset;
1330 *end_ret = found_end;
1331 goto out;
1332 }
1333 ret = 1;
1334next:
1335 /* check the next slot in the tree to see if it is a valid item */
1336 nritems = btrfs_header_nritems(path->nodes[0]);
1337 if (path->slots[0] >= nritems) {
1338 ret = btrfs_next_leaf(root, path);
1339 if (ret)
1340 goto out;
1341 } else {
1342 path->slots[0]++;
1343 }
1344
1345 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347 if (key.type != key_type || key.objectid != dirid) {
1348 ret = 1;
1349 goto out;
1350 }
1351 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352 struct btrfs_dir_log_item);
1353 found_end = btrfs_dir_log_end(path->nodes[0], item);
1354 *start_ret = key.offset;
1355 *end_ret = found_end;
1356 ret = 0;
1357out:
1358 btrfs_release_path(root, path);
1359 return ret;
1360}
1361
1362/*
1363 * this looks for a given directory item in the log. If the directory
1364 * item is not in the log, the item is removed and the inode it points
1365 * to is unlinked
1366 */
1367static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368 struct btrfs_root *root,
1369 struct btrfs_root *log,
1370 struct btrfs_path *path,
1371 struct btrfs_path *log_path,
1372 struct inode *dir,
1373 struct btrfs_key *dir_key)
1374{
1375 int ret;
1376 struct extent_buffer *eb;
1377 int slot;
1378 u32 item_size;
1379 struct btrfs_dir_item *di;
1380 struct btrfs_dir_item *log_di;
1381 int name_len;
1382 unsigned long ptr;
1383 unsigned long ptr_end;
1384 char *name;
1385 struct inode *inode;
1386 struct btrfs_key location;
1387
1388again:
1389 eb = path->nodes[0];
1390 slot = path->slots[0];
1391 item_size = btrfs_item_size_nr(eb, slot);
1392 ptr = btrfs_item_ptr_offset(eb, slot);
1393 ptr_end = ptr + item_size;
1394 while (ptr < ptr_end) {
1395 di = (struct btrfs_dir_item *)ptr;
1396 name_len = btrfs_dir_name_len(eb, di);
1397 name = kmalloc(name_len, GFP_NOFS);
1398 if (!name) {
1399 ret = -ENOMEM;
1400 goto out;
1401 }
1402 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403 name_len);
1404 log_di = NULL;
1405 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407 dir_key->objectid,
1408 name, name_len, 0);
1409 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410 log_di = btrfs_lookup_dir_index_item(trans, log,
1411 log_path,
1412 dir_key->objectid,
1413 dir_key->offset,
1414 name, name_len, 0);
1415 }
1416 if (!log_di || IS_ERR(log_di)) {
1417 btrfs_dir_item_key_to_cpu(eb, di, &location);
1418 btrfs_release_path(root, path);
1419 btrfs_release_path(log, log_path);
1420 inode = read_one_inode(root, location.objectid);
1421 BUG_ON(!inode);
1422
1423 ret = link_to_fixup_dir(trans, root,
1424 path, location.objectid);
1425 BUG_ON(ret);
1426 btrfs_inc_nlink(inode);
1427 ret = btrfs_unlink_inode(trans, root, dir, inode,
1428 name, name_len);
1429 BUG_ON(ret);
1430 kfree(name);
1431 iput(inode);
1432
1433 /* there might still be more names under this key
1434 * check and repeat if required
1435 */
1436 ret = btrfs_search_slot(NULL, root, dir_key, path,
1437 0, 0);
1438 if (ret == 0)
1439 goto again;
1440 ret = 0;
1441 goto out;
1442 }
1443 btrfs_release_path(log, log_path);
1444 kfree(name);
1445
1446 ptr = (unsigned long)(di + 1);
1447 ptr += name_len;
1448 }
1449 ret = 0;
1450out:
1451 btrfs_release_path(root, path);
1452 btrfs_release_path(log, log_path);
1453 return ret;
1454}
1455
/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 *
 * The scan runs twice: first over BTRFS_DIR_ITEM_KEY entries using the
 * BTRFS_DIR_LOG_ITEM_KEY ranges, then over BTRFS_DIR_INDEX_KEY entries
 * using the BTRFS_DIR_LOG_INDEX_KEY ranges.
 *
 * 'path' is used for searches in both 'log' (by find_dir_range) and
 * 'root'; a private path is allocated for the lookups done in the log
 * by check_item_in_log.
 *
 * Returns -ENOMEM if that private path can't be allocated, the negative
 * value from btrfs_search_slot if searching the subvolume fails, and 0
 * otherwise (including when the directory inode can't be read).
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	/* one pass per key type; each pass restarts at offset 0 */
	range_start = 0;
	range_end = 0;
	while (1) {
		/* any non-zero return ends the scan for this key type */
		ret = find_dir_range(log, path, dirid, key_type,
				     &range_start, &range_end);
		if (ret != 0)
			break;

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			/*
			 * search again on every iteration: the previous
			 * check_item_in_log call released 'path'
			 */
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			/* ran off the end of this directory's items of
			 * the current type
			 */
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			/* past the range the log covers, go find the next */
			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir, &found_key);
			BUG_ON(ret);
			/* avoid wrapping dir_key.offset past the max key */
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(root, path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	/* whatever ended the pass above is not reported to the caller */
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(root, path);
		goto again;
	}
out:
	btrfs_release_path(root, path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}
1555
1556/*
1557 * the process_func used to replay items from the log tree. This
1558 * gets called in two different stages. The first stage just looks
1559 * for inodes and makes sure they are all copied into the subvolume.
1560 *
1561 * The second stage copies all the other item types from the log into
1562 * the subvolume. The two stage approach is slower, but gets rid of
1563 * lots of complexity around inodes referencing other inodes that exist
1564 * only in the log (references come from either directory items or inode
1565 * back refs).
1566 */
1567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568 struct walk_control *wc, u64 gen)
1569{
1570 int nritems;
1571 struct btrfs_path *path;
1572 struct btrfs_root *root = wc->replay_dest;
1573 struct btrfs_key key;
1574 u32 item_size;
1575 int level;
1576 int i;
1577 int ret;
1578
1579 btrfs_read_buffer(eb, gen);
1580
1581 level = btrfs_header_level(eb);
1582
1583 if (level != 0)
1584 return 0;
1585
1586 path = btrfs_alloc_path();
1587 BUG_ON(!path);
1588
1589 nritems = btrfs_header_nritems(eb);
1590 for (i = 0; i < nritems; i++) {
1591 btrfs_item_key_to_cpu(eb, &key, i);
1592 item_size = btrfs_item_size_nr(eb, i);
1593
1594 /* inode keys are done during the first stage */
1595 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596 wc->stage == LOG_WALK_REPLAY_INODES) {
1597 struct inode *inode;
1598 struct btrfs_inode_item *inode_item;
1599 u32 mode;
1600
1601 inode_item = btrfs_item_ptr(eb, i,
1602 struct btrfs_inode_item);
1603 mode = btrfs_inode_mode(eb, inode_item);
1604 if (S_ISDIR(mode)) {
1605 ret = replay_dir_deletes(wc->trans,
1606 root, log, path, key.objectid);
1607 BUG_ON(ret);
1608 }
1609 ret = overwrite_item(wc->trans, root, path,
1610 eb, i, &key);
1611 BUG_ON(ret);
1612
1613 /* for regular files, truncate away
1614 * extents past the new EOF
1615 */
1616 if (S_ISREG(mode)) {
1617 inode = read_one_inode(root,
1618 key.objectid);
1619 BUG_ON(!inode);
1620
1621 ret = btrfs_truncate_inode_items(wc->trans,
1622 root, inode, inode->i_size,
1623 BTRFS_EXTENT_DATA_KEY);
1624 BUG_ON(ret);
1625 iput(inode);
1626 }
1627 ret = link_to_fixup_dir(wc->trans, root,
1628 path, key.objectid);
1629 BUG_ON(ret);
1630 }
1631 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632 continue;
1633
1634 /* these keys are simply copied */
1635 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636 ret = overwrite_item(wc->trans, root, path,
1637 eb, i, &key);
1638 BUG_ON(ret);
1639 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640 ret = add_inode_ref(wc->trans, root, log, path,
1641 eb, i, &key);
1642 BUG_ON(ret && ret != -ENOENT);
1643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644 ret = replay_one_extent(wc->trans, root, path,
1645 eb, i, &key);
1646 BUG_ON(ret);
1647 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648 key.type == BTRFS_DIR_INDEX_KEY) {
1649 ret = replay_one_dir_item(wc->trans, root, path,
1650 eb, i, &key);
1651 BUG_ON(ret);
1652 }
1653 }
1654 btrfs_free_path(path);
1655 return 0;
1656}
1657
/*
 * Descend the log tree starting from path->nodes[*level] at
 * path->slots[*level].
 *
 * Every child block pointer that is visited is handed to
 * wc->process_func.  Children of a level 1 node are not descended into:
 * they are processed, optionally freed (wc->free), and the slot is
 * advanced.  For higher levels the child becomes the new
 * path->nodes[*level - 1] and the walk continues from its slot 0.
 *
 * When the slots of the current node are exhausted (or *level is 0 on
 * entry), the current node itself is passed to wc->process_func,
 * optionally freed, dropped from the path, and *level is incremented so
 * the caller can continue with walk_up_log_tree.
 *
 * Always returns 0; failures while freeing are BUG_ONs.
 *
 * NOTE(review): root_gen is assigned but never read, and the initial
 * value of ret is never returned.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		/* no more children in this node, finish it off below */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);
		root_gen = btrfs_header_generation(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);

		/* the callback sees the child before we descend or free it */
		wc->process_func(root, next, wc, ptr_gen);

		if (*level == 1) {
			/* children of a level 1 node are handled in place */
			path->slots[*level]++;
			if (wc->free) {
				btrfs_read_buffer(next, ptr_gen);

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				ret = btrfs_drop_leaf_ref(trans, root, next);
				BUG_ON(ret);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret);
			}
			free_extent_buffer(next);
			continue;
		}
		btrfs_read_buffer(next, ptr_gen);

		/* step down: 'next' replaces whatever was at the lower level */
		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	/* the root of the log has no parent, so it stands in for itself */
	if (path->nodes[*level] == root->node)
		parent = path->nodes[*level];
	else
		parent = path->nodes[*level + 1];

	bytenr = path->nodes[*level]->start;

	blocksize = btrfs_level_size(root, *level);
	root_owner = btrfs_header_owner(parent);
	root_gen = btrfs_header_generation(parent);

	/* now process (and optionally free) the node we just finished */
	wc->process_func(root, path->nodes[*level], wc,
			 btrfs_header_generation(path->nodes[*level]));

	if (wc->free) {
		next = path->nodes[*level];
		btrfs_tree_lock(next);
		clean_tree_block(trans, root, next);
		btrfs_wait_tree_block_writeback(next);
		btrfs_tree_unlock(next);

		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, next);
			BUG_ON(ret);
		}
		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
		BUG_ON(ret);
	}
	/* drop the path's reference and hand control back one level up */
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;

	cond_resched();
	return 0;
}
1771
1772static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root,
1774 struct btrfs_path *path, int *level,
1775 struct walk_control *wc)
1776{
1777 u64 root_owner;
1778 u64 root_gen;
1779 int i;
1780 int slot;
1781 int ret;
1782
1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784 slot = path->slots[i];
1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786 struct extent_buffer *node;
1787 node = path->nodes[i];
1788 path->slots[i]++;
1789 *level = i;
1790 WARN_ON(*level == 0);
1791 return 0;
1792 } else {
1793 struct extent_buffer *parent;
1794 if (path->nodes[*level] == root->node)
1795 parent = path->nodes[*level];
1796 else
1797 parent = path->nodes[*level + 1];
1798
1799 root_owner = btrfs_header_owner(parent);
1800 root_gen = btrfs_header_generation(parent);
1801 wc->process_func(root, path->nodes[*level], wc,
1802 btrfs_header_generation(path->nodes[*level]));
1803 if (wc->free) {
1804 struct extent_buffer *next;
1805
1806 next = path->nodes[*level];
1807
1808 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next);
1810 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next);
1812
1813 if (*level == 0) {
1814 ret = btrfs_drop_leaf_ref(trans, root,
1815 next);
1816 BUG_ON(ret);
1817 }
1818
1819 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820 ret = btrfs_free_reserved_extent(root,
1821 path->nodes[*level]->start,
1822 path->nodes[*level]->len);
1823 BUG_ON(ret);
1824 }
1825 free_extent_buffer(path->nodes[*level]);
1826 path->nodes[*level] = NULL;
1827 *level = i + 1;
1828 }
1829 }
1830 return 1;
1831}
1832
1833/*
1834 * drop the reference count on the tree rooted at 'snap'. This traverses
1835 * the tree freeing any blocks that have a ref count of zero after being
1836 * decremented.
1837 */
1838static int walk_log_tree(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *log, struct walk_control *wc)
1840{
1841 int ret = 0;
1842 int wret;
1843 int level;
1844 struct btrfs_path *path;
1845 int i;
1846 int orig_level;
1847
1848 path = btrfs_alloc_path();
1849 BUG_ON(!path);
1850
1851 level = btrfs_header_level(log->node);
1852 orig_level = level;
1853 path->nodes[level] = log->node;
1854 extent_buffer_get(log->node);
1855 path->slots[level] = 0;
1856
1857 while (1) {
1858 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859 if (wret > 0)
1860 break;
1861 if (wret < 0)
1862 ret = wret;
1863
1864 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865 if (wret > 0)
1866 break;
1867 if (wret < 0)
1868 ret = wret;
1869 }
1870
1871 /* was the root node processed? if not, catch it here */
1872 if (path->nodes[orig_level]) {
1873 wc->process_func(log, path->nodes[orig_level], wc,
1874 btrfs_header_generation(path->nodes[orig_level]));
1875 if (wc->free) {
1876 struct extent_buffer *next;
1877
1878 next = path->nodes[orig_level];
1879
1880 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next);
1882 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next);
1884
1885 if (orig_level == 0) {
1886 ret = btrfs_drop_leaf_ref(trans, log,
1887 next);
1888 BUG_ON(ret);
1889 }
1890 WARN_ON(log->root_key.objectid !=
1891 BTRFS_TREE_LOG_OBJECTID);
1892 ret = btrfs_free_reserved_extent(log, next->start,
1893 next->len);
1894 BUG_ON(ret);
1895 }
1896 }
1897
1898 for (i = 0; i <= orig_level; i++) {
1899 if (path->nodes[i]) {
1900 free_extent_buffer(path->nodes[i]);
1901 path->nodes[i] = NULL;
1902 }
1903 }
1904 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret;
1908}
1909
1910static int wait_log_commit(struct btrfs_root *log)
1911{
1912 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid;
1914
1915 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917 TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit))
1920 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid &&
1924 atomic_read(&log->fs_info->tree_log_commit));
1925 return 0;
1926}
1927
1928/*
1929 * btrfs_sync_log does sends a given tree log down to the disk and
1930 * updates the super blocks to record it. When this call is done,
1931 * you know that any inodes previously logged are safely on disk
1932 */
1933int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root)
1935{
1936 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root;
1939
1940 mutex_lock(&log->fs_info->tree_log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) {
1942 wait_log_commit(log);
1943 goto out;
1944 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947 while (1) {
1948 batch = log->fs_info->tree_log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex);
1950 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break;
1965 }
1966
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970 &root->fs_info->log_root_tree->dirty_log_pages);
1971 BUG_ON(ret);
1972
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978 write_ctree_super(trans, log->fs_info->tree_root, 2);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait))
1984 wake_up(&log->fs_info->tree_log_wait);
1985out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex);
1987 return 0;
1988}
1989
1990/* * free all the extents used by the tree log. This should be called
1991 * at commit time of the full transaction
1992 */
1993int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994{
1995 int ret;
1996 struct btrfs_root *log;
1997 struct key;
1998 u64 start;
1999 u64 end;
2000 struct walk_control wc = {
2001 .free = 1,
2002 .process_func = process_one_buffer
2003 };
2004
2005 if (!root->log_root || root->fs_info->log_root_recovering)
2006 return 0;
2007
2008 log = root->log_root;
2009 ret = walk_log_tree(trans, log, &wc);
2010 BUG_ON(ret);
2011
2012 while (1) {
2013 ret = find_first_extent_bit(&log->dirty_log_pages,
2014 0, &start, &end, EXTENT_DIRTY);
2015 if (ret)
2016 break;
2017
2018 clear_extent_dirty(&log->dirty_log_pages,
2019 start, end, GFP_NOFS);
2020 }
2021
2022 log = root->log_root;
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key);
2025 BUG_ON(ret);
2026 root->log_root = NULL;
2027 kfree(root->log_root);
2028 return 0;
2029}
2030
2031/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners:
2056 *
2057 * create file X in dir Y
2058 * link file X to X.link in dir Y
2059 * fsync file X
2060 * unlink file X but leave X.link
2061 * fsync dir Y
2062 *
2063 * After a crash we would expect only X.link to exist. But file X
2064 * didn't get fsync'd again so the log has back refs for X and X.link.
2065 *
2066 * We solve this by removing directory entries and inode backrefs from the
2067 * log when a file that was logged in the current transaction is
2068 * unlinked. Any later fsync will include the updated log entries, and
2069 * we'll be able to reconstruct the proper directory items from backrefs.
2070 *
2071 * This optimizations allows us to avoid relogging the entire inode
2072 * or the entire directory.
2073 */
2074int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075 struct btrfs_root *root,
2076 const char *name, int name_len,
2077 struct inode *dir, u64 index)
2078{
2079 struct btrfs_root *log;
2080 struct btrfs_dir_item *di;
2081 struct btrfs_path *path;
2082 int ret;
2083 int bytes_del = 0;
2084
2085 if (BTRFS_I(dir)->logged_trans < trans->transid)
2086 return 0;
2087
2088 ret = join_running_log_trans(root);
2089 if (ret)
2090 return 0;
2091
2092 mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094 log = root->log_root;
2095 path = btrfs_alloc_path();
2096 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097 name, name_len, -1);
2098 if (di && !IS_ERR(di)) {
2099 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100 bytes_del += name_len;
2101 BUG_ON(ret);
2102 }
2103 btrfs_release_path(log, path);
2104 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105 index, name, name_len, -1);
2106 if (di && !IS_ERR(di)) {
2107 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108 bytes_del += name_len;
2109 BUG_ON(ret);
2110 }
2111
2112 /* update the directory size in the log to reflect the names
2113 * we have removed
2114 */
2115 if (bytes_del) {
2116 struct btrfs_key key;
2117
2118 key.objectid = dir->i_ino;
2119 key.offset = 0;
2120 key.type = BTRFS_INODE_ITEM_KEY;
2121 btrfs_release_path(log, path);
2122
2123 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124 if (ret == 0) {
2125 struct btrfs_inode_item *item;
2126 u64 i_size;
2127
2128 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129 struct btrfs_inode_item);
2130 i_size = btrfs_inode_size(path->nodes[0], item);
2131 if (i_size > bytes_del)
2132 i_size -= bytes_del;
2133 else
2134 i_size = 0;
2135 btrfs_set_inode_size(path->nodes[0], item, i_size);
2136 btrfs_mark_buffer_dirty(path->nodes[0]);
2137 } else
2138 ret = 0;
2139 btrfs_release_path(log, path);
2140 }
2141
2142 btrfs_free_path(path);
2143 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144 end_log_trans(root);
2145
2146 return 0;
2147}
2148
2149/* see comments for btrfs_del_dir_entries_in_log */
2150int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root,
2152 const char *name, int name_len,
2153 struct inode *inode, u64 dirid)
2154{
2155 struct btrfs_root *log;
2156 u64 index;
2157 int ret;
2158
2159 if (BTRFS_I(inode)->logged_trans < trans->transid)
2160 return 0;
2161
2162 ret = join_running_log_trans(root);
2163 if (ret)
2164 return 0;
2165 log = root->log_root;
2166 mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169 dirid, &index);
2170 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171 end_log_trans(root);
2172
2173 return ret;
2174}
2175
2176/*
2177 * creates a range item in the log for 'dirid'. first_offset and
2178 * last_offset tell us which parts of the key space the log should
2179 * be considered authoritative for.
2180 */
2181static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *log,
2183 struct btrfs_path *path,
2184 int key_type, u64 dirid,
2185 u64 first_offset, u64 last_offset)
2186{
2187 int ret;
2188 struct btrfs_key key;
2189 struct btrfs_dir_log_item *item;
2190
2191 key.objectid = dirid;
2192 key.offset = first_offset;
2193 if (key_type == BTRFS_DIR_ITEM_KEY)
2194 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195 else
2196 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198 BUG_ON(ret);
2199
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203 btrfs_mark_buffer_dirty(path->nodes[0]);
2204 btrfs_release_path(log, path);
2205 return 0;
2206}
2207
/*
 * log all the items included in the current transaction for a given
 * directory.  This also creates the range items in the log tree required
 * to replay anything deleted before the fsync
 *
 * Only keys of 'key_type' at or after 'min_offset' are considered.  On
 * return *last_offset_ret holds the end of the range that was logged;
 * (u64)-1 means the rest of the key space for this type is covered.
 *
 * 'path' is used for the subvolume (and, at the end, for inserting the
 * range item into the log); 'dst_path' is used for copying into the log.
 *
 * Returns a negative value if the fallback search fails, otherwise 0.
 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path, int key_type,
			  u64 min_offset, u64 *last_offset_ret)
{
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src;
	int ret;
	int i;
	int nritems;
	u64 first_offset = min_offset;
	u64 last_offset = (u64)-1;

	/* NOTE(review): redundant, log was already initialized above */
	log = root->log_root;
	max_key.objectid = inode->i_ino;
	max_key.offset = (u64)-1;
	max_key.type = key_type;

	min_key.objectid = inode->i_ino;
	min_key.type = key_type;
	min_key.offset = min_offset;

	path->keep_locks = 1;

	/* look for the first key touched by the running transaction */
	ret = btrfs_search_forward(root, &min_key, &max_key,
				   path, 0, trans->transid);

	/*
	 * we didn't find anything from this transaction, see if there
	 * is anything at all
	 */
	if (ret != 0 || min_key.objectid != inode->i_ino ||
	    min_key.type != key_type) {
		min_key.objectid = inode->i_ino;
		min_key.type = key_type;
		min_key.offset = (u64)-1;
		btrfs_release_path(root, path);
		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
		if (ret < 0) {
			btrfs_release_path(root, path);
			return ret;
		}
		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);

		/* if ret == 0 there are items for this type,
		 * create a range to tell us the last key of this type.
		 * otherwise, there are no items in this directory after
		 * *min_offset, and we create a range to indicate that.
		 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			/* the logged range starts at this earlier key */
			first_offset = tmp.offset;
			/*
			 * NOTE(review): this return value is overwritten by
			 * the search below without being checked, unlike
			 * the other overwrite_item calls in this function
			 */
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
		}
	}
	btrfs_release_path(root, path);

	/* find the first key from this transaction again */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0) {
		WARN_ON(1);
		goto done;
	}

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			btrfs_item_key_to_cpu(src, &min_key, i);

			/* left our directory or key type: nothing more
			 * to log, last_offset stays at (u64)-1
			 */
			if (min_key.objectid != inode->i_ino ||
			    min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			BUG_ON(ret);
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			/*
			 * the next leaf is not from this transaction: log
			 * its first key as the end of the range and stop
			 */
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);

			BUG_ON(ret);
			last_offset = tmp.offset;
			goto done;
		}
	}
done:
	*last_offset_ret = last_offset;
	btrfs_release_path(root, path);
	btrfs_release_path(log, dst_path);

	/* insert the log range keys to indicate where the log is valid */
	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				 first_offset, last_offset);
	BUG_ON(ret);
	return 0;
}
2351
2352/*
2353 * logging directories is very similar to logging inodes, We find all the items
2354 * from the current transaction and write them to the log.
2355 *
2356 * The recovery code scans the directory in the subvolume, and if it finds a
2357 * key in the range logged that is not present in the log tree, then it means
2358 * that dir entry was unlinked during the transaction.
2359 *
2360 * In order for that scan to work, we must include one key smaller than
2361 * the smallest logged by this transaction and one key larger than the largest
2362 * key logged by this transaction.
2363 */
2364static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365 struct btrfs_root *root, struct inode *inode,
2366 struct btrfs_path *path,
2367 struct btrfs_path *dst_path)
2368{
2369 u64 min_key;
2370 u64 max_key;
2371 int ret;
2372 int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374again:
2375 min_key = 0;
2376 max_key = 0;
2377 while (1) {
2378 ret = log_dir_items(trans, root, inode, path,
2379 dst_path, key_type, min_key,
2380 &max_key);
2381 BUG_ON(ret);
2382 if (max_key == (u64)-1)
2383 break;
2384 min_key = max_key + 1;
2385 }
2386
2387 if (key_type == BTRFS_DIR_ITEM_KEY) {
2388 key_type = BTRFS_DIR_INDEX_KEY;
2389 goto again;
2390 }
2391 return 0;
2392}
2393
2394/*
2395 * a helper function to drop items from the log before we relog an
2396 * inode. max_key_type indicates the highest item type to remove.
2397 * This cannot be run for file data extents because it does not
2398 * free the extents they point to.
2399 */
2400static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *log,
2402 struct btrfs_path *path,
2403 u64 objectid, int max_key_type)
2404{
2405 int ret;
2406 struct btrfs_key key;
2407 struct btrfs_key found_key;
2408
2409 key.objectid = objectid;
2410 key.type = max_key_type;
2411 key.offset = (u64)-1;
2412
2413 while (1) {
2414 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416 if (ret != 1)
2417 break;
2418
2419 if (path->slots[0] == 0)
2420 break;
2421
2422 path->slots[0]--;
2423 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424 path->slots[0]);
2425
2426 if (found_key.objectid != objectid)
2427 break;
2428
2429 ret = btrfs_del_item(trans, log, path);
2430 BUG_ON(ret);
2431 btrfs_release_path(log, path);
2432 }
2433 btrfs_release_path(log, path);
2434 return 0;
2435}
2436
2437static noinline int copy_items(struct btrfs_trans_handle *trans,
2438 struct btrfs_root *log,
2439 struct btrfs_path *dst_path,
2440 struct extent_buffer *src,
2441 int start_slot, int nr, int inode_only)
2442{
2443 unsigned long src_offset;
2444 unsigned long dst_offset;
2445 struct btrfs_file_extent_item *extent;
2446 struct btrfs_inode_item *inode_item;
2447 int ret;
2448 struct btrfs_key *ins_keys;
2449 u32 *ins_sizes;
2450 char *ins_data;
2451 int i;
2452 struct list_head ordered_sums;
2453
2454 INIT_LIST_HEAD(&ordered_sums);
2455
2456 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457 nr * sizeof(u32), GFP_NOFS);
2458 ins_sizes = (u32 *)ins_data;
2459 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461 for (i = 0; i < nr; i++) {
2462 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464 }
2465 ret = btrfs_insert_empty_items(trans, log, dst_path,
2466 ins_keys, ins_sizes, nr);
2467 BUG_ON(ret);
2468
2469 for (i = 0; i < nr; i++) {
2470 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471 dst_path->slots[0]);
2472
2473 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476 src_offset, ins_sizes[i]);
2477
2478 if (inode_only == LOG_INODE_EXISTS &&
2479 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481 dst_path->slots[0],
2482 struct btrfs_inode_item);
2483 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485 /* set the generation to zero so the recover code
2486 * can tell the difference between an logging
2487 * just to say 'this inode exists' and a logging
2488 * to say 'update this inode with these values'
2489 */
2490 btrfs_set_inode_generation(dst_path->nodes[0],
2491 inode_item, 0);
2492 }
2493 /* take a reference on file data extents so that truncates
2494 * or deletes of this inode don't have to relog the inode
2495 * again
2496 */
2497 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498 int found_type;
2499 extent = btrfs_item_ptr(src, start_slot + i,
2500 struct btrfs_file_extent_item);
2501
2502 found_type = btrfs_file_extent_type(src, extent);
2503 if (found_type == BTRFS_FILE_EXTENT_REG ||
2504 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506 extent);
2507 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508 extent);
2509 u64 cs = btrfs_file_extent_offset(src, extent);
2510 u64 cl = btrfs_file_extent_num_bytes(src,
2511 extent);;
2512 if (btrfs_file_extent_compression(src,
2513 extent)) {
2514 cs = 0;
2515 cl = dl;
2516 }
2517 /* ds == 0 is a hole */
2518 if (ds != 0) {
2519 ret = btrfs_inc_extent_ref(trans, log,
2520 ds, dl,
2521 dst_path->nodes[0]->start,
2522 BTRFS_TREE_LOG_OBJECTID,
2523 trans->transid,
2524 ins_keys[i].objectid);
2525 BUG_ON(ret);
2526 ret = btrfs_lookup_csums_range(
2527 log->fs_info->csum_root,
2528 ds + cs, ds + cs + cl - 1,
2529 &ordered_sums);
2530 BUG_ON(ret);
2531 }
2532 }
2533 }
2534 dst_path->slots[0]++;
2535 }
2536
2537 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538 btrfs_release_path(log, dst_path);
2539 kfree(ins_data);
2540
2541 /*
2542 * we have to do this after the loop above to avoid changing the
2543 * log tree while trying to change the log tree.
2544 */
2545 while (!list_empty(&ordered_sums)) {
2546 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547 struct btrfs_ordered_sum,
2548 list);
2549 ret = btrfs_csum_file_blocks(trans, log, sums);
2550 BUG_ON(ret);
2551 list_del(&sums->list);
2552 kfree(sums);
2553 }
2554 return 0;
2555}
2556
2557/* log a single inode in the tree log.
2558 * At least one parent directory for this inode must exist in the tree
2559 * or be logged already.
2560 *
2561 * Any items from this inode changed by the current transaction are copied
2562 * to the log tree. An extra reference is taken on any extents in this
2563 * file, allowing us to avoid a whole pile of corner cases around logging
2564 * blocks that have been removed from the tree.
2565 *
2566 * See LOG_INODE_ALL and related defines for a description of what inode_only
2567 * does.
2568 *
2569 * This handles both files and directories.
2570 */
2571static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572 struct btrfs_root *root, struct inode *inode,
2573 int inode_only)
2574{
2575 struct btrfs_path *path;
2576 struct btrfs_path *dst_path;
2577 struct btrfs_key min_key;
2578 struct btrfs_key max_key;
2579 struct btrfs_root *log = root->log_root;
2580 struct extent_buffer *src = NULL;
2581 u32 size;
2582 int ret;
2583 int nritems;
2584 int ins_start_slot = 0;
2585 int ins_nr;
2586
2587 log = root->log_root;
2588
2589 path = btrfs_alloc_path();
2590 dst_path = btrfs_alloc_path();
2591
2592 min_key.objectid = inode->i_ino;
2593 min_key.type = BTRFS_INODE_ITEM_KEY;
2594 min_key.offset = 0;
2595
2596 max_key.objectid = inode->i_ino;
2597 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599 else
2600 max_key.type = (u8)-1;
2601 max_key.offset = (u64)-1;
2602
2603 /*
2604 * if this inode has already been logged and we're in inode_only
2605 * mode, we don't want to delete the things that have already
2606 * been written to the log.
2607 *
2608 * But, if the inode has been through an inode_only log,
2609 * the logged_trans field is not set. This allows us to catch
2610 * any new names for this inode in the backrefs by logging it
2611 * again
2612 */
2613 if (inode_only == LOG_INODE_EXISTS &&
2614 BTRFS_I(inode)->logged_trans == trans->transid) {
2615 btrfs_free_path(path);
2616 btrfs_free_path(dst_path);
2617 goto out;
2618 }
2619 mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621 /*
2622 * a brute force approach to making sure we get the most uptodate
2623 * copies of everything.
2624 */
2625 if (S_ISDIR(inode->i_mode)) {
2626 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628 if (inode_only == LOG_INODE_EXISTS)
2629 max_key_type = BTRFS_XATTR_ITEM_KEY;
2630 ret = drop_objectid_items(trans, log, path,
2631 inode->i_ino, max_key_type);
2632 } else {
2633 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634 }
2635 BUG_ON(ret);
2636 path->keep_locks = 1;
2637
2638 while (1) {
2639 ins_nr = 0;
2640 ret = btrfs_search_forward(root, &min_key, &max_key,
2641 path, 0, trans->transid);
2642 if (ret != 0)
2643 break;
2644again:
2645 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646 if (min_key.objectid != inode->i_ino)
2647 break;
2648 if (min_key.type > max_key.type)
2649 break;
2650
2651 src = path->nodes[0];
2652 size = btrfs_item_size_nr(src, path->slots[0]);
2653 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654 ins_nr++;
2655 goto next_slot;
2656 } else if (!ins_nr) {
2657 ins_start_slot = path->slots[0];
2658 ins_nr = 1;
2659 goto next_slot;
2660 }
2661
2662 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663 ins_nr, inode_only);
2664 BUG_ON(ret);
2665 ins_nr = 1;
2666 ins_start_slot = path->slots[0];
2667next_slot:
2668
2669 nritems = btrfs_header_nritems(path->nodes[0]);
2670 path->slots[0]++;
2671 if (path->slots[0] < nritems) {
2672 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673 path->slots[0]);
2674 goto again;
2675 }
2676 if (ins_nr) {
2677 ret = copy_items(trans, log, dst_path, src,
2678 ins_start_slot,
2679 ins_nr, inode_only);
2680 BUG_ON(ret);
2681 ins_nr = 0;
2682 }
2683 btrfs_release_path(root, path);
2684
2685 if (min_key.offset < (u64)-1)
2686 min_key.offset++;
2687 else if (min_key.type < (u8)-1)
2688 min_key.type++;
2689 else if (min_key.objectid < (u64)-1)
2690 min_key.objectid++;
2691 else
2692 break;
2693 }
2694 if (ins_nr) {
2695 ret = copy_items(trans, log, dst_path, src,
2696 ins_start_slot,
2697 ins_nr, inode_only);
2698 BUG_ON(ret);
2699 ins_nr = 0;
2700 }
2701 WARN_ON(ins_nr);
2702 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703 btrfs_release_path(root, path);
2704 btrfs_release_path(log, dst_path);
2705 BTRFS_I(inode)->log_dirty_trans = 0;
2706 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707 BUG_ON(ret);
2708 }
2709 BTRFS_I(inode)->logged_trans = trans->transid;
2710 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712 btrfs_free_path(path);
2713 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out:
2720 return 0;
2721}
2722
/*
 * Log one inode: brackets __btrfs_log_inode() with start_log_trans() and
 * end_log_trans() and hands back whatever __btrfs_log_inode() returned.
 */
int btrfs_log_inode(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct inode *inode,
		    int inode_only)
{
	int err;

	start_log_trans(trans, root);

	err = __btrfs_log_inode(trans, root, inode, inode_only);

	end_log_trans(root);

	return err;
}
2734
2735/*
2736 * helper function around btrfs_log_inode to make sure newly created
2737 * parent directories also end up in the log. A minimal inode and backref
2738 * only logging is done of any parent directories that are older than
2739 * the last committed transaction
2740 */
2741int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *root, struct dentry *dentry)
2743{
2744 int inode_only = LOG_INODE_ALL;
2745 struct super_block *sb;
2746 int ret;
2747
2748 start_log_trans(trans, root);
2749 sb = dentry->d_inode->i_sb;
2750 while (1) {
2751 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752 inode_only);
2753 BUG_ON(ret);
2754 inode_only = LOG_INODE_EXISTS;
2755
2756 dentry = dentry->d_parent;
2757 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758 break;
2759
2760 if (BTRFS_I(dentry->d_inode)->generation <=
2761 root->fs_info->last_trans_committed)
2762 break;
2763 }
2764 end_log_trans(root);
2765 return 0;
2766}
2767
2768/*
2769 * it is not safe to log dentry if the chunk root has added new
2770 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2771 * If this returns 1, you must commit the transaction to safely get your
2772 * data on disk.
2773 */
2774int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775 struct btrfs_root *root, struct dentry *dentry)
2776{
2777 u64 gen;
2778 gen = root->fs_info->last_trans_new_blockgroup;
2779 if (gen > root->fs_info->last_trans_committed)
2780 return 1;
2781 else
2782 return btrfs_log_dentry(trans, root, dentry);
2783}
2784
2785/*
2786 * should be called during mount to recover any replay any log trees
2787 * from the FS
2788 */
2789int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790{
2791 int ret;
2792 struct btrfs_path *path;
2793 struct btrfs_trans_handle *trans;
2794 struct btrfs_key key;
2795 struct btrfs_key found_key;
2796 struct btrfs_key tmp_key;
2797 struct btrfs_root *log;
2798 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799 u64 highest_inode;
2800 struct walk_control wc = {
2801 .process_func = process_one_buffer,
2802 .stage = 0,
2803 };
2804
2805 fs_info->log_root_recovering = 1;
2806 path = btrfs_alloc_path();
2807 BUG_ON(!path);
2808
2809 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811 wc.trans = trans;
2812 wc.pin = 1;
2813
2814 walk_log_tree(trans, log_root_tree, &wc);
2815
2816again:
2817 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818 key.offset = (u64)-1;
2819 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821 while (1) {
2822 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823 if (ret < 0)
2824 break;
2825 if (ret > 0) {
2826 if (path->slots[0] == 0)
2827 break;
2828 path->slots[0]--;
2829 }
2830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831 path->slots[0]);
2832 btrfs_release_path(log_root_tree, path);
2833 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834 break;
2835
2836 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837 &found_key);
2838 BUG_ON(!log);
2839
2840
2841 tmp_key.objectid = found_key.offset;
2842 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843 tmp_key.offset = (u64)-1;
2844
2845 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846 BUG_ON(!wc.replay_dest);
2847
2848 wc.replay_dest->log_root = log;
2849 btrfs_record_root_in_trans(wc.replay_dest);
2850 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret);
2852
2853 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855 path);
2856 BUG_ON(ret);
2857 }
2858 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859 if (ret == 0) {
2860 wc.replay_dest->highest_inode = highest_inode;
2861 wc.replay_dest->last_inode_alloc = highest_inode;
2862 }
2863
2864 key.offset = found_key.offset - 1;
2865 wc.replay_dest->log_root = NULL;
2866 free_extent_buffer(log->node);
2867 kfree(log);
2868
2869 if (found_key.offset == 0)
2870 break;
2871 }
2872 btrfs_release_path(log_root_tree, path);
2873
2874 /* step one is to pin it all, step two is to replay just inodes */
2875 if (wc.pin) {
2876 wc.pin = 0;
2877 wc.process_func = replay_one_buffer;
2878 wc.stage = LOG_WALK_REPLAY_INODES;
2879 goto again;
2880 }
2881 /* step three is to replay everything */
2882 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883 wc.stage++;
2884 goto again;
2885 }
2886
2887 btrfs_free_path(path);
2888
2889 free_extent_buffer(log_root_tree->node);
2890 log_root_tree->log_root = NULL;
2891 fs_info->log_root_recovering = 0;
2892
2893 /* step 4: commit the transaction, which also unpins the blocks */
2894 btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896 kfree(log_root_tree);
2897 return 0;
2898}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
/*
 * Default build version string.  version.sh regenerates a version header
 * and moves it over this file when the generated content differs.
 */
#ifndef __BTRFS_VERSION_H
#define __BTRFS_VERSION_H
#define BTRFS_BUILD_VERSION "Btrfs"
#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
#!/bin/bash
#
# determine-version -- report a useful version for releases
#
# Copyright 2008, Aron Griffis <agriffis@n01se.net>
# Copyright 2008, Oracle
# Released under the GNU GPLv2
#
# Writes a candidate header to .build-version.h and installs it as
# version.h only when its content differs from the current version.h.

# fallback used when no git information is available
v="v0.16"

# only ask git when it is installed and we are inside a repository
if which git &> /dev/null && git branch &> /dev/null; then
	if git rev-parse --verify HEAD > /dev/null 2>&1; then
		if tag=$(git describe --tags 2>/dev/null); then
			v="$tag"
		fi

		# Are there uncommitted changes?
		git update-index --refresh --unmerged > /dev/null
		if git diff-index --name-only HEAD |
		    grep -v "^scripts/package" |
		    read dummy; then
			v="$v"-dirty
		fi
	fi
fi

{
	echo "#ifndef __BUILD_VERSION"
	echo "#define __BUILD_VERSION"
	echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\""
	echo "#endif"
} > .build-version.h

# nothing changed: drop the candidate and leave version.h untouched
if diff -q version.h .build-version.h &> /dev/null; then
	rm .build-version.h
	exit 0
fi

mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b187b537888e
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
34struct map_lookup {
35 u64 type;
36 int io_align;
37 int io_width;
38 int stripe_len;
39 int sector_size;
40 int num_stripes;
41 int sub_stripes;
42 struct btrfs_bio_stripe stripes[];
43};
44
/* forward declarations for helpers defined later in this file */
static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

/* bytes needed for a struct map_lookup that carries n stripes */
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))

/* uuid_mutex is taken around accesses to the fs_uuids list below */
static DEFINE_MUTEX(uuid_mutex);
/* every struct btrfs_fs_devices found so far, linked through ->list */
static LIST_HEAD(fs_uuids);
55
56void btrfs_lock_volumes(void)
57{
58 mutex_lock(&uuid_mutex);
59}
60
61void btrfs_unlock_volumes(void)
62{
63 mutex_unlock(&uuid_mutex);
64}
65
66static void lock_chunks(struct btrfs_root *root)
67{
68 mutex_lock(&root->fs_info->chunk_mutex);
69}
70
71static void unlock_chunks(struct btrfs_root *root)
72{
73 mutex_unlock(&root->fs_info->chunk_mutex);
74}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
132/*
133 * we try to collect pending bios for a device so we don't get a large
134 * number of procs sending bios down to the same device. This greatly
135 * improves the schedulers ability to collect and merge the bios.
136 *
137 * But, it also turns into a long list of bios to process and that is sure
138 * to eventually make the worker thread block. The solution here is to
139 * make some progress and then put this work struct back at the end of
140 * the list if the block device is congested. This way, multiple devices
141 * can make progress from a single worker thread.
142 */
143static noinline int run_scheduled_bios(struct btrfs_device *device)
144{
145 struct bio *pending;
146 struct backing_dev_info *bdi;
147 struct btrfs_fs_info *fs_info;
148 struct bio *tail;
149 struct bio *cur;
150 int again = 0;
151 unsigned long num_run = 0;
152 unsigned long limit;
153
154 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
155 fs_info = device->dev_root->fs_info;
156 limit = btrfs_async_submit_limit(fs_info);
157 limit = limit * 2 / 3;
158
159loop:
160 spin_lock(&device->io_lock);
161
162 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted
165 * into the list if we hit congestion
166 */
167 pending = device->pending_bios;
168 tail = device->pending_bio_tail;
169 WARN_ON(pending && !tail);
170 device->pending_bios = NULL;
171 device->pending_bio_tail = NULL;
172
173 /*
174 * if pending was null this time around, no bios need processing
175 * at all and we can stop. Otherwise it'll loop back up again
176 * and do an additional check so no bios are missed.
177 *
178 * device->running_pending is used to synchronize with the
179 * schedule_bio code.
180 */
181 if (pending) {
182 again = 1;
183 device->running_pending = 1;
184 } else {
185 again = 0;
186 device->running_pending = 0;
187 }
188 spin_unlock(&device->io_lock);
189
190 while (pending) {
191 cur = pending;
192 pending = pending->bi_next;
193 cur->bi_next = NULL;
194 atomic_dec(&fs_info->nr_async_bios);
195
196 if (atomic_read(&fs_info->nr_async_bios) < limit &&
197 waitqueue_active(&fs_info->async_submit_wait))
198 wake_up(&fs_info->async_submit_wait);
199
200 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
201 bio_get(cur);
202 submit_bio(cur->bi_rw, cur);
203 bio_put(cur);
204 num_run++;
205
206 /*
207 * we made progress, there is more work to do and the bdi
208 * is now congested. Back off and let other work structs
209 * run instead
210 */
211 if (pending && bdi_write_congested(bdi) &&
212 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head;
214
215 spin_lock(&device->io_lock);
216
217 old_head = device->pending_bios;
218 device->pending_bios = pending;
219 if (device->pending_bio_tail)
220 tail->bi_next = old_head;
221 else
222 device->pending_bio_tail = tail;
223
224 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work);
226 goto done;
227 }
228 }
229 if (again)
230 goto loop;
231done:
232 return 0;
233}
234
235static void pending_bios_fn(struct btrfs_work *work)
236{
237 struct btrfs_device *device;
238
239 device = container_of(work, struct btrfs_device, work);
240 run_scheduled_bios(device);
241}
242
243static noinline int device_list_add(const char *path,
244 struct btrfs_super_block *disk_super,
245 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
246{
247 struct btrfs_device *device;
248 struct btrfs_fs_devices *fs_devices;
249 u64 found_transid = btrfs_super_generation(disk_super);
250
251 fs_devices = find_fsid(disk_super->fsid);
252 if (!fs_devices) {
253 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
254 if (!fs_devices)
255 return -ENOMEM;
256 INIT_LIST_HEAD(&fs_devices->devices);
257 INIT_LIST_HEAD(&fs_devices->alloc_list);
258 list_add(&fs_devices->list, &fs_uuids);
259 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
260 fs_devices->latest_devid = devid;
261 fs_devices->latest_trans = found_transid;
262 device = NULL;
263 } else {
264 device = __find_device(&fs_devices->devices, devid,
265 disk_super->dev_item.uuid);
266 }
267 if (!device) {
268 if (fs_devices->opened)
269 return -EBUSY;
270
271 device = kzalloc(sizeof(*device), GFP_NOFS);
272 if (!device) {
273 /* we can safely leave the fs_devices entry around */
274 return -ENOMEM;
275 }
276 device->devid = devid;
277 device->work.func = pending_bios_fn;
278 memcpy(device->uuid, disk_super->dev_item.uuid,
279 BTRFS_UUID_SIZE);
280 device->barriers = 1;
281 spin_lock_init(&device->io_lock);
282 device->name = kstrdup(path, GFP_NOFS);
283 if (!device->name) {
284 kfree(device);
285 return -ENOMEM;
286 }
287 INIT_LIST_HEAD(&device->dev_alloc_list);
288 list_add(&device->dev_list, &fs_devices->devices);
289 device->fs_devices = fs_devices;
290 fs_devices->num_devices++;
291 }
292
293 if (found_transid > fs_devices->latest_trans) {
294 fs_devices->latest_devid = devid;
295 fs_devices->latest_trans = found_transid;
296 }
297 *fs_devices_ret = fs_devices;
298 return 0;
299}
300
301static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
302{
303 struct btrfs_fs_devices *fs_devices;
304 struct btrfs_device *device;
305 struct btrfs_device *orig_dev;
306
307 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
308 if (!fs_devices)
309 return ERR_PTR(-ENOMEM);
310
311 INIT_LIST_HEAD(&fs_devices->devices);
312 INIT_LIST_HEAD(&fs_devices->alloc_list);
313 INIT_LIST_HEAD(&fs_devices->list);
314 fs_devices->latest_devid = orig->latest_devid;
315 fs_devices->latest_trans = orig->latest_trans;
316 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
317
318 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
319 device = kzalloc(sizeof(*device), GFP_NOFS);
320 if (!device)
321 goto error;
322
323 device->name = kstrdup(orig_dev->name, GFP_NOFS);
324 if (!device->name)
325 goto error;
326
327 device->devid = orig_dev->devid;
328 device->work.func = pending_bios_fn;
329 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
330 device->barriers = 1;
331 spin_lock_init(&device->io_lock);
332 INIT_LIST_HEAD(&device->dev_list);
333 INIT_LIST_HEAD(&device->dev_alloc_list);
334
335 list_add(&device->dev_list, &fs_devices->devices);
336 device->fs_devices = fs_devices;
337 fs_devices->num_devices++;
338 }
339 return fs_devices;
340error:
341 free_fs_devices(fs_devices);
342 return ERR_PTR(-ENOMEM);
343}
344
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{
347 struct list_head *tmp;
348 struct list_head *cur;
349 struct btrfs_device *device;
350
351 mutex_lock(&uuid_mutex);
352again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata)
356 continue;
357
358 if (device->bdev) {
359 close_bdev_exclusive(device->bdev, device->mode);
360 device->bdev = NULL;
361 fs_devices->open_devices--;
362 }
363 if (device->writeable) {
364 list_del_init(&device->dev_alloc_list);
365 device->writeable = 0;
366 fs_devices->rw_devices--;
367 }
368 list_del_init(&device->dev_list);
369 fs_devices->num_devices--;
370 kfree(device->name);
371 kfree(device);
372 }
373
374 if (fs_devices->seed) {
375 fs_devices = fs_devices->seed;
376 goto again;
377 }
378
379 mutex_unlock(&uuid_mutex);
380 return 0;
381}
382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{
385 struct list_head *cur;
386 struct btrfs_device *device;
387
388 if (--fs_devices->opened > 0)
389 return 0;
390
391 list_for_each(cur, &fs_devices->devices) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--;
396 }
397 if (device->writeable) {
398 list_del_init(&device->dev_alloc_list);
399 fs_devices->rw_devices--;
400 }
401
402 device->bdev = NULL;
403 device->writeable = 0;
404 device->in_fs_metadata = 0;
405 }
406 WARN_ON(fs_devices->open_devices);
407 WARN_ON(fs_devices->rw_devices);
408 fs_devices->opened = 0;
409 fs_devices->seeding = 0;
410
411 return 0;
412}
413
414int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
415{
416 struct btrfs_fs_devices *seed_devices = NULL;
417 int ret;
418
419 mutex_lock(&uuid_mutex);
420 ret = __btrfs_close_devices(fs_devices);
421 if (!fs_devices->opened) {
422 seed_devices = fs_devices->seed;
423 fs_devices->seed = NULL;
424 }
425 mutex_unlock(&uuid_mutex);
426
427 while (seed_devices) {
428 fs_devices = seed_devices;
429 seed_devices = fs_devices->seed;
430 __btrfs_close_devices(fs_devices);
431 free_fs_devices(fs_devices);
432 }
433 return ret;
434}
435
436static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
437 fmode_t flags, void *holder)
438{
439 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh;
445 struct btrfs_super_block *disk_super;
446 u64 latest_devid = 0;
447 u64 latest_transid = 0;
448 u64 devid;
449 int seeding = 1;
450 int ret = 0;
451
452 list_for_each(cur, head) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev)
455 continue;
456 if (!device->name)
457 continue;
458
459 bdev = open_bdev_exclusive(device->name, flags, holder);
460 if (IS_ERR(bdev)) {
461 printk(KERN_INFO "open %s failed\n", device->name);
462 goto error;
463 }
464 set_blocksize(bdev, 4096);
465
466 bh = btrfs_read_dev_super(bdev);
467 if (!bh)
468 goto error_close;
469
470 disk_super = (struct btrfs_super_block *)bh->b_data;
471 devid = le64_to_cpu(disk_super->dev_item.devid);
472 if (devid != device->devid)
473 goto error_brelse;
474
475 if (memcmp(device->uuid, disk_super->dev_item.uuid,
476 BTRFS_UUID_SIZE))
477 goto error_brelse;
478
479 device->generation = btrfs_super_generation(disk_super);
480 if (!latest_transid || device->generation > latest_transid) {
481 latest_devid = devid;
482 latest_transid = device->generation;
483 latest_bdev = bdev;
484 }
485
486 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
487 device->writeable = 0;
488 } else {
489 device->writeable = !bdev_read_only(bdev);
490 seeding = 0;
491 }
492
493 device->bdev = bdev;
494 device->in_fs_metadata = 0;
495 device->mode = flags;
496
497 fs_devices->open_devices++;
498 if (device->writeable) {
499 fs_devices->rw_devices++;
500 list_add(&device->dev_alloc_list,
501 &fs_devices->alloc_list);
502 }
503 continue;
504
505error_brelse:
506 brelse(bh);
507error_close:
508 close_bdev_exclusive(bdev, FMODE_READ);
509error:
510 continue;
511 }
512 if (fs_devices->open_devices == 0) {
513 ret = -EIO;
514 goto out;
515 }
516 fs_devices->seeding = seeding;
517 fs_devices->opened = 1;
518 fs_devices->latest_bdev = latest_bdev;
519 fs_devices->latest_devid = latest_devid;
520 fs_devices->latest_trans = latest_transid;
521 fs_devices->total_rw_bytes = 0;
522out:
523 return ret;
524}
525
526int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
527 fmode_t flags, void *holder)
528{
529 int ret;
530
531 mutex_lock(&uuid_mutex);
532 if (fs_devices->opened) {
533 fs_devices->opened++;
534 ret = 0;
535 } else {
536 ret = __btrfs_open_devices(fs_devices, flags, holder);
537 }
538 mutex_unlock(&uuid_mutex);
539 return ret;
540}
541
542int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
543 struct btrfs_fs_devices **fs_devices_ret)
544{
545 struct btrfs_super_block *disk_super;
546 struct block_device *bdev;
547 struct buffer_head *bh;
548 int ret;
549 u64 devid;
550 u64 transid;
551
552 mutex_lock(&uuid_mutex);
553
554 bdev = open_bdev_exclusive(path, flags, holder);
555
556 if (IS_ERR(bdev)) {
557 ret = PTR_ERR(bdev);
558 goto error;
559 }
560
561 ret = set_blocksize(bdev, 4096);
562 if (ret)
563 goto error_close;
564 bh = btrfs_read_dev_super(bdev);
565 if (!bh) {
566 ret = -EIO;
567 goto error_close;
568 }
569 disk_super = (struct btrfs_super_block *)bh->b_data;
570 devid = le64_to_cpu(disk_super->dev_item.devid);
571 transid = btrfs_super_generation(disk_super);
572 if (disk_super->label[0])
573 printk(KERN_INFO "device label %s ", disk_super->label);
574 else {
575 /* FIXME, make a readl uuid parser */
576 printk(KERN_INFO "device fsid %llx-%llx ",
577 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8));
579 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583
584 brelse(bh);
585error_close:
586 close_bdev_exclusive(bdev, flags);
587error:
588 mutex_unlock(&uuid_mutex);
589 return ret;
590}
591
592/*
593 * this uses a pretty simple search, the expectation is that it is
594 * called very infrequently and that a given device has a small number
595 * of extents
596 */
597static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
598 struct btrfs_device *device,
599 u64 num_bytes, u64 *start)
600{
601 struct btrfs_key key;
602 struct btrfs_root *root = device->dev_root;
603 struct btrfs_dev_extent *dev_extent = NULL;
604 struct btrfs_path *path;
605 u64 hole_size = 0;
606 u64 last_byte = 0;
607 u64 search_start = 0;
608 u64 search_end = device->total_bytes;
609 int ret;
610 int slot = 0;
611 int start_found;
612 struct extent_buffer *l;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617 path->reada = 2;
618 start_found = 0;
619
620 /* FIXME use last free of some kind */
621
622 /* we don't want to overwrite the superblock on the drive,
623 * so we make sure to start at an offset of at least 1MB
624 */
625 search_start = max((u64)1024 * 1024, search_start);
626
627 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
628 search_start = max(root->fs_info->alloc_start, search_start);
629
630 key.objectid = device->devid;
631 key.offset = search_start;
632 key.type = BTRFS_DEV_EXTENT_KEY;
633 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
634 if (ret < 0)
635 goto error;
636 ret = btrfs_previous_item(root, path, 0, key.type);
637 if (ret < 0)
638 goto error;
639 l = path->nodes[0];
640 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
641 while (1) {
642 l = path->nodes[0];
643 slot = path->slots[0];
644 if (slot >= btrfs_header_nritems(l)) {
645 ret = btrfs_next_leaf(root, path);
646 if (ret == 0)
647 continue;
648 if (ret < 0)
649 goto error;
650no_more_items:
651 if (!start_found) {
652 if (search_start >= search_end) {
653 ret = -ENOSPC;
654 goto error;
655 }
656 *start = search_start;
657 start_found = 1;
658 goto check_pending;
659 }
660 *start = last_byte > search_start ?
661 last_byte : search_start;
662 if (search_end <= *start) {
663 ret = -ENOSPC;
664 goto error;
665 }
666 goto check_pending;
667 }
668 btrfs_item_key_to_cpu(l, &key, slot);
669
670 if (key.objectid < device->devid)
671 goto next;
672
673 if (key.objectid > device->devid)
674 goto no_more_items;
675
676 if (key.offset >= search_start && key.offset > last_byte &&
677 start_found) {
678 if (last_byte < search_start)
679 last_byte = search_start;
680 hole_size = key.offset - last_byte;
681 if (key.offset > last_byte &&
682 hole_size >= num_bytes) {
683 *start = last_byte;
684 goto check_pending;
685 }
686 }
687 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
688 goto next;
689
690 start_found = 1;
691 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
692 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
693next:
694 path->slots[0]++;
695 cond_resched();
696 }
697check_pending:
698 /* we have to make sure we didn't find an extent that has already
699 * been allocated by the map tree or the original allocation
700 */
701 BUG_ON(*start < search_start);
702
703 if (*start + num_bytes > search_end) {
704 ret = -ENOSPC;
705 goto error;
706 }
707 /* check for pending inserts here */
708 ret = 0;
709
710error:
711 btrfs_free_path(path);
712 return ret;
713}
714
715static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
716 struct btrfs_device *device,
717 u64 start)
718{
719 int ret;
720 struct btrfs_path *path;
721 struct btrfs_root *root = device->dev_root;
722 struct btrfs_key key;
723 struct btrfs_key found_key;
724 struct extent_buffer *leaf = NULL;
725 struct btrfs_dev_extent *extent = NULL;
726
727 path = btrfs_alloc_path();
728 if (!path)
729 return -ENOMEM;
730
731 key.objectid = device->devid;
732 key.offset = start;
733 key.type = BTRFS_DEV_EXTENT_KEY;
734
735 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid,
738 BTRFS_DEV_EXTENT_KEY);
739 BUG_ON(ret);
740 leaf = path->nodes[0];
741 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
742 extent = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_dev_extent);
744 BUG_ON(found_key.offset > start || found_key.offset +
745 btrfs_dev_extent_length(leaf, extent) < start);
746 ret = 0;
747 } else if (ret == 0) {
748 leaf = path->nodes[0];
749 extent = btrfs_item_ptr(leaf, path->slots[0],
750 struct btrfs_dev_extent);
751 }
752 BUG_ON(ret);
753
754 if (device->bytes_used > 0)
755 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
756 ret = btrfs_del_item(trans, root, path);
757 BUG_ON(ret);
758
759 btrfs_free_path(path);
760 return ret;
761}
762
763int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
764 struct btrfs_device *device,
765 u64 chunk_tree, u64 chunk_objectid,
766 u64 chunk_offset, u64 start, u64 num_bytes)
767{
768 int ret;
769 struct btrfs_path *path;
770 struct btrfs_root *root = device->dev_root;
771 struct btrfs_dev_extent *extent;
772 struct extent_buffer *leaf;
773 struct btrfs_key key;
774
775 WARN_ON(!device->in_fs_metadata);
776 path = btrfs_alloc_path();
777 if (!path)
778 return -ENOMEM;
779
780 key.objectid = device->devid;
781 key.offset = start;
782 key.type = BTRFS_DEV_EXTENT_KEY;
783 ret = btrfs_insert_empty_item(trans, root, path, &key,
784 sizeof(*extent));
785 BUG_ON(ret);
786
787 leaf = path->nodes[0];
788 extent = btrfs_item_ptr(leaf, path->slots[0],
789 struct btrfs_dev_extent);
790 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
791 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
792 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
793
794 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
795 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
796 BTRFS_UUID_SIZE);
797
798 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
799 btrfs_mark_buffer_dirty(leaf);
800 btrfs_free_path(path);
801 return ret;
802}
803
804static noinline int find_next_chunk(struct btrfs_root *root,
805 u64 objectid, u64 *offset)
806{
807 struct btrfs_path *path;
808 int ret;
809 struct btrfs_key key;
810 struct btrfs_chunk *chunk;
811 struct btrfs_key found_key;
812
813 path = btrfs_alloc_path();
814 BUG_ON(!path);
815
816 key.objectid = objectid;
817 key.offset = (u64)-1;
818 key.type = BTRFS_CHUNK_ITEM_KEY;
819
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret < 0)
822 goto error;
823
824 BUG_ON(ret == 0);
825
826 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
827 if (ret) {
828 *offset = 0;
829 } else {
830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
831 path->slots[0]);
832 if (found_key.objectid != objectid)
833 *offset = 0;
834 else {
835 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
836 struct btrfs_chunk);
837 *offset = found_key.offset +
838 btrfs_chunk_length(path->nodes[0], chunk);
839 }
840 }
841 ret = 0;
842error:
843 btrfs_free_path(path);
844 return ret;
845}
846
847static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
848{
849 int ret;
850 struct btrfs_key key;
851 struct btrfs_key found_key;
852 struct btrfs_path *path;
853
854 root = root->fs_info->chunk_root;
855
856 path = btrfs_alloc_path();
857 if (!path)
858 return -ENOMEM;
859
860 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
861 key.type = BTRFS_DEV_ITEM_KEY;
862 key.offset = (u64)-1;
863
864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
865 if (ret < 0)
866 goto error;
867
868 BUG_ON(ret == 0);
869
870 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
871 BTRFS_DEV_ITEM_KEY);
872 if (ret) {
873 *objectid = 1;
874 } else {
875 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
876 path->slots[0]);
877 *objectid = found_key.offset + 1;
878 }
879 ret = 0;
880error:
881 btrfs_free_path(path);
882 return ret;
883}
884
885/*
886 * the device information is stored in the chunk root
887 * the btrfs_device struct should be fully filled in
888 */
889int btrfs_add_device(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root,
891 struct btrfs_device *device)
892{
893 int ret;
894 struct btrfs_path *path;
895 struct btrfs_dev_item *dev_item;
896 struct extent_buffer *leaf;
897 struct btrfs_key key;
898 unsigned long ptr;
899
900 root = root->fs_info->chunk_root;
901
902 path = btrfs_alloc_path();
903 if (!path)
904 return -ENOMEM;
905
906 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
907 key.type = BTRFS_DEV_ITEM_KEY;
908 key.offset = device->devid;
909
910 ret = btrfs_insert_empty_item(trans, root, path, &key,
911 sizeof(*dev_item));
912 if (ret)
913 goto out;
914
915 leaf = path->nodes[0];
916 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
917
918 btrfs_set_device_id(leaf, dev_item, device->devid);
919 btrfs_set_device_generation(leaf, dev_item, 0);
920 btrfs_set_device_type(leaf, dev_item, device->type);
921 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
922 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
923 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
924 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
925 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
926 btrfs_set_device_group(leaf, dev_item, 0);
927 btrfs_set_device_seek_speed(leaf, dev_item, 0);
928 btrfs_set_device_bandwidth(leaf, dev_item, 0);
929 btrfs_set_device_start_offset(leaf, dev_item, 0);
930
931 ptr = (unsigned long)btrfs_device_uuid(dev_item);
932 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
933 ptr = (unsigned long)btrfs_device_fsid(dev_item);
934 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
935 btrfs_mark_buffer_dirty(leaf);
936
937 ret = 0;
938out:
939 btrfs_free_path(path);
940 return ret;
941}
942
943static int btrfs_rm_dev_item(struct btrfs_root *root,
944 struct btrfs_device *device)
945{
946 int ret;
947 struct btrfs_path *path;
948 struct btrfs_key key;
949 struct btrfs_trans_handle *trans;
950
951 root = root->fs_info->chunk_root;
952
953 path = btrfs_alloc_path();
954 if (!path)
955 return -ENOMEM;
956
957 trans = btrfs_start_transaction(root, 1);
958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
959 key.type = BTRFS_DEV_ITEM_KEY;
960 key.offset = device->devid;
961 lock_chunks(root);
962
963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
964 if (ret < 0)
965 goto out;
966
967 if (ret > 0) {
968 ret = -ENOENT;
969 goto out;
970 }
971
972 ret = btrfs_del_item(trans, root, path);
973 if (ret)
974 goto out;
975out:
976 btrfs_free_path(path);
977 unlock_chunks(root);
978 btrfs_commit_transaction(trans, root);
979 return ret;
980}
981
982int btrfs_rm_device(struct btrfs_root *root, char *device_path)
983{
984 struct btrfs_device *device;
985 struct btrfs_device *next_device;
986 struct block_device *bdev;
987 struct buffer_head *bh = NULL;
988 struct btrfs_super_block *disk_super;
989 u64 all_avail;
990 u64 devid;
991 u64 num_devices;
992 u8 *dev_uuid;
993 int ret = 0;
994
995 mutex_lock(&uuid_mutex);
996 mutex_lock(&root->fs_info->volume_mutex);
997
998 all_avail = root->fs_info->avail_data_alloc_bits |
999 root->fs_info->avail_system_alloc_bits |
1000 root->fs_info->avail_metadata_alloc_bits;
1001
1002 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1003 root->fs_info->fs_devices->rw_devices <= 4) {
1004 printk(KERN_ERR "btrfs: unable to go below four devices "
1005 "on raid10\n");
1006 ret = -EINVAL;
1007 goto out;
1008 }
1009
1010 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1011 root->fs_info->fs_devices->rw_devices <= 2) {
1012 printk(KERN_ERR "btrfs: unable to go below two "
1013 "devices on raid1\n");
1014 ret = -EINVAL;
1015 goto out;
1016 }
1017
1018 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices;
1021 struct btrfs_device *tmp;
1022
1023 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp;
1029 break;
1030 }
1031 }
1032 bdev = NULL;
1033 bh = NULL;
1034 disk_super = NULL;
1035 if (!device) {
1036 printk(KERN_ERR "btrfs: no missing devices found to "
1037 "remove\n");
1038 goto out;
1039 }
1040 } else {
1041 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1042 root->fs_info->bdev_holder);
1043 if (IS_ERR(bdev)) {
1044 ret = PTR_ERR(bdev);
1045 goto out;
1046 }
1047
1048 set_blocksize(bdev, 4096);
1049 bh = btrfs_read_dev_super(bdev);
1050 if (!bh) {
1051 ret = -EIO;
1052 goto error_close;
1053 }
1054 disk_super = (struct btrfs_super_block *)bh->b_data;
1055 devid = le64_to_cpu(disk_super->dev_item.devid);
1056 dev_uuid = disk_super->dev_item.uuid;
1057 device = btrfs_find_device(root, devid, dev_uuid,
1058 disk_super->fsid);
1059 if (!device) {
1060 ret = -ENOENT;
1061 goto error_brelse;
1062 }
1063 }
1064
1065 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1066 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1067 "device\n");
1068 ret = -EINVAL;
1069 goto error_brelse;
1070 }
1071
1072 if (device->writeable) {
1073 list_del_init(&device->dev_alloc_list);
1074 root->fs_info->fs_devices->rw_devices--;
1075 }
1076
1077 ret = btrfs_shrink_device(device, 0);
1078 if (ret)
1079 goto error_brelse;
1080
1081 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1082 if (ret)
1083 goto error_brelse;
1084
1085 device->in_fs_metadata = 0;
1086 list_del_init(&device->dev_list);
1087 device->fs_devices->num_devices--;
1088
1089 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1090 struct btrfs_device, dev_list);
1091 if (device->bdev == root->fs_info->sb->s_bdev)
1092 root->fs_info->sb->s_bdev = next_device->bdev;
1093 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1094 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1095
1096 if (device->bdev) {
1097 close_bdev_exclusive(device->bdev, device->mode);
1098 device->bdev = NULL;
1099 device->fs_devices->open_devices--;
1100 }
1101
1102 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1103 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1104
1105 if (device->fs_devices->open_devices == 0) {
1106 struct btrfs_fs_devices *fs_devices;
1107 fs_devices = root->fs_info->fs_devices;
1108 while (fs_devices) {
1109 if (fs_devices->seed == device->fs_devices)
1110 break;
1111 fs_devices = fs_devices->seed;
1112 }
1113 fs_devices->seed = device->fs_devices->seed;
1114 device->fs_devices->seed = NULL;
1115 __btrfs_close_devices(device->fs_devices);
1116 free_fs_devices(device->fs_devices);
1117 }
1118
1119 /*
1120 * at this point, the device is zero sized. We want to
1121 * remove it from the devices list and zero out the old super
1122 */
1123 if (device->writeable) {
1124 /* make sure this device isn't detected as part of
1125 * the FS anymore
1126 */
1127 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1128 set_buffer_dirty(bh);
1129 sync_dirty_buffer(bh);
1130 }
1131
1132 kfree(device->name);
1133 kfree(device);
1134 ret = 0;
1135
1136error_brelse:
1137 brelse(bh);
1138error_close:
1139 if (bdev)
1140 close_bdev_exclusive(bdev, FMODE_READ);
1141out:
1142 mutex_unlock(&root->fs_info->volume_mutex);
1143 mutex_unlock(&uuid_mutex);
1144 return ret;
1145}
1146
1147/*
1148 * does all the dirty work required for changing file system's UUID.
1149 */
1150static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root)
1152{
1153 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1154 struct btrfs_fs_devices *old_devices;
1155 struct btrfs_fs_devices *seed_devices;
1156 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1157 struct btrfs_device *device;
1158 u64 super_flags;
1159
1160 BUG_ON(!mutex_is_locked(&uuid_mutex));
1161 if (!fs_devices->seeding)
1162 return -EINVAL;
1163
1164 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1165 if (!seed_devices)
1166 return -ENOMEM;
1167
1168 old_devices = clone_fs_devices(fs_devices);
1169 if (IS_ERR(old_devices)) {
1170 kfree(seed_devices);
1171 return PTR_ERR(old_devices);
1172 }
1173
1174 list_add(&old_devices->list, &fs_uuids);
1175
1176 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1177 seed_devices->opened = 1;
1178 INIT_LIST_HEAD(&seed_devices->devices);
1179 INIT_LIST_HEAD(&seed_devices->alloc_list);
1180 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1181 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1182 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1183 device->fs_devices = seed_devices;
1184 }
1185
1186 fs_devices->seeding = 0;
1187 fs_devices->num_devices = 0;
1188 fs_devices->open_devices = 0;
1189 fs_devices->seed = seed_devices;
1190
1191 generate_random_uuid(fs_devices->fsid);
1192 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1193 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 super_flags = btrfs_super_flags(disk_super) &
1195 ~BTRFS_SUPER_FLAG_SEEDING;
1196 btrfs_set_super_flags(disk_super, super_flags);
1197
1198 return 0;
1199}
1200
1201/*
1202 * strore the expected generation for seed devices in device items.
1203 */
1204static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1205 struct btrfs_root *root)
1206{
1207 struct btrfs_path *path;
1208 struct extent_buffer *leaf;
1209 struct btrfs_dev_item *dev_item;
1210 struct btrfs_device *device;
1211 struct btrfs_key key;
1212 u8 fs_uuid[BTRFS_UUID_SIZE];
1213 u8 dev_uuid[BTRFS_UUID_SIZE];
1214 u64 devid;
1215 int ret;
1216
1217 path = btrfs_alloc_path();
1218 if (!path)
1219 return -ENOMEM;
1220
1221 root = root->fs_info->chunk_root;
1222 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1223 key.offset = 0;
1224 key.type = BTRFS_DEV_ITEM_KEY;
1225
1226 while (1) {
1227 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1228 if (ret < 0)
1229 goto error;
1230
1231 leaf = path->nodes[0];
1232next_slot:
1233 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1234 ret = btrfs_next_leaf(root, path);
1235 if (ret > 0)
1236 break;
1237 if (ret < 0)
1238 goto error;
1239 leaf = path->nodes[0];
1240 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1241 btrfs_release_path(root, path);
1242 continue;
1243 }
1244
1245 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1246 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1247 key.type != BTRFS_DEV_ITEM_KEY)
1248 break;
1249
1250 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1251 struct btrfs_dev_item);
1252 devid = btrfs_device_id(leaf, dev_item);
1253 read_extent_buffer(leaf, dev_uuid,
1254 (unsigned long)btrfs_device_uuid(dev_item),
1255 BTRFS_UUID_SIZE);
1256 read_extent_buffer(leaf, fs_uuid,
1257 (unsigned long)btrfs_device_fsid(dev_item),
1258 BTRFS_UUID_SIZE);
1259 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1260 BUG_ON(!device);
1261
1262 if (device->fs_devices->seeding) {
1263 btrfs_set_device_generation(leaf, dev_item,
1264 device->generation);
1265 btrfs_mark_buffer_dirty(leaf);
1266 }
1267
1268 path->slots[0]++;
1269 goto next_slot;
1270 }
1271 ret = 0;
1272error:
1273 btrfs_free_path(path);
1274 return ret;
1275}
1276
1277int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1278{
1279 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device;
1281 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes;
1286 int seeding_dev = 0;
1287 int ret = 0;
1288
1289 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1290 return -EINVAL;
1291
1292 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1293 if (!bdev)
1294 return -EIO;
1295
1296 if (root->fs_info->fs_devices->seeding) {
1297 seeding_dev = 1;
1298 down_write(&sb->s_umount);
1299 mutex_lock(&uuid_mutex);
1300 }
1301
1302 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1303 mutex_lock(&root->fs_info->volume_mutex);
1304
1305 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) {
1309 ret = -EEXIST;
1310 goto error;
1311 }
1312 }
1313
1314 device = kzalloc(sizeof(*device), GFP_NOFS);
1315 if (!device) {
1316 /* we can safely leave the fs_devices entry around */
1317 ret = -ENOMEM;
1318 goto error;
1319 }
1320
1321 device->name = kstrdup(device_path, GFP_NOFS);
1322 if (!device->name) {
1323 kfree(device);
1324 ret = -ENOMEM;
1325 goto error;
1326 }
1327
1328 ret = find_next_devid(root, &device->devid);
1329 if (ret) {
1330 kfree(device);
1331 goto error;
1332 }
1333
1334 trans = btrfs_start_transaction(root, 1);
1335 lock_chunks(root);
1336
1337 device->barriers = 1;
1338 device->writeable = 1;
1339 device->work.func = pending_bios_fn;
1340 generate_random_uuid(device->uuid);
1341 spin_lock_init(&device->io_lock);
1342 device->generation = trans->transid;
1343 device->io_width = root->sectorsize;
1344 device->io_align = root->sectorsize;
1345 device->sector_size = root->sectorsize;
1346 device->total_bytes = i_size_read(bdev->bd_inode);
1347 device->dev_root = root->fs_info->dev_root;
1348 device->bdev = bdev;
1349 device->in_fs_metadata = 1;
1350 device->mode = 0;
1351 set_blocksize(device->bdev, 4096);
1352
1353 if (seeding_dev) {
1354 sb->s_flags &= ~MS_RDONLY;
1355 ret = btrfs_prepare_sprout(trans, root);
1356 BUG_ON(ret);
1357 }
1358
1359 device->fs_devices = root->fs_info->fs_devices;
1360 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1361 list_add(&device->dev_alloc_list,
1362 &root->fs_info->fs_devices->alloc_list);
1363 root->fs_info->fs_devices->num_devices++;
1364 root->fs_info->fs_devices->open_devices++;
1365 root->fs_info->fs_devices->rw_devices++;
1366 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1367
1368 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1369 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1370 total_bytes + device->total_bytes);
1371
1372 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1373 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1374 total_bytes + 1);
1375
1376 if (seeding_dev) {
1377 ret = init_first_rw_device(trans, root, device);
1378 BUG_ON(ret);
1379 ret = btrfs_finish_sprout(trans, root);
1380 BUG_ON(ret);
1381 } else {
1382 ret = btrfs_add_device(trans, root, device);
1383 }
1384
1385 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root);
1387
1388 if (seeding_dev) {
1389 mutex_unlock(&uuid_mutex);
1390 up_write(&sb->s_umount);
1391
1392 ret = btrfs_relocate_sys_chunks(root);
1393 BUG_ON(ret);
1394 }
1395out:
1396 mutex_unlock(&root->fs_info->volume_mutex);
1397 return ret;
1398error:
1399 close_bdev_exclusive(bdev, 0);
1400 if (seeding_dev) {
1401 mutex_unlock(&uuid_mutex);
1402 up_write(&sb->s_umount);
1403 }
1404 goto out;
1405}
1406
1407static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1408 struct btrfs_device *device)
1409{
1410 int ret;
1411 struct btrfs_path *path;
1412 struct btrfs_root *root;
1413 struct btrfs_dev_item *dev_item;
1414 struct extent_buffer *leaf;
1415 struct btrfs_key key;
1416
1417 root = device->dev_root->fs_info->chunk_root;
1418
1419 path = btrfs_alloc_path();
1420 if (!path)
1421 return -ENOMEM;
1422
1423 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1424 key.type = BTRFS_DEV_ITEM_KEY;
1425 key.offset = device->devid;
1426
1427 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1428 if (ret < 0)
1429 goto out;
1430
1431 if (ret > 0) {
1432 ret = -ENOENT;
1433 goto out;
1434 }
1435
1436 leaf = path->nodes[0];
1437 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1438
1439 btrfs_set_device_id(leaf, dev_item, device->devid);
1440 btrfs_set_device_type(leaf, dev_item, device->type);
1441 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1442 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1443 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1444 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1445 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1446 btrfs_mark_buffer_dirty(leaf);
1447
1448out:
1449 btrfs_free_path(path);
1450 return ret;
1451}
1452
1453static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1454 struct btrfs_device *device, u64 new_size)
1455{
1456 struct btrfs_super_block *super_copy =
1457 &device->dev_root->fs_info->super_copy;
1458 u64 old_total = btrfs_super_total_bytes(super_copy);
1459 u64 diff = new_size - device->total_bytes;
1460
1461 if (!device->writeable)
1462 return -EACCES;
1463 if (new_size <= device->total_bytes)
1464 return -EINVAL;
1465
1466 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1467 device->fs_devices->total_rw_bytes += diff;
1468
1469 device->total_bytes = new_size;
1470 return btrfs_update_device(trans, device);
1471}
1472
1473int btrfs_grow_device(struct btrfs_trans_handle *trans,
1474 struct btrfs_device *device, u64 new_size)
1475{
1476 int ret;
1477 lock_chunks(device->dev_root);
1478 ret = __btrfs_grow_device(trans, device, new_size);
1479 unlock_chunks(device->dev_root);
1480 return ret;
1481}
1482
1483static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1484 struct btrfs_root *root,
1485 u64 chunk_tree, u64 chunk_objectid,
1486 u64 chunk_offset)
1487{
1488 int ret;
1489 struct btrfs_path *path;
1490 struct btrfs_key key;
1491
1492 root = root->fs_info->chunk_root;
1493 path = btrfs_alloc_path();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = chunk_objectid;
1498 key.offset = chunk_offset;
1499 key.type = BTRFS_CHUNK_ITEM_KEY;
1500
1501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1502 BUG_ON(ret);
1503
1504 ret = btrfs_del_item(trans, root, path);
1505 BUG_ON(ret);
1506
1507 btrfs_free_path(path);
1508 return 0;
1509}
1510
1511static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1512 chunk_offset)
1513{
1514 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1515 struct btrfs_disk_key *disk_key;
1516 struct btrfs_chunk *chunk;
1517 u8 *ptr;
1518 int ret = 0;
1519 u32 num_stripes;
1520 u32 array_size;
1521 u32 len = 0;
1522 u32 cur;
1523 struct btrfs_key key;
1524
1525 array_size = btrfs_super_sys_array_size(super_copy);
1526
1527 ptr = super_copy->sys_chunk_array;
1528 cur = 0;
1529
1530 while (cur < array_size) {
1531 disk_key = (struct btrfs_disk_key *)ptr;
1532 btrfs_disk_key_to_cpu(&key, disk_key);
1533
1534 len = sizeof(*disk_key);
1535
1536 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1537 chunk = (struct btrfs_chunk *)(ptr + len);
1538 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1539 len += btrfs_chunk_item_size(num_stripes);
1540 } else {
1541 ret = -EIO;
1542 break;
1543 }
1544 if (key.objectid == chunk_objectid &&
1545 key.offset == chunk_offset) {
1546 memmove(ptr, ptr + len, array_size - (cur + len));
1547 array_size -= len;
1548 btrfs_set_super_sys_array_size(super_copy, array_size);
1549 } else {
1550 ptr += len;
1551 cur += len;
1552 }
1553 }
1554 return ret;
1555}
1556
1557static int btrfs_relocate_chunk(struct btrfs_root *root,
1558 u64 chunk_tree, u64 chunk_objectid,
1559 u64 chunk_offset)
1560{
1561 struct extent_map_tree *em_tree;
1562 struct btrfs_root *extent_root;
1563 struct btrfs_trans_handle *trans;
1564 struct extent_map *em;
1565 struct map_lookup *map;
1566 int ret;
1567 int i;
1568
1569 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1570 (unsigned long long)chunk_offset);
1571 root = root->fs_info->chunk_root;
1572 extent_root = root->fs_info->extent_root;
1573 em_tree = &root->fs_info->mapping_tree.map_tree;
1574
1575 /* step one, relocate all the extents inside this chunk */
1576 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1577 BUG_ON(ret);
1578
1579 trans = btrfs_start_transaction(root, 1);
1580 BUG_ON(!trans);
1581
1582 lock_chunks(root);
1583
1584 /*
1585 * step two, delete the device extents and the
1586 * chunk tree entries
1587 */
1588 spin_lock(&em_tree->lock);
1589 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1590 spin_unlock(&em_tree->lock);
1591
1592 BUG_ON(em->start > chunk_offset ||
1593 em->start + em->len < chunk_offset);
1594 map = (struct map_lookup *)em->bdev;
1595
1596 for (i = 0; i < map->num_stripes; i++) {
1597 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1598 map->stripes[i].physical);
1599 BUG_ON(ret);
1600
1601 if (map->stripes[i].dev) {
1602 ret = btrfs_update_device(trans, map->stripes[i].dev);
1603 BUG_ON(ret);
1604 }
1605 }
1606 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1607 chunk_offset);
1608
1609 BUG_ON(ret);
1610
1611 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1612 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1613 BUG_ON(ret);
1614 }
1615
1616 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1617 BUG_ON(ret);
1618
1619 spin_lock(&em_tree->lock);
1620 remove_extent_mapping(em_tree, em);
1621 spin_unlock(&em_tree->lock);
1622
1623 kfree(map);
1624 em->bdev = NULL;
1625
1626 /* once for the tree */
1627 free_extent_map(em);
1628 /* once for us */
1629 free_extent_map(em);
1630
1631 unlock_chunks(root);
1632 btrfs_end_transaction(trans, root);
1633 return 0;
1634}
1635
1636static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1637{
1638 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1639 struct btrfs_path *path;
1640 struct extent_buffer *leaf;
1641 struct btrfs_chunk *chunk;
1642 struct btrfs_key key;
1643 struct btrfs_key found_key;
1644 u64 chunk_tree = chunk_root->root_key.objectid;
1645 u64 chunk_type;
1646 int ret;
1647
1648 path = btrfs_alloc_path();
1649 if (!path)
1650 return -ENOMEM;
1651
1652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1653 key.offset = (u64)-1;
1654 key.type = BTRFS_CHUNK_ITEM_KEY;
1655
1656 while (1) {
1657 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1658 if (ret < 0)
1659 goto error;
1660 BUG_ON(ret == 0);
1661
1662 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1663 key.type);
1664 if (ret < 0)
1665 goto error;
1666 if (ret > 0)
1667 break;
1668
1669 leaf = path->nodes[0];
1670 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671
1672 chunk = btrfs_item_ptr(leaf, path->slots[0],
1673 struct btrfs_chunk);
1674 chunk_type = btrfs_chunk_type(leaf, chunk);
1675 btrfs_release_path(chunk_root, path);
1676
1677 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1678 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1679 found_key.objectid,
1680 found_key.offset);
1681 BUG_ON(ret);
1682 }
1683
1684 if (found_key.offset == 0)
1685 break;
1686 key.offset = found_key.offset - 1;
1687 }
1688 ret = 0;
1689error:
1690 btrfs_free_path(path);
1691 return ret;
1692}
1693
1694static u64 div_factor(u64 num, int factor)
1695{
1696 if (factor == 10)
1697 return num;
1698 num *= factor;
1699 do_div(num, 10);
1700 return num;
1701}
1702
1703int btrfs_balance(struct btrfs_root *dev_root)
1704{
1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device;
1709 u64 old_size;
1710 u64 size_to_free;
1711 struct btrfs_path *path;
1712 struct btrfs_key key;
1713 struct btrfs_chunk *chunk;
1714 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1715 struct btrfs_trans_handle *trans;
1716 struct btrfs_key found_key;
1717
1718 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1719 return -EROFS;
1720
1721 mutex_lock(&dev_root->fs_info->volume_mutex);
1722 dev_root = dev_root->fs_info->dev_root;
1723
1724 /* step one make some room on all the devices */
1725 list_for_each(cur, devices) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1730 if (!device->writeable ||
1731 device->total_bytes - device->bytes_used > size_to_free)
1732 continue;
1733
1734 ret = btrfs_shrink_device(device, old_size - size_to_free);
1735 BUG_ON(ret);
1736
1737 trans = btrfs_start_transaction(dev_root, 1);
1738 BUG_ON(!trans);
1739
1740 ret = btrfs_grow_device(trans, device, old_size);
1741 BUG_ON(ret);
1742
1743 btrfs_end_transaction(trans, dev_root);
1744 }
1745
1746 /* step two, relocate all the chunks */
1747 path = btrfs_alloc_path();
1748 BUG_ON(!path);
1749
1750 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1751 key.offset = (u64)-1;
1752 key.type = BTRFS_CHUNK_ITEM_KEY;
1753
1754 while (1) {
1755 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1756 if (ret < 0)
1757 goto error;
1758
1759 /*
1760 * this shouldn't happen, it means the last relocate
1761 * failed
1762 */
1763 if (ret == 0)
1764 break;
1765
1766 ret = btrfs_previous_item(chunk_root, path, 0,
1767 BTRFS_CHUNK_ITEM_KEY);
1768 if (ret)
1769 break;
1770
1771 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1772 path->slots[0]);
1773 if (found_key.objectid != key.objectid)
1774 break;
1775
1776 chunk = btrfs_item_ptr(path->nodes[0],
1777 path->slots[0],
1778 struct btrfs_chunk);
1779 key.offset = found_key.offset;
1780 /* chunk zero is special */
1781 if (key.offset == 0)
1782 break;
1783
1784 btrfs_release_path(chunk_root, path);
1785 ret = btrfs_relocate_chunk(chunk_root,
1786 chunk_root->root_key.objectid,
1787 found_key.objectid,
1788 found_key.offset);
1789 BUG_ON(ret);
1790 }
1791 ret = 0;
1792error:
1793 btrfs_free_path(path);
1794 mutex_unlock(&dev_root->fs_info->volume_mutex);
1795 return ret;
1796}
1797
1798/*
1799 * shrinking a device means finding all of the device extents past
1800 * the new size, and then following the back refs to the chunks.
1801 * The chunk relocation code actually frees the device extent
1802 */
1803int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1804{
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_root *root = device->dev_root;
1807 struct btrfs_dev_extent *dev_extent = NULL;
1808 struct btrfs_path *path;
1809 u64 length;
1810 u64 chunk_tree;
1811 u64 chunk_objectid;
1812 u64 chunk_offset;
1813 int ret;
1814 int slot;
1815 struct extent_buffer *l;
1816 struct btrfs_key key;
1817 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1818 u64 old_total = btrfs_super_total_bytes(super_copy);
1819 u64 diff = device->total_bytes - new_size;
1820
1821 if (new_size >= device->total_bytes)
1822 return -EINVAL;
1823
1824 path = btrfs_alloc_path();
1825 if (!path)
1826 return -ENOMEM;
1827
1828 trans = btrfs_start_transaction(root, 1);
1829 if (!trans) {
1830 ret = -ENOMEM;
1831 goto done;
1832 }
1833
1834 path->reada = 2;
1835
1836 lock_chunks(root);
1837
1838 device->total_bytes = new_size;
1839 if (device->writeable)
1840 device->fs_devices->total_rw_bytes -= diff;
1841 ret = btrfs_update_device(trans, device);
1842 if (ret) {
1843 unlock_chunks(root);
1844 btrfs_end_transaction(trans, root);
1845 goto done;
1846 }
1847 WARN_ON(diff > old_total);
1848 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1849 unlock_chunks(root);
1850 btrfs_end_transaction(trans, root);
1851
1852 key.objectid = device->devid;
1853 key.offset = (u64)-1;
1854 key.type = BTRFS_DEV_EXTENT_KEY;
1855
1856 while (1) {
1857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1858 if (ret < 0)
1859 goto done;
1860
1861 ret = btrfs_previous_item(root, path, 0, key.type);
1862 if (ret < 0)
1863 goto done;
1864 if (ret) {
1865 ret = 0;
1866 goto done;
1867 }
1868
1869 l = path->nodes[0];
1870 slot = path->slots[0];
1871 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1872
1873 if (key.objectid != device->devid)
1874 goto done;
1875
1876 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1877 length = btrfs_dev_extent_length(l, dev_extent);
1878
1879 if (key.offset + length <= new_size)
1880 goto done;
1881
1882 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1883 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1884 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1885 btrfs_release_path(root, path);
1886
1887 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1888 chunk_offset);
1889 if (ret)
1890 goto done;
1891 }
1892
1893done:
1894 btrfs_free_path(path);
1895 return ret;
1896}
1897
1898static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1899 struct btrfs_root *root,
1900 struct btrfs_key *key,
1901 struct btrfs_chunk *chunk, int item_size)
1902{
1903 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1904 struct btrfs_disk_key disk_key;
1905 u32 array_size;
1906 u8 *ptr;
1907
1908 array_size = btrfs_super_sys_array_size(super_copy);
1909 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1910 return -EFBIG;
1911
1912 ptr = super_copy->sys_chunk_array + array_size;
1913 btrfs_cpu_key_to_disk(&disk_key, key);
1914 memcpy(ptr, &disk_key, sizeof(disk_key));
1915 ptr += sizeof(disk_key);
1916 memcpy(ptr, chunk, item_size);
1917 item_size += sizeof(disk_key);
1918 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1919 return 0;
1920}
1921
1922static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1923 int num_stripes, int sub_stripes)
1924{
1925 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1926 return calc_size;
1927 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1928 return calc_size * (num_stripes / sub_stripes);
1929 else
1930 return calc_size * num_stripes;
1931}
1932
1933static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *extent_root,
1935 struct map_lookup **map_ret,
1936 u64 *num_bytes, u64 *stripe_size,
1937 u64 start, u64 type)
1938{
1939 struct btrfs_fs_info *info = extent_root->fs_info;
1940 struct btrfs_device *device = NULL;
1941 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1942 struct list_head *cur;
1943 struct map_lookup *map = NULL;
1944 struct extent_map_tree *em_tree;
1945 struct extent_map *em;
1946 struct list_head private_devs;
1947 int min_stripe_size = 1 * 1024 * 1024;
1948 u64 calc_size = 1024 * 1024 * 1024;
1949 u64 max_chunk_size = calc_size;
1950 u64 min_free;
1951 u64 avail;
1952 u64 max_avail = 0;
1953 u64 dev_offset;
1954 int num_stripes = 1;
1955 int min_stripes = 1;
1956 int sub_stripes = 0;
1957 int looped = 0;
1958 int ret;
1959 int index;
1960 int stripe_len = 64 * 1024;
1961
1962 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1963 (type & BTRFS_BLOCK_GROUP_DUP)) {
1964 WARN_ON(1);
1965 type &= ~BTRFS_BLOCK_GROUP_DUP;
1966 }
1967 if (list_empty(&fs_devices->alloc_list))
1968 return -ENOSPC;
1969
1970 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1971 num_stripes = fs_devices->rw_devices;
1972 min_stripes = 2;
1973 }
1974 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1975 num_stripes = 2;
1976 min_stripes = 2;
1977 }
1978 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1979 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1980 if (num_stripes < 2)
1981 return -ENOSPC;
1982 min_stripes = 2;
1983 }
1984 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1985 num_stripes = fs_devices->rw_devices;
1986 if (num_stripes < 4)
1987 return -ENOSPC;
1988 num_stripes &= ~(u32)1;
1989 sub_stripes = 2;
1990 min_stripes = 4;
1991 }
1992
1993 if (type & BTRFS_BLOCK_GROUP_DATA) {
1994 max_chunk_size = 10 * calc_size;
1995 min_stripe_size = 64 * 1024 * 1024;
1996 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1997 max_chunk_size = 4 * calc_size;
1998 min_stripe_size = 32 * 1024 * 1024;
1999 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2000 calc_size = 8 * 1024 * 1024;
2001 max_chunk_size = calc_size * 2;
2002 min_stripe_size = 1 * 1024 * 1024;
2003 }
2004
2005 /* we don't want a chunk larger than 10% of writeable space */
2006 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2007 max_chunk_size);
2008
2009again:
2010 if (!map || map->num_stripes != num_stripes) {
2011 kfree(map);
2012 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2013 if (!map)
2014 return -ENOMEM;
2015 map->num_stripes = num_stripes;
2016 }
2017
2018 if (calc_size * num_stripes > max_chunk_size) {
2019 calc_size = max_chunk_size;
2020 do_div(calc_size, num_stripes);
2021 do_div(calc_size, stripe_len);
2022 calc_size *= stripe_len;
2023 }
2024 /* we don't want tiny stripes */
2025 calc_size = max_t(u64, min_stripe_size, calc_size);
2026
2027 do_div(calc_size, stripe_len);
2028 calc_size *= stripe_len;
2029
2030 cur = fs_devices->alloc_list.next;
2031 index = 0;
2032
2033 if (type & BTRFS_BLOCK_GROUP_DUP)
2034 min_free = calc_size * 2;
2035 else
2036 min_free = calc_size;
2037
2038 /*
2039 * we add 1MB because we never use the first 1MB of the device, unless
2040 * we've looped, then we are likely allocating the maximum amount of
2041 * space left already
2042 */
2043 if (!looped)
2044 min_free += 1024 * 1024;
2045
2046 INIT_LIST_HEAD(&private_devs);
2047 while (index < num_stripes) {
2048 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2049 BUG_ON(!device->writeable);
2050 if (device->total_bytes > device->bytes_used)
2051 avail = device->total_bytes - device->bytes_used;
2052 else
2053 avail = 0;
2054 cur = cur->next;
2055
2056 if (device->in_fs_metadata && avail >= min_free) {
2057 ret = find_free_dev_extent(trans, device,
2058 min_free, &dev_offset);
2059 if (ret == 0) {
2060 list_move_tail(&device->dev_alloc_list,
2061 &private_devs);
2062 map->stripes[index].dev = device;
2063 map->stripes[index].physical = dev_offset;
2064 index++;
2065 if (type & BTRFS_BLOCK_GROUP_DUP) {
2066 map->stripes[index].dev = device;
2067 map->stripes[index].physical =
2068 dev_offset + calc_size;
2069 index++;
2070 }
2071 }
2072 } else if (device->in_fs_metadata && avail > max_avail)
2073 max_avail = avail;
2074 if (cur == &fs_devices->alloc_list)
2075 break;
2076 }
2077 list_splice(&private_devs, &fs_devices->alloc_list);
2078 if (index < num_stripes) {
2079 if (index >= min_stripes) {
2080 num_stripes = index;
2081 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2082 num_stripes /= sub_stripes;
2083 num_stripes *= sub_stripes;
2084 }
2085 looped = 1;
2086 goto again;
2087 }
2088 if (!looped && max_avail > 0) {
2089 looped = 1;
2090 calc_size = max_avail;
2091 goto again;
2092 }
2093 kfree(map);
2094 return -ENOSPC;
2095 }
2096 map->sector_size = extent_root->sectorsize;
2097 map->stripe_len = stripe_len;
2098 map->io_align = stripe_len;
2099 map->io_width = stripe_len;
2100 map->type = type;
2101 map->num_stripes = num_stripes;
2102 map->sub_stripes = sub_stripes;
2103
2104 *map_ret = map;
2105 *stripe_size = calc_size;
2106 *num_bytes = chunk_bytes_by_type(type, calc_size,
2107 num_stripes, sub_stripes);
2108
2109 em = alloc_extent_map(GFP_NOFS);
2110 if (!em) {
2111 kfree(map);
2112 return -ENOMEM;
2113 }
2114 em->bdev = (struct block_device *)map;
2115 em->start = start;
2116 em->len = *num_bytes;
2117 em->block_start = 0;
2118 em->block_len = em->len;
2119
2120 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2121 spin_lock(&em_tree->lock);
2122 ret = add_extent_mapping(em_tree, em);
2123 spin_unlock(&em_tree->lock);
2124 BUG_ON(ret);
2125 free_extent_map(em);
2126
2127 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2128 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129 start, *num_bytes);
2130 BUG_ON(ret);
2131
2132 index = 0;
2133 while (index < map->num_stripes) {
2134 device = map->stripes[index].dev;
2135 dev_offset = map->stripes[index].physical;
2136
2137 ret = btrfs_alloc_dev_extent(trans, device,
2138 info->chunk_root->root_key.objectid,
2139 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2140 start, dev_offset, calc_size);
2141 BUG_ON(ret);
2142 index++;
2143 }
2144
2145 return 0;
2146}
2147
2148static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2149 struct btrfs_root *extent_root,
2150 struct map_lookup *map, u64 chunk_offset,
2151 u64 chunk_size, u64 stripe_size)
2152{
2153 u64 dev_offset;
2154 struct btrfs_key key;
2155 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2156 struct btrfs_device *device;
2157 struct btrfs_chunk *chunk;
2158 struct btrfs_stripe *stripe;
2159 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2160 int index = 0;
2161 int ret;
2162
2163 chunk = kzalloc(item_size, GFP_NOFS);
2164 if (!chunk)
2165 return -ENOMEM;
2166
2167 index = 0;
2168 while (index < map->num_stripes) {
2169 device = map->stripes[index].dev;
2170 device->bytes_used += stripe_size;
2171 ret = btrfs_update_device(trans, device);
2172 BUG_ON(ret);
2173 index++;
2174 }
2175
2176 index = 0;
2177 stripe = &chunk->stripe;
2178 while (index < map->num_stripes) {
2179 device = map->stripes[index].dev;
2180 dev_offset = map->stripes[index].physical;
2181
2182 btrfs_set_stack_stripe_devid(stripe, device->devid);
2183 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2184 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2185 stripe++;
2186 index++;
2187 }
2188
2189 btrfs_set_stack_chunk_length(chunk, chunk_size);
2190 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2191 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2192 btrfs_set_stack_chunk_type(chunk, map->type);
2193 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2194 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2195 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2197 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2198
2199 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2200 key.type = BTRFS_CHUNK_ITEM_KEY;
2201 key.offset = chunk_offset;
2202
2203 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2204 BUG_ON(ret);
2205
2206 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2207 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2208 item_size);
2209 BUG_ON(ret);
2210 }
2211 kfree(chunk);
2212 return 0;
2213}
2214
2215/*
2216 * Chunk allocation falls into two parts. The first part does works
2217 * that make the new allocated chunk useable, but not do any operation
2218 * that modifies the chunk tree. The second part does the works that
2219 * require modifying the chunk tree. This division is important for the
2220 * bootstrap process of adding storage to a seed btrfs.
2221 */
2222int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2223 struct btrfs_root *extent_root, u64 type)
2224{
2225 u64 chunk_offset;
2226 u64 chunk_size;
2227 u64 stripe_size;
2228 struct map_lookup *map;
2229 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2230 int ret;
2231
2232 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2233 &chunk_offset);
2234 if (ret)
2235 return ret;
2236
2237 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2238 &stripe_size, chunk_offset, type);
2239 if (ret)
2240 return ret;
2241
2242 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2243 chunk_size, stripe_size);
2244 BUG_ON(ret);
2245 return 0;
2246}
2247
2248static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root,
2250 struct btrfs_device *device)
2251{
2252 u64 chunk_offset;
2253 u64 sys_chunk_offset;
2254 u64 chunk_size;
2255 u64 sys_chunk_size;
2256 u64 stripe_size;
2257 u64 sys_stripe_size;
2258 u64 alloc_profile;
2259 struct map_lookup *map;
2260 struct map_lookup *sys_map;
2261 struct btrfs_fs_info *fs_info = root->fs_info;
2262 struct btrfs_root *extent_root = fs_info->extent_root;
2263 int ret;
2264
2265 ret = find_next_chunk(fs_info->chunk_root,
2266 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2267 BUG_ON(ret);
2268
2269 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2270 (fs_info->metadata_alloc_profile &
2271 fs_info->avail_metadata_alloc_bits);
2272 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2275 &stripe_size, chunk_offset, alloc_profile);
2276 BUG_ON(ret);
2277
2278 sys_chunk_offset = chunk_offset + chunk_size;
2279
2280 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2281 (fs_info->system_alloc_profile &
2282 fs_info->avail_system_alloc_bits);
2283 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2284
2285 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2286 &sys_chunk_size, &sys_stripe_size,
2287 sys_chunk_offset, alloc_profile);
2288 BUG_ON(ret);
2289
2290 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2291 BUG_ON(ret);
2292
2293 /*
2294 * Modifying chunk tree needs allocating new blocks from both
2295 * system block group and metadata block group. So we only can
2296 * do operations require modifying the chunk tree after both
2297 * block groups were created.
2298 */
2299 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2300 chunk_size, stripe_size);
2301 BUG_ON(ret);
2302
2303 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2304 sys_chunk_offset, sys_chunk_size,
2305 sys_stripe_size);
2306 BUG_ON(ret);
2307 return 0;
2308}
2309
2310int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2311{
2312 struct extent_map *em;
2313 struct map_lookup *map;
2314 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2315 int readonly = 0;
2316 int i;
2317
2318 spin_lock(&map_tree->map_tree.lock);
2319 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2320 spin_unlock(&map_tree->map_tree.lock);
2321 if (!em)
2322 return 1;
2323
2324 map = (struct map_lookup *)em->bdev;
2325 for (i = 0; i < map->num_stripes; i++) {
2326 if (!map->stripes[i].dev->writeable) {
2327 readonly = 1;
2328 break;
2329 }
2330 }
2331 free_extent_map(em);
2332 return readonly;
2333}
2334
2335void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2336{
2337 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2338}
2339
2340void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2341{
2342 struct extent_map *em;
2343
2344 while (1) {
2345 spin_lock(&tree->map_tree.lock);
2346 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2347 if (em)
2348 remove_extent_mapping(&tree->map_tree, em);
2349 spin_unlock(&tree->map_tree.lock);
2350 if (!em)
2351 break;
2352 kfree(em->bdev);
2353 /* once for us */
2354 free_extent_map(em);
2355 /* once for the tree */
2356 free_extent_map(em);
2357 }
2358}
2359
2360int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2361{
2362 struct extent_map *em;
2363 struct map_lookup *map;
2364 struct extent_map_tree *em_tree = &map_tree->map_tree;
2365 int ret;
2366
2367 spin_lock(&em_tree->lock);
2368 em = lookup_extent_mapping(em_tree, logical, len);
2369 spin_unlock(&em_tree->lock);
2370 BUG_ON(!em);
2371
2372 BUG_ON(em->start > logical || em->start + em->len < logical);
2373 map = (struct map_lookup *)em->bdev;
2374 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2375 ret = map->num_stripes;
2376 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2377 ret = map->sub_stripes;
2378 else
2379 ret = 1;
2380 free_extent_map(em);
2381 return ret;
2382}
2383
2384static int find_live_mirror(struct map_lookup *map, int first, int num,
2385 int optimal)
2386{
2387 int i;
2388 if (map->stripes[optimal].dev->bdev)
2389 return optimal;
2390 for (i = first; i < first + num; i++) {
2391 if (map->stripes[i].dev->bdev)
2392 return i;
2393 }
2394 /* we couldn't find one that doesn't fail. Just return something
2395 * and the io error handling code will clean up eventually
2396 */
2397 return optimal;
2398}
2399
2400static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2401 u64 logical, u64 *length,
2402 struct btrfs_multi_bio **multi_ret,
2403 int mirror_num, struct page *unplug_page)
2404{
2405 struct extent_map *em;
2406 struct map_lookup *map;
2407 struct extent_map_tree *em_tree = &map_tree->map_tree;
2408 u64 offset;
2409 u64 stripe_offset;
2410 u64 stripe_nr;
2411 int stripes_allocated = 8;
2412 int stripes_required = 1;
2413 int stripe_index;
2414 int i;
2415 int num_stripes;
2416 int max_errors = 0;
2417 struct btrfs_multi_bio *multi = NULL;
2418
2419 if (multi_ret && !(rw & (1 << BIO_RW)))
2420 stripes_allocated = 1;
2421again:
2422 if (multi_ret) {
2423 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2424 GFP_NOFS);
2425 if (!multi)
2426 return -ENOMEM;
2427
2428 atomic_set(&multi->error, 0);
2429 }
2430
2431 spin_lock(&em_tree->lock);
2432 em = lookup_extent_mapping(em_tree, logical, *length);
2433 spin_unlock(&em_tree->lock);
2434
2435 if (!em && unplug_page)
2436 return 0;
2437
2438 if (!em) {
2439 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2440 (unsigned long long)logical,
2441 (unsigned long long)*length);
2442 BUG();
2443 }
2444
2445 BUG_ON(em->start > logical || em->start + em->len < logical);
2446 map = (struct map_lookup *)em->bdev;
2447 offset = logical - em->start;
2448
2449 if (mirror_num > map->num_stripes)
2450 mirror_num = 0;
2451
2452 /* if our multi bio struct is too small, back off and try again */
2453 if (rw & (1 << BIO_RW)) {
2454 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2455 BTRFS_BLOCK_GROUP_DUP)) {
2456 stripes_required = map->num_stripes;
2457 max_errors = 1;
2458 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2459 stripes_required = map->sub_stripes;
2460 max_errors = 1;
2461 }
2462 }
2463 if (multi_ret && rw == WRITE &&
2464 stripes_allocated < stripes_required) {
2465 stripes_allocated = map->num_stripes;
2466 free_extent_map(em);
2467 kfree(multi);
2468 goto again;
2469 }
2470 stripe_nr = offset;
2471 /*
2472 * stripe_nr counts the total number of stripes we have to stride
2473 * to get to this block
2474 */
2475 do_div(stripe_nr, map->stripe_len);
2476
2477 stripe_offset = stripe_nr * map->stripe_len;
2478 BUG_ON(offset < stripe_offset);
2479
2480 /* stripe_offset is the offset of this block in its stripe*/
2481 stripe_offset = offset - stripe_offset;
2482
2483 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2484 BTRFS_BLOCK_GROUP_RAID10 |
2485 BTRFS_BLOCK_GROUP_DUP)) {
2486 /* we limit the length of each bio to what fits in a stripe */
2487 *length = min_t(u64, em->len - offset,
2488 map->stripe_len - stripe_offset);
2489 } else {
2490 *length = em->len - offset;
2491 }
2492
2493 if (!multi_ret && !unplug_page)
2494 goto out;
2495
2496 num_stripes = 1;
2497 stripe_index = 0;
2498 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2499 if (unplug_page || (rw & (1 << BIO_RW)))
2500 num_stripes = map->num_stripes;
2501 else if (mirror_num)
2502 stripe_index = mirror_num - 1;
2503 else {
2504 stripe_index = find_live_mirror(map, 0,
2505 map->num_stripes,
2506 current->pid % map->num_stripes);
2507 }
2508
2509 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2510 if (rw & (1 << BIO_RW))
2511 num_stripes = map->num_stripes;
2512 else if (mirror_num)
2513 stripe_index = mirror_num - 1;
2514
2515 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2516 int factor = map->num_stripes / map->sub_stripes;
2517
2518 stripe_index = do_div(stripe_nr, factor);
2519 stripe_index *= map->sub_stripes;
2520
2521 if (unplug_page || (rw & (1 << BIO_RW)))
2522 num_stripes = map->sub_stripes;
2523 else if (mirror_num)
2524 stripe_index += mirror_num - 1;
2525 else {
2526 stripe_index = find_live_mirror(map, stripe_index,
2527 map->sub_stripes, stripe_index +
2528 current->pid % map->sub_stripes);
2529 }
2530 } else {
2531 /*
2532 * after this do_div call, stripe_nr is the number of stripes
2533 * on this device we have to walk to find the data, and
2534 * stripe_index is the number of our device in the stripe array
2535 */
2536 stripe_index = do_div(stripe_nr, map->num_stripes);
2537 }
2538 BUG_ON(stripe_index >= map->num_stripes);
2539
2540 for (i = 0; i < num_stripes; i++) {
2541 if (unplug_page) {
2542 struct btrfs_device *device;
2543 struct backing_dev_info *bdi;
2544
2545 device = map->stripes[stripe_index].dev;
2546 if (device->bdev) {
2547 bdi = blk_get_backing_dev_info(device->bdev);
2548 if (bdi->unplug_io_fn)
2549 bdi->unplug_io_fn(bdi, unplug_page);
2550 }
2551 } else {
2552 multi->stripes[i].physical =
2553 map->stripes[stripe_index].physical +
2554 stripe_offset + stripe_nr * map->stripe_len;
2555 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2556 }
2557 stripe_index++;
2558 }
2559 if (multi_ret) {
2560 *multi_ret = multi;
2561 multi->num_stripes = num_stripes;
2562 multi->max_errors = max_errors;
2563 }
2564out:
2565 free_extent_map(em);
2566 return 0;
2567}
2568
2569int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2570 u64 logical, u64 *length,
2571 struct btrfs_multi_bio **multi_ret, int mirror_num)
2572{
2573 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2574 mirror_num, NULL);
2575}
2576
2577int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2578 u64 chunk_start, u64 physical, u64 devid,
2579 u64 **logical, int *naddrs, int *stripe_len)
2580{
2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2582 struct extent_map *em;
2583 struct map_lookup *map;
2584 u64 *buf;
2585 u64 bytenr;
2586 u64 length;
2587 u64 stripe_nr;
2588 int i, j, nr = 0;
2589
2590 spin_lock(&em_tree->lock);
2591 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2592 spin_unlock(&em_tree->lock);
2593
2594 BUG_ON(!em || em->start != chunk_start);
2595 map = (struct map_lookup *)em->bdev;
2596
2597 length = em->len;
2598 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2599 do_div(length, map->num_stripes / map->sub_stripes);
2600 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2601 do_div(length, map->num_stripes);
2602
2603 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2604 BUG_ON(!buf);
2605
2606 for (i = 0; i < map->num_stripes; i++) {
2607 if (devid && map->stripes[i].dev->devid != devid)
2608 continue;
2609 if (map->stripes[i].physical > physical ||
2610 map->stripes[i].physical + length <= physical)
2611 continue;
2612
2613 stripe_nr = physical - map->stripes[i].physical;
2614 do_div(stripe_nr, map->stripe_len);
2615
2616 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2617 stripe_nr = stripe_nr * map->num_stripes + i;
2618 do_div(stripe_nr, map->sub_stripes);
2619 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2620 stripe_nr = stripe_nr * map->num_stripes + i;
2621 }
2622 bytenr = chunk_start + stripe_nr * map->stripe_len;
2623 WARN_ON(nr >= map->num_stripes);
2624 for (j = 0; j < nr; j++) {
2625 if (buf[j] == bytenr)
2626 break;
2627 }
2628 if (j == nr) {
2629 WARN_ON(nr >= map->num_stripes);
2630 buf[nr++] = bytenr;
2631 }
2632 }
2633
2634 for (i = 0; i > nr; i++) {
2635 struct btrfs_multi_bio *multi;
2636 struct btrfs_bio_stripe *stripe;
2637 int ret;
2638
2639 length = 1;
2640 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2641 &length, &multi, 0);
2642 BUG_ON(ret);
2643
2644 stripe = multi->stripes;
2645 for (j = 0; j < multi->num_stripes; j++) {
2646 if (stripe->physical >= physical &&
2647 physical < stripe->physical + length)
2648 break;
2649 }
2650 BUG_ON(j >= multi->num_stripes);
2651 kfree(multi);
2652 }
2653
2654 *logical = buf;
2655 *naddrs = nr;
2656 *stripe_len = map->stripe_len;
2657
2658 free_extent_map(em);
2659 return 0;
2660}
2661
2662int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2663 u64 logical, struct page *page)
2664{
2665 u64 length = PAGE_CACHE_SIZE;
2666 return __btrfs_map_block(map_tree, READ, logical, &length,
2667 NULL, 0, page);
2668}
2669
2670static void end_bio_multi_stripe(struct bio *bio, int err)
2671{
2672 struct btrfs_multi_bio *multi = bio->bi_private;
2673 int is_orig_bio = 0;
2674
2675 if (err)
2676 atomic_inc(&multi->error);
2677
2678 if (bio == multi->orig_bio)
2679 is_orig_bio = 1;
2680
2681 if (atomic_dec_and_test(&multi->stripes_pending)) {
2682 if (!is_orig_bio) {
2683 bio_put(bio);
2684 bio = multi->orig_bio;
2685 }
2686 bio->bi_private = multi->private;
2687 bio->bi_end_io = multi->end_io;
2688 /* only send an error to the higher layers if it is
2689 * beyond the tolerance of the multi-bio
2690 */
2691 if (atomic_read(&multi->error) > multi->max_errors) {
2692 err = -EIO;
2693 } else if (err) {
2694 /*
2695 * this bio is actually up to date, we didn't
2696 * go over the max number of errors
2697 */
2698 set_bit(BIO_UPTODATE, &bio->bi_flags);
2699 err = 0;
2700 }
2701 kfree(multi);
2702
2703 bio_endio(bio, err);
2704 } else if (!is_orig_bio) {
2705 bio_put(bio);
2706 }
2707}
2708
2709struct async_sched {
2710 struct bio *bio;
2711 int rw;
2712 struct btrfs_fs_info *info;
2713 struct btrfs_work work;
2714};
2715
2716/*
2717 * see run_scheduled_bios for a description of why bios are collected for
2718 * async submit.
2719 *
2720 * This will add one bio to the pending list for a device and make sure
2721 * the work struct is scheduled.
2722 */
2723static noinline int schedule_bio(struct btrfs_root *root,
2724 struct btrfs_device *device,
2725 int rw, struct bio *bio)
2726{
2727 int should_queue = 1;
2728
2729 /* don't bother with additional async steps for reads, right now */
2730 if (!(rw & (1 << BIO_RW))) {
2731 bio_get(bio);
2732 submit_bio(rw, bio);
2733 bio_put(bio);
2734 return 0;
2735 }
2736
2737 /*
2738 * nr_async_bios allows us to reliably return congestion to the
2739 * higher layers. Otherwise, the async bio makes it appear we have
2740 * made progress against dirty pages when we've really just put it
2741 * on a queue for later
2742 */
2743 atomic_inc(&root->fs_info->nr_async_bios);
2744 WARN_ON(bio->bi_next);
2745 bio->bi_next = NULL;
2746 bio->bi_rw |= rw;
2747
2748 spin_lock(&device->io_lock);
2749
2750 if (device->pending_bio_tail)
2751 device->pending_bio_tail->bi_next = bio;
2752
2753 device->pending_bio_tail = bio;
2754 if (!device->pending_bios)
2755 device->pending_bios = bio;
2756 if (device->running_pending)
2757 should_queue = 0;
2758
2759 spin_unlock(&device->io_lock);
2760
2761 if (should_queue)
2762 btrfs_queue_worker(&root->fs_info->submit_workers,
2763 &device->work);
2764 return 0;
2765}
2766
2767int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2768 int mirror_num, int async_submit)
2769{
2770 struct btrfs_mapping_tree *map_tree;
2771 struct btrfs_device *dev;
2772 struct bio *first_bio = bio;
2773 u64 logical = (u64)bio->bi_sector << 9;
2774 u64 length = 0;
2775 u64 map_length;
2776 struct btrfs_multi_bio *multi = NULL;
2777 int ret;
2778 int dev_nr = 0;
2779 int total_devs = 1;
2780
2781 length = bio->bi_size;
2782 map_tree = &root->fs_info->mapping_tree;
2783 map_length = length;
2784
2785 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2786 mirror_num);
2787 BUG_ON(ret);
2788
2789 total_devs = multi->num_stripes;
2790 if (map_length < length) {
2791 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2792 "len %llu\n", (unsigned long long)logical,
2793 (unsigned long long)length,
2794 (unsigned long long)map_length);
2795 BUG();
2796 }
2797 multi->end_io = first_bio->bi_end_io;
2798 multi->private = first_bio->bi_private;
2799 multi->orig_bio = first_bio;
2800 atomic_set(&multi->stripes_pending, multi->num_stripes);
2801
2802 while (dev_nr < total_devs) {
2803 if (total_devs > 1) {
2804 if (dev_nr < total_devs - 1) {
2805 bio = bio_clone(first_bio, GFP_NOFS);
2806 BUG_ON(!bio);
2807 } else {
2808 bio = first_bio;
2809 }
2810 bio->bi_private = multi;
2811 bio->bi_end_io = end_bio_multi_stripe;
2812 }
2813 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2814 dev = multi->stripes[dev_nr].dev;
2815 BUG_ON(rw == WRITE && !dev->writeable);
2816 if (dev && dev->bdev) {
2817 bio->bi_bdev = dev->bdev;
2818 if (async_submit)
2819 schedule_bio(root, dev, rw, bio);
2820 else
2821 submit_bio(rw, bio);
2822 } else {
2823 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2824 bio->bi_sector = logical >> 9;
2825 bio_endio(bio, -EIO);
2826 }
2827 dev_nr++;
2828 }
2829 if (total_devs == 1)
2830 kfree(multi);
2831 return 0;
2832}
2833
2834struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2835 u8 *uuid, u8 *fsid)
2836{
2837 struct btrfs_device *device;
2838 struct btrfs_fs_devices *cur_devices;
2839
2840 cur_devices = root->fs_info->fs_devices;
2841 while (cur_devices) {
2842 if (!fsid ||
2843 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2844 device = __find_device(&cur_devices->devices,
2845 devid, uuid);
2846 if (device)
2847 return device;
2848 }
2849 cur_devices = cur_devices->seed;
2850 }
2851 return NULL;
2852}
2853
2854static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2855 u64 devid, u8 *dev_uuid)
2856{
2857 struct btrfs_device *device;
2858 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2859
2860 device = kzalloc(sizeof(*device), GFP_NOFS);
2861 if (!device)
2862 return NULL;
2863 list_add(&device->dev_list,
2864 &fs_devices->devices);
2865 device->barriers = 1;
2866 device->dev_root = root->fs_info->dev_root;
2867 device->devid = devid;
2868 device->work.func = pending_bios_fn;
2869 device->fs_devices = fs_devices;
2870 fs_devices->num_devices++;
2871 spin_lock_init(&device->io_lock);
2872 INIT_LIST_HEAD(&device->dev_alloc_list);
2873 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2874 return device;
2875}
2876
2877static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2878 struct extent_buffer *leaf,
2879 struct btrfs_chunk *chunk)
2880{
2881 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2882 struct map_lookup *map;
2883 struct extent_map *em;
2884 u64 logical;
2885 u64 length;
2886 u64 devid;
2887 u8 uuid[BTRFS_UUID_SIZE];
2888 int num_stripes;
2889 int ret;
2890 int i;
2891
2892 logical = key->offset;
2893 length = btrfs_chunk_length(leaf, chunk);
2894
2895 spin_lock(&map_tree->map_tree.lock);
2896 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2897 spin_unlock(&map_tree->map_tree.lock);
2898
2899 /* already mapped? */
2900 if (em && em->start <= logical && em->start + em->len > logical) {
2901 free_extent_map(em);
2902 return 0;
2903 } else if (em) {
2904 free_extent_map(em);
2905 }
2906
2907 map = kzalloc(sizeof(*map), GFP_NOFS);
2908 if (!map)
2909 return -ENOMEM;
2910
2911 em = alloc_extent_map(GFP_NOFS);
2912 if (!em)
2913 return -ENOMEM;
2914 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2915 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2916 if (!map) {
2917 free_extent_map(em);
2918 return -ENOMEM;
2919 }
2920
2921 em->bdev = (struct block_device *)map;
2922 em->start = logical;
2923 em->len = length;
2924 em->block_start = 0;
2925 em->block_len = em->len;
2926
2927 map->num_stripes = num_stripes;
2928 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2929 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2930 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2931 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2932 map->type = btrfs_chunk_type(leaf, chunk);
2933 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2934 for (i = 0; i < num_stripes; i++) {
2935 map->stripes[i].physical =
2936 btrfs_stripe_offset_nr(leaf, chunk, i);
2937 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2938 read_extent_buffer(leaf, uuid, (unsigned long)
2939 btrfs_stripe_dev_uuid_nr(chunk, i),
2940 BTRFS_UUID_SIZE);
2941 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2942 NULL);
2943 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2944 kfree(map);
2945 free_extent_map(em);
2946 return -EIO;
2947 }
2948 if (!map->stripes[i].dev) {
2949 map->stripes[i].dev =
2950 add_missing_dev(root, devid, uuid);
2951 if (!map->stripes[i].dev) {
2952 kfree(map);
2953 free_extent_map(em);
2954 return -EIO;
2955 }
2956 }
2957 map->stripes[i].dev->in_fs_metadata = 1;
2958 }
2959
2960 spin_lock(&map_tree->map_tree.lock);
2961 ret = add_extent_mapping(&map_tree->map_tree, em);
2962 spin_unlock(&map_tree->map_tree.lock);
2963 BUG_ON(ret);
2964 free_extent_map(em);
2965
2966 return 0;
2967}
2968
2969static int fill_device_from_item(struct extent_buffer *leaf,
2970 struct btrfs_dev_item *dev_item,
2971 struct btrfs_device *device)
2972{
2973 unsigned long ptr;
2974
2975 device->devid = btrfs_device_id(leaf, dev_item);
2976 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2977 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2978 device->type = btrfs_device_type(leaf, dev_item);
2979 device->io_align = btrfs_device_io_align(leaf, dev_item);
2980 device->io_width = btrfs_device_io_width(leaf, dev_item);
2981 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2982
2983 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2984 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2985
2986 return 0;
2987}
2988
2989static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2990{
2991 struct btrfs_fs_devices *fs_devices;
2992 int ret;
2993
2994 mutex_lock(&uuid_mutex);
2995
2996 fs_devices = root->fs_info->fs_devices->seed;
2997 while (fs_devices) {
2998 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2999 ret = 0;
3000 goto out;
3001 }
3002 fs_devices = fs_devices->seed;
3003 }
3004
3005 fs_devices = find_fsid(fsid);
3006 if (!fs_devices) {
3007 ret = -ENOENT;
3008 goto out;
3009 }
3010
3011 fs_devices = clone_fs_devices(fs_devices);
3012 if (IS_ERR(fs_devices)) {
3013 ret = PTR_ERR(fs_devices);
3014 goto out;
3015 }
3016
3017 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3018 root->fs_info->bdev_holder);
3019 if (ret)
3020 goto out;
3021
3022 if (!fs_devices->seeding) {
3023 __btrfs_close_devices(fs_devices);
3024 free_fs_devices(fs_devices);
3025 ret = -EINVAL;
3026 goto out;
3027 }
3028
3029 fs_devices->seed = root->fs_info->fs_devices->seed;
3030 root->fs_info->fs_devices->seed = fs_devices;
3031out:
3032 mutex_unlock(&uuid_mutex);
3033 return ret;
3034}
3035
3036static int read_one_dev(struct btrfs_root *root,
3037 struct extent_buffer *leaf,
3038 struct btrfs_dev_item *dev_item)
3039{
3040 struct btrfs_device *device;
3041 u64 devid;
3042 int ret;
3043 u8 fs_uuid[BTRFS_UUID_SIZE];
3044 u8 dev_uuid[BTRFS_UUID_SIZE];
3045
3046 devid = btrfs_device_id(leaf, dev_item);
3047 read_extent_buffer(leaf, dev_uuid,
3048 (unsigned long)btrfs_device_uuid(dev_item),
3049 BTRFS_UUID_SIZE);
3050 read_extent_buffer(leaf, fs_uuid,
3051 (unsigned long)btrfs_device_fsid(dev_item),
3052 BTRFS_UUID_SIZE);
3053
3054 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3055 ret = open_seed_devices(root, fs_uuid);
3056 if (ret && !btrfs_test_opt(root, DEGRADED))
3057 return ret;
3058 }
3059
3060 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3061 if (!device || !device->bdev) {
3062 if (!btrfs_test_opt(root, DEGRADED))
3063 return -EIO;
3064
3065 if (!device) {
3066 printk(KERN_WARNING "warning devid %llu missing\n",
3067 (unsigned long long)devid);
3068 device = add_missing_dev(root, devid, dev_uuid);
3069 if (!device)
3070 return -ENOMEM;
3071 }
3072 }
3073
3074 if (device->fs_devices != root->fs_info->fs_devices) {
3075 BUG_ON(device->writeable);
3076 if (device->generation !=
3077 btrfs_device_generation(leaf, dev_item))
3078 return -EINVAL;
3079 }
3080
3081 fill_device_from_item(leaf, dev_item, device);
3082 device->dev_root = root->fs_info->dev_root;
3083 device->in_fs_metadata = 1;
3084 if (device->writeable)
3085 device->fs_devices->total_rw_bytes += device->total_bytes;
3086 ret = 0;
3087 return ret;
3088}
3089
3090int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3091{
3092 struct btrfs_dev_item *dev_item;
3093
3094 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3095 dev_item);
3096 return read_one_dev(root, buf, dev_item);
3097}
3098
3099int btrfs_read_sys_array(struct btrfs_root *root)
3100{
3101 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3102 struct extent_buffer *sb;
3103 struct btrfs_disk_key *disk_key;
3104 struct btrfs_chunk *chunk;
3105 u8 *ptr;
3106 unsigned long sb_ptr;
3107 int ret = 0;
3108 u32 num_stripes;
3109 u32 array_size;
3110 u32 len = 0;
3111 u32 cur;
3112 struct btrfs_key key;
3113
3114 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3115 BTRFS_SUPER_INFO_SIZE);
3116 if (!sb)
3117 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb);
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy);
3121
3122 ptr = super_copy->sys_chunk_array;
3123 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3124 cur = 0;
3125
3126 while (cur < array_size) {
3127 disk_key = (struct btrfs_disk_key *)ptr;
3128 btrfs_disk_key_to_cpu(&key, disk_key);
3129
3130 len = sizeof(*disk_key); ptr += len;
3131 sb_ptr += len;
3132 cur += len;
3133
3134 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3135 chunk = (struct btrfs_chunk *)sb_ptr;
3136 ret = read_one_chunk(root, &key, sb, chunk);
3137 if (ret)
3138 break;
3139 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3140 len = btrfs_chunk_item_size(num_stripes);
3141 } else {
3142 ret = -EIO;
3143 break;
3144 }
3145 ptr += len;
3146 sb_ptr += len;
3147 cur += len;
3148 }
3149 free_extent_buffer(sb);
3150 return ret;
3151}
3152
3153int btrfs_read_chunk_tree(struct btrfs_root *root)
3154{
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_key key;
3158 struct btrfs_key found_key;
3159 int ret;
3160 int slot;
3161
3162 root = root->fs_info->chunk_root;
3163
3164 path = btrfs_alloc_path();
3165 if (!path)
3166 return -ENOMEM;
3167
3168 /* first we search for all of the device items, and then we
3169 * read in all of the chunk items. This way we can create chunk
3170 * mappings that reference all of the devices that are afound
3171 */
3172 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3173 key.offset = 0;
3174 key.type = 0;
3175again:
3176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177 while (1) {
3178 leaf = path->nodes[0];
3179 slot = path->slots[0];
3180 if (slot >= btrfs_header_nritems(leaf)) {
3181 ret = btrfs_next_leaf(root, path);
3182 if (ret == 0)
3183 continue;
3184 if (ret < 0)
3185 goto error;
3186 break;
3187 }
3188 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3189 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3190 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3191 break;
3192 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3193 struct btrfs_dev_item *dev_item;
3194 dev_item = btrfs_item_ptr(leaf, slot,
3195 struct btrfs_dev_item);
3196 ret = read_one_dev(root, leaf, dev_item);
3197 if (ret)
3198 goto error;
3199 }
3200 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3201 struct btrfs_chunk *chunk;
3202 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3203 ret = read_one_chunk(root, &found_key, leaf, chunk);
3204 if (ret)
3205 goto error;
3206 }
3207 path->slots[0]++;
3208 }
3209 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3210 key.objectid = 0;
3211 btrfs_release_path(root, path);
3212 goto again;
3213 }
3214 ret = 0;
3215error:
3216 btrfs_free_path(path);
3217 return ret;
3218}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
/*
 * In-memory state for one device that belongs to, or is referenced by, a
 * btrfs filesystem.
 */
26struct btrfs_device {
 /* entry on btrfs_fs_devices.devices (see add_missing_dev in volumes.c) */
27 struct list_head dev_list;
 /* presumably the entry for btrfs_fs_devices.alloc_list -- TODO confirm */
28 struct list_head dev_alloc_list;
 /* the device set this device is part of */
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
 /*
  * NOTE(review): looks like a queue of bios waiting to be submitted,
  * drained through 'work' below (work.func is set to pending_bios_fn in
  * volumes.c) -- confirm against the submission code.
  */
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
 /*
  * read_one_dev compares this with the generation stored in the dev item
  * for devices that belong to another device set
  */
34 u64 generation;
35
36 int barriers;
 /* read_one_dev adds total_bytes to total_rw_bytes only when this is set */
37 int writeable;
 /* set to 1 once a dev item or a chunk stripe refers to this device */
38 int in_fs_metadata;
39
40 spinlock_t io_lock;
41
 /* stays NULL for placeholder devices created by add_missing_dev */
42 struct block_device *bdev;
43
44 /* the mode sent to open_bdev_exclusive */
45 fmode_t mode;
46
47 char *name;
48
49 /* the internal btrfs device id */
50 u64 devid;
51
52 /* size of the device */
53 u64 total_bytes;
54
55 /* bytes used */
56 u64 bytes_used;
57
58 /* optimal io alignment for this device */
59 u32 io_align;
60
61 /* optimal io width for this device */
62 u32 io_width;
63
64 /* minimal io size for this device */
65 u32 sector_size;
66
67 /* type and info about this device */
68 u64 type;
69
70 /* physical drive uuid (or lvm uuid) */
71 u8 uuid[BTRFS_UUID_SIZE];
72
73 struct btrfs_work work;
74};
75
/*
 * The set of devices that make up one filesystem, identified by fsid.
 */
76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78
79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid;
81 u64 latest_trans;
 /* incremented whenever a device is linked onto 'devices' */
82 u64 num_devices;
83 u64 open_devices;
84 u64 rw_devices;
 /* sum of total_bytes over the writeable devices (see read_one_dev) */
85 u64 total_rw_bytes;
86 struct block_device *latest_bdev;
87 /* all of the devices in the FS */
88 struct list_head devices;
89
90 /* devices not currently being allocated */
91 struct list_head alloc_list;
92 struct list_head list;
93
 /*
  * singly linked chain of seed device sets; open_seed_devices walks it
  * and pushes new entries onto its head
  */
94 struct btrfs_fs_devices *seed;
 /* non-zero for a seeding device set (checked in open_seed_devices) */
95 int seeding;
96
97 int opened;
98};
99
/*
 * One stripe target: a device plus a position on it.
 * NOTE(review): 'physical' is presumably a byte offset on 'dev' -- confirm
 * against btrfs_map_block.
 */
100struct btrfs_bio_stripe {
101 struct btrfs_device *dev;
102 u64 physical;
103};
104
/*
 * Bookkeeping for an I/O that is mapped onto several stripes.  The struct
 * ends in a flexible array with one btrfs_bio_stripe per stripe, so it must
 * be allocated with btrfs_multi_bio_size().
 * NOTE(review): the completion fields (stripes_pending, end_io, orig_bio,
 * private, error, max_errors) are presumably used to complete the original
 * bio once every stripe bio has finished -- confirm in volumes.c.
 */
105struct btrfs_multi_bio {
106 atomic_t stripes_pending;
107 bio_end_io_t *end_io;
108 struct bio *orig_bio;
109 void *private;
110 atomic_t error;
111 int max_errors;
112 int num_stripes;
113 struct btrfs_bio_stripe stripes[];
114};
115
/* bytes needed for a btrfs_multi_bio that carries n stripes */
116#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
117 (sizeof(struct btrfs_bio_stripe) * (n)))
118
/*
 * Prototypes for device and chunk-mapping management.
 * btrfs_read_sys_array, btrfs_read_chunk_tree and btrfs_read_super_device
 * are defined in volumes.c; the remaining functions are presumably defined
 * there as well -- TODO confirm.
 */
119int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
120 struct btrfs_device *device,
121 u64 chunk_tree, u64 chunk_objectid,
122 u64 chunk_offset, u64 start, u64 num_bytes);
123int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
124 u64 logical, u64 *length,
125 struct btrfs_multi_bio **multi_ret, int mirror_num);
126int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
127 u64 chunk_start, u64 physical, u64 devid,
128 u64 **logical, int *naddrs, int *stripe_len);
/* create chunk mappings from the super block's system chunk array */
129int btrfs_read_sys_array(struct btrfs_root *root);
/* read every dev item and chunk item from the chunk tree */
130int btrfs_read_chunk_tree(struct btrfs_root *root);
131int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
132 struct btrfs_root *extent_root, u64 type);
133void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
134void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
135int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
136 int mirror_num, int async_submit);
/* process the dev item embedded in the super block held in buf */
137int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
138int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
139 fmode_t flags, void *holder);
140int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
141 struct btrfs_fs_devices **fs_devices_ret);
142int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
143int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
144int btrfs_add_device(struct btrfs_trans_handle *trans,
145 struct btrfs_root *root,
146 struct btrfs_device *device);
147int btrfs_rm_device(struct btrfs_root *root, char *device_path);
148int btrfs_cleanup_fs_uuids(void);
149int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
150int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
151 u64 logical, struct page *page);
152int btrfs_grow_device(struct btrfs_trans_handle *trans,
153 struct btrfs_device *device, u64 new_size);
154struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
155 u8 *uuid, u8 *fsid);
156int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
157int btrfs_init_new_device(struct btrfs_root *root, char *path);
158int btrfs_balance(struct btrfs_root *dev_root);
159void btrfs_unlock_volumes(void);
160void btrfs_lock_volumes(void);
161int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first lets see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, lets remove it */
101 if (di) {
102 /* if we want create only exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
 /* the two ACL handlers are only listed when POSIX ACLs are configured */
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
 /* the array always ends with a NULL entry */
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
/*
 * ACL handlers and the NULL-terminated handler table; the table only
 * lists the ACL handlers when CONFIG_FS_POSIX_ACL is set (see xattr.c).
 */
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
/*
 * Inode-level helpers that read and modify xattr items directly; they do
 * no checking of the attribute namespace.
 */
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
/*
 * Dentry-level entry points: system.* names go to the generic handler
 * code, unsupported namespaces are rejected with -EOPNOTSUPP.
 */
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
39#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
/*
 * NOTE(review): btrfs_zlib_compress_pages does not reference this constant;
 * confirm whether the rest of the file still needs it.
 */
42#define STREAM_END_SPACE 12
43
/*
 * Everything one zlib user needs: a stream for each direction, a scratch
 * buffer, and a list link for parking the workspace while it is idle.
 */
44struct workspace {
 /* inflate stream; its ->workspace is vmalloc'ed in find_zlib_workspace */
45 z_stream inf_strm;
 /* deflate stream; its ->workspace is vmalloc'ed in find_zlib_workspace */
46 z_stream def_strm;
 /* kmalloc'ed buffer of PAGE_CACHE_SIZE bytes */
47 char *buf;
 /* entry on idle_workspace while the workspace is not in use */
48 struct list_head list;
49};
50
/* workspaces that are allocated but not in use right now */
51static LIST_HEAD(idle_workspace);
/* taken around changes to idle_workspace and num_workspace */
52static DEFINE_SPINLOCK(workspace_lock);
/* number of entries on idle_workspace */
53static unsigned long num_workspace;
/* number of workspaces currently allocated, idle or in use */
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
/* find_zlib_workspace sleeps here while too many workspaces exist */
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (!workspace)
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (!workspace)
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the last byte of the workspace buffer */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (!workspace)
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
/*
 * exit hook for the zlib code: all it does is call free_workspaces()
 */
void btrfs_zlib_exit(void)
{
	free_workspaces();
}