aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/COPYING356
-rw-r--r--fs/btrfs/INSTALL48
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c351
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h131
-rw-r--r--fs/btrfs/compat.h32
-rw-r--r--fs/btrfs/compression.c709
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h120
-rw-r--r--fs/btrfs/ctree.c3953
-rw-r--r--fs/btrfs/ctree.h2129
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2343
-rw-r--r--fs/btrfs/disk-io.h102
-rw-r--r--fs/btrfs/export.c203
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5990
-rw-r--r--fs/btrfs/extent_io.c3717
-rw-r--r--fs/btrfs/extent_io.h269
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c821
-rw-r--r--fs/btrfs/file.c1292
-rw-r--r--fs/btrfs/free-space-cache.c495
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c144
-rw-r--r--fs/btrfs/inode.c5040
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/ioctl.h67
-rw-r--r--fs/btrfs/locking.c88
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c730
-rw-r--r--fs/btrfs/ordered-data.h158
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c216
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c366
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c720
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1097
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c147
-rw-r--r--fs/btrfs/tree-log.c2996
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3218
-rw-r--r--fs/btrfs/volumes.h162
-rw-r--r--fs/btrfs/xattr.c322
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c632
57 files changed, 43004 insertions, 0 deletions
diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 000000000000..ca442d313d86
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
1
2 NOTE! This copyright does *not* cover user programs that use kernel
3 services by normal system calls - this is merely considered normal use
4 of the kernel, and does *not* fall under the heading of "derived work".
5 Also note that the GPL below is copyrighted by the Free Software
6 Foundation, but the instance of code that it refers to (the Linux
7 kernel) is copyrighted by me and others who actually wrote it.
8
9 Also note that the only valid version of the GPL as far as the kernel
10 is concerned is _this_ particular version of the license (ie v2, not
11 v2.2 or v3.x or whatever), unless explicitly otherwise stated.
12
13 Linus Torvalds
14
15----------------------------------------
16
17 GNU GENERAL PUBLIC LICENSE
18 Version 2, June 1991
19
20 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 Everyone is permitted to copy and distribute verbatim copies
23 of this license document, but changing it is not allowed.
24
25 Preamble
26
27 The licenses for most software are designed to take away your
28freedom to share and change it. By contrast, the GNU General Public
29License is intended to guarantee your freedom to share and change free
30software--to make sure the software is free for all its users. This
31General Public License applies to most of the Free Software
32Foundation's software and to any other program whose authors commit to
33using it. (Some other Free Software Foundation software is covered by
34the GNU Library General Public License instead.) You can apply it to
35your programs, too.
36
37 When we speak of free software, we are referring to freedom, not
38price. Our General Public Licenses are designed to make sure that you
39have the freedom to distribute copies of free software (and charge for
40this service if you wish), that you receive source code or can get it
41if you want it, that you can change the software or use pieces of it
42in new free programs; and that you know you can do these things.
43
44 To protect your rights, we need to make restrictions that forbid
45anyone to deny you these rights or to ask you to surrender the rights.
46These restrictions translate to certain responsibilities for you if you
47distribute copies of the software, or if you modify it.
48
49 For example, if you distribute copies of such a program, whether
50gratis or for a fee, you must give the recipients all the rights that
51you have. You must make sure that they, too, receive or can get the
52source code. And you must show them these terms so they know their
53rights.
54
55 We protect your rights with two steps: (1) copyright the software, and
56(2) offer you this license which gives you legal permission to copy,
57distribute and/or modify the software.
58
59 Also, for each author's protection and ours, we want to make certain
60that everyone understands that there is no warranty for this free
61software. If the software is modified by someone else and passed on, we
62want its recipients to know that what they have is not the original, so
63that any problems introduced by others will not reflect on the original
64authors' reputations.
65
66 Finally, any free program is threatened constantly by software
67patents. We wish to avoid the danger that redistributors of a free
68program will individually obtain patent licenses, in effect making the
69program proprietary. To prevent this, we have made it clear that any
70patent must be licensed for everyone's free use or not licensed at all.
71
72 The precise terms and conditions for copying, distribution and
73modification follow.
74
75 GNU GENERAL PUBLIC LICENSE
76 TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
77
78 0. This License applies to any program or other work which contains
79a notice placed by the copyright holder saying it may be distributed
80under the terms of this General Public License. The "Program", below,
81refers to any such program or work, and a "work based on the Program"
82means either the Program or any derivative work under copyright law:
83that is to say, a work containing the Program or a portion of it,
84either verbatim or with modifications and/or translated into another
85language. (Hereinafter, translation is included without limitation in
86the term "modification".) Each licensee is addressed as "you".
87
88Activities other than copying, distribution and modification are not
89covered by this License; they are outside its scope. The act of
90running the Program is not restricted, and the output from the Program
91is covered only if its contents constitute a work based on the
92Program (independent of having been made by running the Program).
93Whether that is true depends on what the Program does.
94
95 1. You may copy and distribute verbatim copies of the Program's
96source code as you receive it, in any medium, provided that you
97conspicuously and appropriately publish on each copy an appropriate
98copyright notice and disclaimer of warranty; keep intact all the
99notices that refer to this License and to the absence of any warranty;
100and give any other recipients of the Program a copy of this License
101along with the Program.
102
103You may charge a fee for the physical act of transferring a copy, and
104you may at your option offer warranty protection in exchange for a fee.
105
106 2. You may modify your copy or copies of the Program or any portion
107of it, thus forming a work based on the Program, and copy and
108distribute such modifications or work under the terms of Section 1
109above, provided that you also meet all of these conditions:
110
111 a) You must cause the modified files to carry prominent notices
112 stating that you changed the files and the date of any change.
113
114 b) You must cause any work that you distribute or publish, that in
115 whole or in part contains or is derived from the Program or any
116 part thereof, to be licensed as a whole at no charge to all third
117 parties under the terms of this License.
118
119 c) If the modified program normally reads commands interactively
120 when run, you must cause it, when started running for such
121 interactive use in the most ordinary way, to print or display an
122 announcement including an appropriate copyright notice and a
123 notice that there is no warranty (or else, saying that you provide
124 a warranty) and that users may redistribute the program under
125 these conditions, and telling the user how to view a copy of this
126 License. (Exception: if the Program itself is interactive but
127 does not normally print such an announcement, your work based on
128 the Program is not required to print an announcement.)
129
130These requirements apply to the modified work as a whole. If
131identifiable sections of that work are not derived from the Program,
132and can be reasonably considered independent and separate works in
133themselves, then this License, and its terms, do not apply to those
134sections when you distribute them as separate works. But when you
135distribute the same sections as part of a whole which is a work based
136on the Program, the distribution of the whole must be on the terms of
137this License, whose permissions for other licensees extend to the
138entire whole, and thus to each and every part regardless of who wrote it.
139
140Thus, it is not the intent of this section to claim rights or contest
141your rights to work written entirely by you; rather, the intent is to
142exercise the right to control the distribution of derivative or
143collective works based on the Program.
144
145In addition, mere aggregation of another work not based on the Program
146with the Program (or with a work based on the Program) on a volume of
147a storage or distribution medium does not bring the other work under
148the scope of this License.
149
150 3. You may copy and distribute the Program (or a work based on it,
151under Section 2) in object code or executable form under the terms of
152Sections 1 and 2 above provided that you also do one of the following:
153
154 a) Accompany it with the complete corresponding machine-readable
155 source code, which must be distributed under the terms of Sections
156 1 and 2 above on a medium customarily used for software interchange; or,
157
158 b) Accompany it with a written offer, valid for at least three
159 years, to give any third party, for a charge no more than your
160 cost of physically performing source distribution, a complete
161 machine-readable copy of the corresponding source code, to be
162 distributed under the terms of Sections 1 and 2 above on a medium
163 customarily used for software interchange; or,
164
165 c) Accompany it with the information you received as to the offer
166 to distribute corresponding source code. (This alternative is
167 allowed only for noncommercial distribution and only if you
168 received the program in object code or executable form with such
169 an offer, in accord with Subsection b above.)
170
171The source code for a work means the preferred form of the work for
172making modifications to it. For an executable work, complete source
173code means all the source code for all modules it contains, plus any
174associated interface definition files, plus the scripts used to
175control compilation and installation of the executable. However, as a
176special exception, the source code distributed need not include
177anything that is normally distributed (in either source or binary
178form) with the major components (compiler, kernel, and so on) of the
179operating system on which the executable runs, unless that component
180itself accompanies the executable.
181
182If distribution of executable or object code is made by offering
183access to copy from a designated place, then offering equivalent
184access to copy the source code from the same place counts as
185distribution of the source code, even though third parties are not
186compelled to copy the source along with the object code.
187
188 4. You may not copy, modify, sublicense, or distribute the Program
189except as expressly provided under this License. Any attempt
190otherwise to copy, modify, sublicense or distribute the Program is
191void, and will automatically terminate your rights under this License.
192However, parties who have received copies, or rights, from you under
193this License will not have their licenses terminated so long as such
194parties remain in full compliance.
195
196 5. You are not required to accept this License, since you have not
197signed it. However, nothing else grants you permission to modify or
198distribute the Program or its derivative works. These actions are
199prohibited by law if you do not accept this License. Therefore, by
200modifying or distributing the Program (or any work based on the
201Program), you indicate your acceptance of this License to do so, and
202all its terms and conditions for copying, distributing or modifying
203the Program or works based on it.
204
205 6. Each time you redistribute the Program (or any work based on the
206Program), the recipient automatically receives a license from the
207original licensor to copy, distribute or modify the Program subject to
208these terms and conditions. You may not impose any further
209restrictions on the recipients' exercise of the rights granted herein.
210You are not responsible for enforcing compliance by third parties to
211this License.
212
213 7. If, as a consequence of a court judgment or allegation of patent
214infringement or for any other reason (not limited to patent issues),
215conditions are imposed on you (whether by court order, agreement or
216otherwise) that contradict the conditions of this License, they do not
217excuse you from the conditions of this License. If you cannot
218distribute so as to satisfy simultaneously your obligations under this
219License and any other pertinent obligations, then as a consequence you
220may not distribute the Program at all. For example, if a patent
221license would not permit royalty-free redistribution of the Program by
222all those who receive copies directly or indirectly through you, then
223the only way you could satisfy both it and this License would be to
224refrain entirely from distribution of the Program.
225
226If any portion of this section is held invalid or unenforceable under
227any particular circumstance, the balance of the section is intended to
228apply and the section as a whole is intended to apply in other
229circumstances.
230
231It is not the purpose of this section to induce you to infringe any
232patents or other property right claims or to contest validity of any
233such claims; this section has the sole purpose of protecting the
234integrity of the free software distribution system, which is
235implemented by public license practices. Many people have made
236generous contributions to the wide range of software distributed
237through that system in reliance on consistent application of that
238system; it is up to the author/donor to decide if he or she is willing
239to distribute software through any other system and a licensee cannot
240impose that choice.
241
242This section is intended to make thoroughly clear what is believed to
243be a consequence of the rest of this License.
244
245 8. If the distribution and/or use of the Program is restricted in
246certain countries either by patents or by copyrighted interfaces, the
247original copyright holder who places the Program under this License
248may add an explicit geographical distribution limitation excluding
249those countries, so that distribution is permitted only in or among
250countries not thus excluded. In such case, this License incorporates
251the limitation as if written in the body of this License.
252
253 9. The Free Software Foundation may publish revised and/or new versions
254of the General Public License from time to time. Such new versions will
255be similar in spirit to the present version, but may differ in detail to
256address new problems or concerns.
257
258Each version is given a distinguishing version number. If the Program
259specifies a version number of this License which applies to it and "any
260later version", you have the option of following the terms and conditions
261either of that version or of any later version published by the Free
262Software Foundation. If the Program does not specify a version number of
263this License, you may choose any version ever published by the Free Software
264Foundation.
265
266 10. If you wish to incorporate parts of the Program into other free
267programs whose distribution conditions are different, write to the author
268to ask for permission. For software which is copyrighted by the Free
269Software Foundation, write to the Free Software Foundation; we sometimes
270make exceptions for this. Our decision will be guided by the two goals
271of preserving the free status of all derivatives of our free software and
272of promoting the sharing and reuse of software generally.
273
274 NO WARRANTY
275
276 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
277FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
278OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
279PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
280OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
281MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
282TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
283PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
284REPAIR OR CORRECTION.
285
286 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
287WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
288REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
289INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
290OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
291TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
292YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
293PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
294POSSIBILITY OF SUCH DAMAGES.
295
296 END OF TERMS AND CONDITIONS
297
298 How to Apply These Terms to Your New Programs
299
300 If you develop a new program, and you want it to be of the greatest
301possible use to the public, the best way to achieve this is to make it
302free software which everyone can redistribute and change under these terms.
303
304 To do so, attach the following notices to the program. It is safest
305to attach them to the start of each source file to most effectively
306convey the exclusion of warranty; and each file should have at least
307the "copyright" line and a pointer to where the full notice is found.
308
309 <one line to give the program's name and a brief idea of what it does.>
310 Copyright (C) <year> <name of author>
311
312 This program is free software; you can redistribute it and/or modify
313 it under the terms of the GNU General Public License as published by
314 the Free Software Foundation; either version 2 of the License, or
315 (at your option) any later version.
316
317 This program is distributed in the hope that it will be useful,
318 but WITHOUT ANY WARRANTY; without even the implied warranty of
319 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
320 GNU General Public License for more details.
321
322 You should have received a copy of the GNU General Public License
323 along with this program; if not, write to the Free Software
324 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
325
326
327Also add information on how to contact you by electronic and paper mail.
328
329If the program is interactive, make it output a short notice like this
330when it starts in an interactive mode:
331
332 Gnomovision version 69, Copyright (C) year name of author
333 Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
334 This is free software, and you are welcome to redistribute it
335 under certain conditions; type `show c' for details.
336
337The hypothetical commands `show w' and `show c' should show the appropriate
338parts of the General Public License. Of course, the commands you use may
339be called something other than `show w' and `show c'; they could even be
340mouse-clicks or menu items--whatever suits your program.
341
342You should also get your employer (if you work as a programmer) or your
343school, if any, to sign a "copyright disclaimer" for the program, if
344necessary. Here is a sample; alter the names:
345
346 Yoyodyne, Inc., hereby disclaims all copyright interest in the program
347 `Gnomovision' (which makes passes at compilers) written by James Hacker.
348
349 <signature of Ty Coon>, 1 April 1989
350 Ty Coon, President of Vice
351
352This General Public License does not permit incorporating your program into
353proprietary programs. If your program is a subroutine library, you may
354consider it more useful to permit linking proprietary applications with the
355library. If this is what you want to do, use the GNU Library General
356Public License instead of this License.
diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
new file mode 100644
index 000000000000..16b45a56878d
--- /dev/null
+++ b/fs/btrfs/INSTALL
@@ -0,0 +1,48 @@
1Install Instructions
2
3Btrfs puts snapshots and subvolumes into the root directory of the FS. This
4directory can only be changed by btrfsctl right now, and normal filesystem
5operations do not work on it. The default subvolume is called 'default',
6and you can create files and directories in mount_point/default
7
8Btrfs uses libcrc32c in the kernel for file and metadata checksums. You need
9to compile the kernel with:
10
11CONFIG_LIBCRC32C=m
12
13libcrc32c can be static as well. Once your kernel is set up, typing make in the
14btrfs module sources will build against the running kernel. When the build is
15complete:
16
17modprobe libcrc32c
18insmod btrfs.ko
19
20The Btrfs utility programs require libuuid to build. This can be found
21in the e2fsprogs sources, and is usually available as libuuid or
22e2fsprogs-devel from various distros.
23
24Building the utilities is just make ; make install. The programs go
25into /usr/local/bin. The commands available are:
26
27mkfs.btrfs: create a filesystem
28
29btrfsctl: control program to create snapshots and subvolumes:
30
31 mount /dev/sda2 /mnt
32 btrfsctl -s new_subvol_name /mnt
33 btrfsctl -s snapshot_of_default /mnt/default
34 btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
35 btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
36 ls /mnt
37 default snapshot_of_a_snapshot snapshot_of_new_subvol
38 new_subvol_name snapshot_of_default
39
40 Snapshots and subvolumes cannot be deleted right now, but you can
41 rm -rf all the files and directories inside them.
42
43btrfsck: do a limited check of the FS extent trees.
44
45debug-tree: print all of the FS metadata in text form. Example:
46
47 debug-tree /dev/sda2 >& big_output_file
48
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
/* Built without CONFIG_FS_POSIX_ACL: there is no ACL to adjust on chmod. */
int btrfs_acl_chmod(struct inode *inode)
{
	return 0;
}
340
/* Built without CONFIG_FS_POSIX_ACL: new inodes get no ACL set up. */
int btrfs_init_acl(struct inode *inode, struct inode *dir)
{
	return 0;
}
345
/*
 * Built without CONFIG_FS_POSIX_ACL: no inode ever has an access ACL.
 *
 * Report that the same way the ACL-enabled btrfs_check_acl() does for an
 * inode without an ACL, i.e. -EAGAIN, so the caller falls back to the
 * ordinary mode bits.  Returning 0 here would read as "access granted by
 * the ACL".
 */
int btrfs_check_acl(struct inode *inode, int mask)
{
	return -EAGAIN;
}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23# include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
/*
 * container for the kthread task pointer and the list of pending work
 * One of these is allocated per thread.
 */
struct btrfs_worker_thread {
	/* pool we belong to */
	struct btrfs_workers *workers;

	/* list of struct btrfs_work that are waiting for service */
	struct list_head pending;

	/*
	 * link into either workers->worker_list or workers->idle_list,
	 * depending on the 'idle' flag below
	 */
	struct list_head worker_list;

	/* kthread */
	struct task_struct *task;

	/* number of things on the pending list */
	atomic_t num_pending;

	/*
	 * bumped by next_worker() each time this thread is picked while
	 * busy; used there to decide when to rotate the thread to the
	 * tail of worker_list
	 */
	unsigned long sequence;

	/* protects the pending list. */
	spinlock_t lock;

	/*
	 * set to non-zero when this thread is already awake and kicking;
	 * set by btrfs_queue_worker() and cleared by worker_loop() once
	 * the pending list has been drained
	 */
	int working;

	/* are we currently idle (i.e. linked on workers->idle_list) */
	int idle;
};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
/*
 * For ordered pools, mark 'work' as done and then run the ordered
 * completion callbacks for the finished items at the head of the pool's
 * order_list.  Does nothing when workers->ordered is zero.
 *
 * Processing stops at the first item that is not done yet, or whose
 * WORK_ORDER_DONE_BIT was already set by an earlier pass.
 * Always returns 0.
 */
static noinline int run_ordered_completions(struct btrfs_workers *workers,
					    struct btrfs_work *work)
{
	unsigned long flags;

	if (!workers->ordered)
		return 0;

	set_bit(WORK_DONE_BIT, &work->flags);

	spin_lock_irqsave(&workers->lock, flags);

	while (!list_empty(&workers->order_list)) {
		/* from here on 'work' is the cursor over order_list */
		work = list_entry(workers->order_list.next,
				  struct btrfs_work, order_list);

		if (!test_bit(WORK_DONE_BIT, &work->flags))
			break;

		/* we are going to call the ordered done function, but
		 * we leave the work item on the list as a barrier so
		 * that later work items that are done don't have their
		 * functions called before this one returns
		 */
		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
			break;

		/* ordered_func runs without the pool lock held */
		spin_unlock_irqrestore(&workers->lock, flags);

		work->ordered_func(work);

		/* now take the lock again and call the freeing code */
		spin_lock_irqsave(&workers->lock, flags);
		list_del(&work->order_list);
		/* note: ordered_free is invoked with workers->lock held */
		work->ordered_free(work);
	}

	spin_unlock_irqrestore(&workers->lock, flags);
	return 0;
}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
/*
 * run through the list and find a worker thread that doesn't have a lot
 * to do right now.  This can return null if we aren't yet at the thread
 * count limit and all of the threads are busy.
 *
 * find_worker() calls this with workers->lock held.
 */
static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
{
	struct btrfs_worker_thread *worker;
	struct list_head *next;
	/* non-zero while the pool is still allowed to grow */
	int enforce_min = workers->num_workers < workers->max_workers;

	/*
	 * if we find an idle thread, don't move it to the end of the
	 * idle list.  This improves the chance that the next submission
	 * will reuse the same thread, and maybe catch it while it is still
	 * working
	 */
	if (!list_empty(&workers->idle_list)) {
		next = workers->idle_list.next;
		worker = list_entry(next, struct btrfs_worker_thread,
				    worker_list);
		return worker;
	}
	/* no idle thread: let the caller start a new one if it may */
	if (enforce_min || list_empty(&workers->worker_list))
		return NULL;

	/*
	 * if we pick a busy task, move the task to the end of the list.
	 * hopefully this will keep things somewhat evenly balanced.
	 * Do the move in batches based on the sequence number.  This groups
	 * requests submitted at roughly the same time onto the same worker.
	 */
	next = workers->worker_list.next;
	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
	/*
	 * NOTE(review): btrfs_queue_worker() increments num_pending again
	 * for the same work item, so a busy pick appears to be counted
	 * twice -- confirm whether this is intended.
	 */
	atomic_inc(&worker->num_pending);
	worker->sequence++;

	if (worker->sequence % workers->idle_thresh == 0)
		list_move_tail(next, &workers->worker_list);
	return worker;
}
302
/*
 * selects a worker thread to take the next job.  This will either find
 * an idle worker, start a new worker up to the max count, or just return
 * one of the existing busy workers.
 */
static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
	struct btrfs_worker_thread *worker;
	unsigned long flags;

again:
	spin_lock_irqsave(&workers->lock, flags);
	worker = next_worker(workers);
	spin_unlock_irqrestore(&workers->lock, flags);

	if (!worker) {
		spin_lock_irqsave(&workers->lock, flags);
		if (workers->num_workers >= workers->max_workers) {
			struct list_head *fallback = NULL;
			/*
			 * we have failed to find any workers, just
			 * return the first one we can find; an entry on
			 * the idle list wins over one on the busy list
			 */
			if (!list_empty(&workers->worker_list))
				fallback = workers->worker_list.next;
			if (!list_empty(&workers->idle_list))
				fallback = workers->idle_list.next;
			BUG_ON(!fallback);
			worker = list_entry(fallback,
				  struct btrfs_worker_thread, worker_list);
			spin_unlock_irqrestore(&workers->lock, flags);
		} else {
			spin_unlock_irqrestore(&workers->lock, flags);
			/* we're below the limit, start another worker */
			/*
			 * NOTE(review): the return value is ignored; if
			 * thread creation keeps failing this retries
			 * forever -- confirm that is acceptable.
			 */
			btrfs_start_workers(workers, 1);
			goto again;
		}
	}
	return worker;
}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
/*
 * This is similar to a workqueue, but it is meant to spread the operations
 * across all available cpus instead of just the CPU that was used to
 * queue the work.  There is also some batching introduced to try and
 * cut down on context switches.
 *
 * By default threads are added on demand up to 2 * the number of cpus.
 * Changing struct btrfs_workers->max_workers is one way to prevent
 * demand creation of kthreads.
 *
 * the basic model of these worker threads is to embed a btrfs_work
 * structure in your own data struct, and use container_of in a
 * work function to get back to your data struct.
 */
struct btrfs_work {
	/*
	 * func should be set to the function you want called
	 * your work struct is passed as the only arg
	 *
	 * ordered_func must be set for work sent to an ordered work queue,
	 * and it is called to complete a given work item in the same
	 * order they were sent to the queue.
	 *
	 * ordered_free is called after ordered_func has returned and the
	 * item has been removed from the pool's order list.
	 */
	void (*func)(struct btrfs_work *work);
	void (*ordered_func)(struct btrfs_work *work);
	void (*ordered_free)(struct btrfs_work *work);

	/*
	 * flags should be set to zero.  It is used to make sure the
	 * struct is only inserted once into the list.
	 */
	unsigned long flags;

	/* don't touch these */
	struct btrfs_worker_thread *worker;
	struct list_head list;
	struct list_head order_list;
};
62
/* a pool of worker threads plus the state used to pick one for new work */
struct btrfs_workers {
	/* current number of running workers */
	int num_workers;

	/*
	 * max number of workers allowed.  Set by btrfs_init_workers;
	 * threads are created on demand while num_workers is below it.
	 */
	int max_workers;

	/*
	 * a worker on the idle list becomes busy once it has this many
	 * pending requests, and goes back to the idle list when it drops
	 * below half of this value
	 */
	int idle_thresh;

	/* force completions in the order they were queued */
	int ordered;

	/* list with all the work threads.  The workers on the idle list
	 * may be actively servicing jobs, but they haven't yet hit the
	 * idle thresh limit above.
	 */
	struct list_head worker_list;
	struct list_head idle_list;

	/*
	 * when operating in ordered mode, this maintains the list
	 * of work items waiting for completion
	 */
	struct list_head order_list;

	/*
	 * lock for finding the next worker thread to queue on; also
	 * taken around changes to the three lists above
	 */
	spinlock_t lock;

	/* extra name for this pool, used when naming its kthreads */
	char *name;
};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
/* in memory btrfs inode */
struct btrfs_inode {
	/* which subvolume this inode belongs to */
	struct btrfs_root *root;

	/* key used to find this inode on disk.  This is used by the code
	 * to read in roots of subvolumes
	 */
	struct btrfs_key location;

	/* the extent_tree has caches of all the extent mappings to disk */
	struct extent_map_tree extent_tree;

	/* the io_tree does range state (DIRTY, LOCKED etc) */
	struct extent_io_tree io_tree;

	/* special utility tree used to record which mirrors have already been
	 * tried when checksums fail for a given block
	 */
	struct extent_io_tree io_failure_tree;

	/* held while inserting or deleting extents from files */
	struct mutex extent_mutex;

	/* held while logging the inode in tree-log.c */
	struct mutex log_mutex;

	/* used to order data wrt metadata */
	struct btrfs_ordered_inode_tree ordered_tree;

	/* standard acl pointers */
	struct posix_acl *i_acl;
	struct posix_acl *i_default_acl;

	/* for keeping track of orphaned inodes */
	struct list_head i_orphan;

	/* list of all the delalloc inodes in the FS.  There are times we need
	 * to write all the delalloc pages to disk, and this list is used
	 * to walk them all.
	 */
	struct list_head delalloc_inodes;

	/* full 64 bit generation number, the VFS inode (struct inode)
	 * doesn't have a big enough field for this.
	 */
	u64 generation;

	/* sequence number for NFS changes */
	u64 sequence;

	/*
	 * transid of the trans_handle that last modified this inode
	 */
	u64 last_trans;
	/*
	 * transid that last logged this inode
	 */
	u64 logged_trans;

	/*
	 * trans that last made a change that should be fully fsync'd.  This
	 * gets reset to zero each time the inode is logged
	 */
	u64 log_dirty_trans;

	/* total number of bytes pending delalloc, used by stat to calc the
	 * real block usage of the file
	 */
	u64 delalloc_bytes;

	/*
	 * the size of the file stored in the metadata on disk.  data=ordered
	 * means the in-memory i_size might be larger than the size on disk
	 * because not all the blocks are written yet.
	 */
	u64 disk_i_size;

	/* flags field from the on disk inode */
	u32 flags;

	/*
	 * if this is a directory then index_cnt is the counter for the index
	 * number for new files that are created
	 */
	u64 index_cnt;

	/* the start of block group preferred for allocations. */
	u64 block_group;

	/* embedded VFS inode; BTRFS_I() maps a pointer to it back to us */
	struct inode vfs_inode;
};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..594d60bdd3c4
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,32 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 27)
8static inline struct dentry *d_obtain_alias(struct inode *inode)
9{
10 struct dentry *d;
11
12 if (!inode)
13 return NULL;
14 if (IS_ERR(inode))
15 return ERR_CAST(inode);
16
17 d = d_alloc_anon(inode);
18 if (!d)
19 iput(inode);
20 return d;
21}
22#endif
23
24#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
25# define __pagevec_lru_add_file __pagevec_lru_add
26# define open_bdev_exclusive open_bdev_excl
27# define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
28typedef unsigned __bitwise__ fmode_t;
29#endif
30
31
32#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
/*
 * per-extent bookkeeping shared by all the bios that read or write one
 * compressed extent; freed by the end_io handler of the last bio
 */
struct compressed_bio {
	/* number of bios pending for this compressed extent */
	atomic_t pending_bios;

	/* the pages with the compressed data on them */
	struct page **compressed_pages;

	/* inode that owns this data */
	struct inode *inode;

	/* starting offset in the inode for our pages */
	u64 start;

	/* number of bytes in the inode we're working on */
	unsigned long len;

	/* number of bytes on disk */
	unsigned long compressed_len;

	/* number of compressed pages in the array */
	unsigned long nr_pages;

	/* IO errors */
	int errors;
	int mirror_num;

	/* for reads, this is the bio we are copying the data into */
	struct bio *orig_bio;

	/*
	 * the start of a variable length array of checksums only
	 * used by reads.  compressed_bio_size() accounts for the extra
	 * room after the struct and check_compressed_csum() walks it
	 * starting at &sums.
	 */
	u32 sums;
};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, lets start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
/*
 * do the cleanup once all the compressed pages hit the disk.
 * This will clear writeback on the file pages and free the compressed
 * pages.
 *
 * This also calls the writeback end hooks for the file pages so that
 * metadata and checksums can be updated in the file.
 *
 * Only the last bio to complete for a given compressed_bio does the
 * cleanup; earlier ones just record an error (if any) and drop their
 * bio reference.
 */
static void end_compressed_bio_write(struct bio *bio, int err)
{
	struct extent_io_tree *tree;
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned long index;

	if (err)
		cb->errors = 1;

	/* if there are more bios still pending for this compressed
	 * extent, just exit
	 */
	if (!atomic_dec_and_test(&cb->pending_bios))
		goto out;

	/* ok, we're the last bio for this extent, step one is to
	 * call back into the FS and do all the end_io operations
	 */
	inode = cb->inode;
	tree = &BTRFS_I(inode)->io_tree;
	/*
	 * the first compressed page is temporarily given the inode's
	 * mapping for the duration of the hook call and detached again
	 * right after
	 */
	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
					 cb->start,
					 cb->start + cb->len - 1,
					 NULL, 1);
	cb->compressed_pages[0]->mapping = NULL;

	end_compressed_writeback(inode, cb->start, cb->len);
	/* note, our inode could be gone now */

	/*
	 * release the compressed pages, these came from alloc_page and
	 * are not attached to the inode at all
	 */
	index = 0;
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		page_cache_release(page);
	}

	/* finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
437static noinline int add_ra_bio_pages(struct inode *inode,
438 u64 compressed_end,
439 struct compressed_bio *cb)
440{
441 unsigned long end_index;
442 unsigned long page_index;
443 u64 last_offset;
444 u64 isize = i_size_read(inode);
445 int ret;
446 struct page *page;
447 unsigned long nr_pages = 0;
448 struct extent_map *em;
449 struct address_space *mapping = inode->i_mapping;
450 struct pagevec pvec;
451 struct extent_map_tree *em_tree;
452 struct extent_io_tree *tree;
453 u64 end;
454 int misses = 0;
455
456 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
457 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
458 em_tree = &BTRFS_I(inode)->extent_tree;
459 tree = &BTRFS_I(inode)->io_tree;
460
461 if (isize == 0)
462 return 0;
463
464 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
465
466 pagevec_init(&pvec, 0);
467 while (last_offset < compressed_end) {
468 page_index = last_offset >> PAGE_CACHE_SHIFT;
469
470 if (page_index > end_index)
471 break;
472
473 rcu_read_lock();
474 page = radix_tree_lookup(&mapping->page_tree, page_index);
475 rcu_read_unlock();
476 if (page) {
477 misses++;
478 if (misses > 4)
479 break;
480 goto next;
481 }
482
483 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
484 if (!page)
485 break;
486
487 page->index = page_index;
488 /*
489 * what we want to do here is call add_to_page_cache_lru,
490 * but that isn't exported, so we reproduce it here
491 */
492 if (add_to_page_cache(page, mapping,
493 page->index, GFP_NOFS)) {
494 page_cache_release(page);
495 goto next;
496 }
497
498 /* open coding of lru_cache_add, also not exported */
499 page_cache_get(page);
500 if (!pagevec_add(&pvec, page))
501 __pagevec_lru_add_file(&pvec);
502
503 end = last_offset + PAGE_CACHE_SIZE - 1;
504 /*
505 * at this point, we have a locked page in the page cache
506 * for these bytes in the file. But, we have to make
507 * sure they map to this compressed extent on disk.
508 */
509 set_page_extent_mapped(page);
510 lock_extent(tree, last_offset, end, GFP_NOFS);
511 spin_lock(&em_tree->lock);
512 em = lookup_extent_mapping(em_tree, last_offset,
513 PAGE_CACHE_SIZE);
514 spin_unlock(&em_tree->lock);
515
516 if (!em || last_offset < em->start ||
517 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
518 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
519 free_extent_map(em);
520 unlock_extent(tree, last_offset, end, GFP_NOFS);
521 unlock_page(page);
522 page_cache_release(page);
523 break;
524 }
525 free_extent_map(em);
526
527 if (page->index == end_index) {
528 char *userpage;
529 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
530
531 if (zero_offset) {
532 int zeros;
533 zeros = PAGE_CACHE_SIZE - zero_offset;
534 userpage = kmap_atomic(page, KM_USER0);
535 memset(userpage + zero_offset, 0, zeros);
536 flush_dcache_page(page);
537 kunmap_atomic(userpage, KM_USER0);
538 }
539 }
540
541 ret = bio_add_page(cb->orig_bio, page,
542 PAGE_CACHE_SIZE, 0);
543
544 if (ret == PAGE_CACHE_SIZE) {
545 nr_pages++;
546 page_cache_release(page);
547 } else {
548 unlock_extent(tree, last_offset, end, GFP_NOFS);
549 unlock_page(page);
550 page_cache_release(page);
551 break;
552 }
553next:
554 last_offset += PAGE_CACHE_SIZE;
555 }
556 if (pagevec_count(&pvec))
557 __pagevec_lru_add_file(&pvec);
558 return 0;
559}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra-bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
/*
 * zlib helpers.  NOTE(review): their definitions are not part of this
 * header; the parameter names suggest byte-buffer, page-array and bio_vec
 * variants of (de)compression -- confirm against the implementation.
 */
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
/*
 * bio submission for compressed extents: the write side takes an array of
 * already compressed pages plus the disk offset to write them to, the read
 * side takes the bio holding the inode pages to fill.
 */
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..1eaf11d334fd
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,120 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <asm/byteorder.h>
22#include <linux/crc32c.h>
23#include <linux/version.h>
24
25/* #define CONFIG_BTRFS_HW_SUM 1 */
26
27#ifdef CONFIG_BTRFS_HW_SUM
28#ifdef CONFIG_X86
29/*
30 * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
31 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
32 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
33 * http://www.intel.com/products/processor/manuals/
34 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
35 * Volume 2A: Instruction Set Reference, A-M
36 */
37
38#include <asm/cpufeature.h>
39#include <asm/processor.h>
40
41#define X86_FEATURE_XMM4_2 (4*32+20) /* Streaming SIMD Extensions-4.2 */
42#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
43
44#ifdef CONFIG_X86_64
45#define REX_PRE "0x48, "
46#define SCALE_F 8
47#else
48#define REX_PRE
49#define SCALE_F 4
50#endif
51
/*
 * Fold `length` bytes starting at `data` into `crc`, one byte per loop
 * iteration, and return the updated value.
 *
 * The instruction is emitted as raw opcode bytes rather than by mnemonic.
 * The asm constraints tie `crc` to the SI register for both input and
 * output ("=S" / "0") and pass the current byte in the C register ("c").
 */
52static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
53 size_t length)
54{
55 while (length--) {
56 __asm__ __volatile__(
57 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
58 :"=S"(crc)
59 :"0"(crc), "c"(*data)
60 );
61 data++;
62 }
63
64 return crc;
65}
66
/*
 * Fold `len` bytes starting at `p` into `crc` and return the updated value.
 *
 * The buffer is consumed in words of SCALE_F bytes (8 when CONFIG_X86_64
 * is defined, 4 otherwise); whatever is left over is handed to
 * btrfs_crc32c_le_hw_byte().
 *
 * NOTE(review): the word count is stored in an unsigned int although `len`
 * is a size_t, and `p` is read through a wider pointer type without any
 * alignment handling -- confirm both are acceptable for all callers.
 */
67static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
68 size_t len)
69{
70 unsigned int iquotient = len / SCALE_F;
71 unsigned int iremainder = len % SCALE_F;
72#ifdef CONFIG_X86_64
73 u64 *ptmp = (u64 *)p;
74#else
75 u32 *ptmp = (u32 *)p;
76#endif
77
 /* whole words first; REX_PRE adds the 0x48 prefix byte on 64 bit only */
78 while (iquotient--) {
79 __asm__ __volatile__(
80 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
81 :"=S"(crc)
82 :"0"(crc), "c"(*ptmp)
83 );
84 ptmp++;
85 }
86
 /* then the trailing bytes that did not fill a whole word */
87 if (iremainder)
88 crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
89 iremainder);
90
91 return crc;
92}
93#endif /* CONFIG_X86 */
94
/*
 * crc32c over `len` bytes at `address`, continuing from `crc`.
 *
 * When CONFIG_BTRFS_HW_SUM is defined and cpu_has_xmm4_2 is true, the
 * hardware-instruction path is used; in every other case the work is
 * done by crc32c_le().
 */
95static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
96 size_t len)
97{
98#ifdef CONFIG_BTRFS_HW_SUM
99 if (cpu_has_xmm4_2)
100 return btrfs_crc32c_le_hw(crc, address, len);
101#endif
102 return crc32c_le(crc, address, len);
103}
104
105#else
106
107#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
108
109#endif /* CONFIG_BTRFS_HW_SUM */
110
111/**
112 * implementation of crc32c_le() changed in linux-2.6.23,
113 * as of v0.13, btrfs-progs is using the latest version.
114 * We must workaround older implementations of crc32c_le()
115 * found on older kernel versions.
116 */
117#define btrfs_crc32c(seed, data, length) \
118 __btrfs_crc32c(seed, (unsigned char const *)data, length)
119#endif
120
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
/*
 * Returns 0 on success with the new copy stored in *cow_ret.  Returns the
 * error encoded in the pointer when no new block could be obtained, or the
 * error from btrfs_inc_ref / btrfs_update_ref on the paths that check it.
 * buf must be locked by the caller (a warning is issued otherwise).
 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
 /*
  * the caller passed buf itself through *cow_ret: remember that, because
  * in that case buf is unlocked at the end of this function
  */
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
 /* obtain the destination block: the reserved one if given, else a new one */
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
 /* start from an exact copy of buf, then rewrite the header fields */
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
 /*
  * reference handling depends on where buf comes from:
  *  - buf has a different generation than this transaction: references
  *    are added via btrfs_inc_ref and cached via btrfs_cache_ref
  *  - buf has this generation and is owned by the reloc tree: see the
  *    comment in that branch
  *  - otherwise: references are updated via btrfs_update_ref and buf is
  *    cleaned
  * NOTE(review): the two "return ret" statements below leave without
  * releasing cow -- confirm whether the new block is leaked there.
  */
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
 /*
  * hook the copy into the tree: either it becomes the new root node
  * (swapped in under node_lock, with an extra reference for root->node),
  * or the parent's pointer and pointer generation are redirected to it.
  * In both cases the extent of the old block is freed, except for a root
  * node that is still the commit root.
  */
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
 /*
  * one more reference on buf is dropped here on every path; in the root
  * case this is in addition to the one dropped above
  */
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
408/*
409 * helper function for defrag to decide if two blocks pointed to by a
410 * node are actually close by
411 */
412static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413{
414 if (blocknr < other && other - (blocknr + blocksize) < 32768)
415 return 1;
416 if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 return 1;
418 return 0;
419}
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
/*
 * Walks the pointers of `parent` from start_slot on and cows every child
 * block that is not close (see close_blocks) to one of its neighbours.
 * Slots whose key sorts before *progress are skipped.  *last_ret seeds the
 * allocation hint and is updated to the start of each new block.
 *
 * With cache_only set, only parents at level 1 are processed and children
 * that are not already cached and up to date are skipped instead of read.
 *
 * Returns 0, or the error of the first __btrfs_cow_block call that failed.
 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
 /* a single child has no neighbours to be moved closer to */
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
 /* keep the parent mapped while its keys and pointers are read */
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
 /* skip everything before the key the caller already got to */
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
 /*
  * compare against the previous slot first and, if that is not close,
  * against the next one.  Slot 0 keeps the initial close = 1.
  * NOTE(review): the "i < end_slot - 2" bound leaves out the
  * second-to-last slot although it has a successor -- confirm intended.
  */
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
 /* drop the mapping before the child is looked up, read and cowed */
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
 /* cur may be NULL at this point */
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
 /*
  * NOTE(review): the results of read_tree_block and btrfs_read_buffer
  * are not checked before cur is locked -- confirm cur cannot be NULL
  * or stale here.
  */
566 btrfs_tree_lock(cur);
 /*
  * cur is passed both as the block to cow and as the result slot, so on
  * success the original is unlocked inside __btrfs_cow_block and cur
  * then refers to the new copy.  The empty_size hint asks for room for
  * up to 16 more blocks, or for the slots that remain if fewer.
  */
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
 /* the next allocation is hinted to follow the block just written */
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
 /* the loop can end with the parent still mapped */
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592 * this returns the address of the start of the last item,
593 * which is the stop of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
603
604/*
605 * extra debugging checks to make sure all the items in a key are
606 * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 return 0;
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 return -1;
819}
820
821/* given a node and slot number, this reads the blocks it points to. The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844 * that a deletion won't leave an node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 if (level < BTRFS_MAX_LEVEL - 1)
871 parent = path->nodes[level + 1];
872 pslot = path->slots[level + 1];
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 if (level < BTRFS_MAX_LEVEL - 1)
1099 parent = path->nodes[level + 1];
1100 pslot = path->slots[level + 1];
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203 * to the block in 'slot', and triggering ra on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
1329
1330/*
1331 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 * blocks owened by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691 * making sure the right key of each node is points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723 * This function isn't completely safe. It's the caller's responsibility
1724 * that the new key won't break the order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 /* trying to split the root, lets make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
2141 */
/*
 * Summary of the code below:
 *
 * 'left' is the path's leaf (path->nodes[0]); 'right' is the leaf referenced
 * by slot path->slots[1] + 1 of the parent (path->nodes[1]).  Items are taken
 * from the tail of 'left' and placed at the front of 'right'.
 *
 * Returns 1 without moving anything when there is no parent, when the path
 * already sits in the parent's last slot, when 'right' has less than
 * data_size bytes free (checked before and after COW), when the COW of
 * 'right' fails, when 'left' is empty, or when no item fits.
 * Returns 0 after a push; the path is switched to 'right' if its leaf slot
 * ended up there.
 *
 * With 'empty' == 0 item 0 of 'left' is never pushed.
 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
 /* no slot to the right of ours in this parent */
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
 /*
  * NOTE(review): the result of read_node_slot() is locked and used
  * without a NULL check - confirm it cannot fail here.
  */
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
 /* a COW failure is reported as "nothing pushed" (1), not as an error */
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
 /* nr is the lowest item index that may be pushed */
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
 /* path slot is past the last item: account for data_size up front */
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
 /*
  * Walk the items of 'left' from last to first, counting how many fit
  * into 'right'.  'i' is unsigned, so the loop leaves through the
  * explicit i == 0 break instead of counting below zero.
  */
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
 /* from here on push_space is the number of data bytes being moved */
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
 /* shift the existing item headers of 'right' up by push_items */
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
 /* recompute every data offset in 'right', counting down from the end */
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
 /* an emptied left leaf is not marked dirty */
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
 /* the first key of 'right' changed: update its key in the parent */
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2334 */
/*
 * Summary of the code below:
 *
 * 'right' is the path's leaf (path->nodes[0]); 'left' is the leaf referenced
 * by slot path->slots[1] - 1 of the parent (path->nodes[1]).  Items are taken
 * from the front of 'right' and appended to 'left'.
 *
 * Returns 1 without moving anything when the path sits in slot 0 of the
 * parent, when there is no parent, when 'right' is empty, when 'left' has
 * less than data_size bytes free (checked before and after COW), when the
 * COW of 'left' fails, or when no item fits.  After a push the return value
 * is whatever fixup_low_keys() returned, and the path is switched to 'left'
 * if its leaf slot ended up there.
 *
 * With 'empty' == 0 the last item of 'right' is never pushed.
 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
 /* no slot to the left of ours in this parent */
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
 /*
  * NOTE(review): the result of read_node_slot() is locked and used
  * without a NULL check - confirm it cannot fail here.
  */
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
 /* nr is one past the highest item index that may be pushed */
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
 /* walk 'right' from its first item, counting how many fit into 'left' */
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
 /* from here on push_space is the number of data bytes being moved */
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
 /*
  * Despite its name this holds the data offset of the last item that was
  * already in 'left'; the copied item headers are rebased against it.
  */
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
 /* compact what is left in 'right': data to the end, headers to the front */
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
 /* recompute every data offset in 'right', counting down from the end */
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
 /* an emptied right leaf is not marked dirty */
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
 /*
  * The first key of 'right' changed: propagate it into the parents.
  * NOTE(review): if 'right' was emptied above, this reads the key of
  * slot 0 from a leaf whose nritems is 0 - confirm this is intended.
  */
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
/*
 * Summary of the code below:
 *
 *   ins_key   - key the caller is about to insert; pushing to the siblings
 *               is skipped when its type is BTRFS_DIR_ITEM_KEY, and it is
 *               used as the parent key whenever an empty new leaf is
 *               handed back
 *   data_size - bytes that must be free in the leaf the path ends up on;
 *               0 also skips the initial push attempt
 *   extend    - when nonzero and the path is at slot 0, the leaf is split
 *               after its first item instead of handing back an empty leaf
 *               in front of it
 *
 * If the path's leaf is the root (no path->nodes[1]) a new root level is
 * inserted first.  A single split may not free enough room; in that case
 * the function loops back to 'again' exactly once (a second repeat trips
 * BUG_ON()).
 */
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
 /* the leaf is the root: add a level so it gets a parent */
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
 /*
  * An allocation failure trips BUG_ON(1); the return after it is only
  * reached if BUG_ON() does not stop execution.
  */
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
 /*
  * Pick the split point.  The default is the middle; it is moved to the
  * path's slot when the half that would receive the new data cannot
  * hold data_size more bytes.
  */
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
 /*
  * The path points past the last item: hand back the new, empty
  * leaf, linked into the parent right after the old one.
  */
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
 /*
  * The path points at slot 0 for a plain insert: hand back the
  * new, empty leaf, linked into the parent in the old leaf's slot.
  */
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
 /* from here on nritems is the number of items moving to 'right' */
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
 /* copy the item headers [mid, end) to the front of 'right' */
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
 /* copy their data to the end of the data area of 'right' */
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
 /* distance each copied item's data offset has to be shifted by */
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
 /* link 'right' into the parent, right after the old leaf */
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
 /*
  * NOTE(review): this assignment overwrites any error that was stored
  * in ret from insert_ptr() just above.
  */
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
 /* switch the path to 'right' if its slot moved there */
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
 /* one more split is needed; it may only happen once */
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be smaller enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 break;
3115 nr = i;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks but it has to be done, if we are inserting at
3205 * the end of the leaf only insert 1 of the items, since we
3206 * have no way of knowing whats on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BHRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* holds checksums of all the data extents */
77#define BTRFS_CSUM_TREE_OBJECTID 7ULL
78
79/* orphan objectid for tracking unlinked/truncated files */
80#define BTRFS_ORPHAN_OBJECTID -5ULL
81
82/* does write ahead logging to speed up fsyncs */
83#define BTRFS_TREE_LOG_OBJECTID -6ULL
84#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
85
86/* for space balancing */
87#define BTRFS_TREE_RELOC_OBJECTID -8ULL
88#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
89
90/*
91 * extent checksums all have this objectid
92 * this allows them to share the logging tree
93 * for fsyncs
94 */
95#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
96
97/* dummy objectid represents multiple objectids */
98#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
99
100/*
101 * All files have objectids in this range.
102 */
103#define BTRFS_FIRST_FREE_OBJECTID 256ULL
104#define BTRFS_LAST_FREE_OBJECTID -256ULL
105#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
106
107
108/*
109 * the device items go into the chunk tree. The key is in the form
110 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
111 */
112#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
113
114/*
115 * we can actually store much bigger names, but lets not confuse the rest
116 * of linux
117 */
118#define BTRFS_NAME_LEN 255
119
120/* 32 bytes in various csum fields */
121#define BTRFS_CSUM_SIZE 32
122
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32	0

/*
 * size in bytes of the checksum for each csum type; the entry for
 * BTRFS_CSUM_TYPE_CRC32 is four bytes for CRC32.  The table is read-only
 * and lives in a header, so it is const to keep every includer from
 * carrying a writable private copy.
 */
static const int btrfs_csum_sizes[] = { 4, 0 };

#define BTRFS_EMPTY_DIR_SIZE 0
130
131#define BTRFS_FT_UNKNOWN 0
132#define BTRFS_FT_REG_FILE 1
133#define BTRFS_FT_DIR 2
134#define BTRFS_FT_CHRDEV 3
135#define BTRFS_FT_BLKDEV 4
136#define BTRFS_FT_FIFO 5
137#define BTRFS_FT_SOCK 6
138#define BTRFS_FT_SYMLINK 7
139#define BTRFS_FT_XATTR 8
140#define BTRFS_FT_MAX 9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresponds to the inode number. The flags
145 * tells us things about the object, and is a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allow for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot, a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
/* number of struct btrfs_key_ptr entries that fit after the header in a
 * block of (r)->nodesize bytes
 */
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
				      sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
/* bytes left in a block of 'bs' bytes once the header is taken out */
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
/*
 * same, for a block of (r)->leafsize bytes.  'r' is parenthesized so that
 * an expression argument such as (root + 1) expands correctly.
 */
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE((r)->leafsize))
/* leaf data size minus one struct btrfs_item and one
 * struct btrfs_file_extent_item
 */
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here on down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
348/*
349 * Compat flags that we support. If any incompat flags are set other than the
350 * ones specified below then we will fail to mount
351 */
352#define BTRFS_FEATURE_COMPAT_SUPP 0x0
353#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
354#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record free space on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
457typedef enum {
458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type;
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
534#define BTRFS_FILE_EXTENT_INLINE 0
535#define BTRFS_FILE_EXTENT_REG 1
536#define BTRFS_FILE_EXTENT_PREALLOC 2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
567 * disk space consumed by the extent, checksum blocks are included
568 * in these numbers
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
573 * the logical offset in file blocks (no csums)
574 * this extent record is for. This allows a file extent to point
575 * into the middle of an existing extent on disk, sharing it
576 * between two snapshots (useful if some bytes in the middle of the
577 * extent have changed
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
592/* different types of block groups (and chunks) */
593#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
594#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
595#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
596#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
597#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
598#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
599#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
754 * two
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
765 * the cow mechanism and make them safe to write. It happens
766 * for the sys_munmap function call path
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
773 /* tree relocation related fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
811 * in ram representation of the tree. extent_root is used for all allocations
812 * and for the extent tree extent_root root.
813 */
814struct btrfs_dirty_root;
815struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877};
878
879/*
880
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
911 * root items point to tree roots. They are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
937 * block groups give us hints into the extent allocation trees. Which
938 * blocks are free etc etc
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
951
952#define BTRFS_MOUNT_NODATASUM (1 << 0)
953#define BTRFS_MOUNT_NODATACOW (1 << 1)
954#define BTRFS_MOUNT_NOBARRIER (1 << 2)
955#define BTRFS_MOUNT_SSD (1 << 3)
956#define BTRFS_MOUNT_DEGRADED (1 << 4)
957#define BTRFS_MOUNT_COMPRESS (1 << 5)
958
959#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977/* some macros to generate set/get funcs for the struct fields. This
978 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
979 * one for u8:
980 */
981#define le8_to_cpu(v) (v)
982#define cpu_to_le8(v) (v)
983#define __le8 u8
984
985#define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
991#define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
997#ifndef BTRFS_SETGET_FUNCS
998#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001#endif
1002
1003#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005{ \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010} \
1011static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013{ \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017}
1018
1019#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020static inline u##bits btrfs_##name(type *s) \
1021{ \
1022 return le##bits##_to_cpu(s->member); \
1023} \
1024static inline void btrfs_set_##name(type *s, u##bits val) \
1025{ \
1026 s->member = cpu_to_le##bits(val); \
1027}
1028
1029BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
1043BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
1064static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065{
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067}
1068
1069static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070{
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072}
1073
/*
 * Accessors for struct btrfs_chunk and struct btrfs_stripe fields, generated
 * by BTRFS_SETGET_FUNCS.  The stripe helpers further down call the generated
 * functions as btrfs_stripe_offset(eb, stripe) and
 * btrfs_set_stripe_offset(eb, stripe, val).
 */
1074BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
1086static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087{
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089}
1090
/*
 * "stack_" accessors for struct btrfs_chunk and struct btrfs_stripe,
 * generated by BTRFS_SETGET_STACK_FUNCS.
 * NOTE(review): presumably for structs held directly in memory rather than
 * inside an extent_buffer -- confirm against the macro definition.
 */
1091BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
1109static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111{
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116}
1117
/* Return the dev_uuid address of stripe number @nr of chunk @c. */
static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
{
	struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

	return btrfs_stripe_dev_uuid(stripe);
}
1122
1123static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125{
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127}
1128
1129static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132{
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134}
1135
1136static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138{
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140}
1141
1142static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145{
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147}
1148
/*
 * Generated accessors for block group items, inode refs and inode items.
 * For the block group item the same fields are exposed twice: once through
 * BTRFS_SETGET_STACK_FUNCS (block_group_*) and once through
 * BTRFS_SETGET_FUNCS (disk_block_group_*).
 */
1149/* struct btrfs_block_group_item */
1150BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164/* struct btrfs_inode_ref */
1165BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168/* struct btrfs_inode_item */
1169BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
1182static inline struct btrfs_timespec *
1183btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184{
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188}
1189
1190static inline struct btrfs_timespec *
1191btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192{
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196}
1197
1198static inline struct btrfs_timespec *
1199btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200{
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204}
1205
1206static inline struct btrfs_timespec *
1207btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208{
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212}
1213
/*
 * Generated accessors for struct btrfs_timespec (sec, nsec) and for the
 * scalar fields of struct btrfs_dev_extent.
 */
1214BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217/* struct btrfs_dev_extent */
1218BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
1226static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227{
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230}
1231
/*
 * Generated accessors for extent refs, extent items and node key pointers.
 * The key_blockptr / key_generation accessors are the ones the
 * btrfs_node_*() helpers below call with an (eb, key_ptr) pair.
 */
1232/* struct btrfs_extent_ref */
1233BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246/* struct btrfs_extent_item */
1247BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251/* struct btrfs_node */
1252BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
1255static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256{
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261}
1262
1263static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265{
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270}
1271
1272static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273{
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278}
1279
1280static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282{
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287}
1288
1289static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290{
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293}
1294
1295void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
1298static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300{
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305}
1306
/*
 * Generated accessors for the offset and size fields of struct btrfs_item;
 * btrfs_item_end() below adds the two together.
 */
1307/* struct btrfs_item */
1308BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
1311static inline unsigned long btrfs_item_nr_offset(int nr)
1312{
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315}
1316
/*
 * Return the byte offset of item header number @nr, cast to a
 * struct btrfs_item pointer.  @eb is not used.
 */
static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
					       int nr)
{
	unsigned long off = btrfs_item_nr_offset(nr);

	return (struct btrfs_item *)off;
}
1322
1323static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325{
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327}
1328
1329static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330{
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332}
1333
1334static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335{
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337}
1338
1339static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340{
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342}
1343
1344static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346{
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349}
1350
1351static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353{
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356}
1357
/*
 * Generated accessors for struct btrfs_dir_log_item, struct btrfs_root_ref
 * and the scalar fields of struct btrfs_dir_item.
 */
1358BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/*
1361 * struct btrfs_root_ref
1362 */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367/* struct btrfs_dir_item */
1368BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
/* Copy the 'location' member of dir item @item in @eb into @key. */
1373static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376{
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378}
1379
/* Write @key into the 'location' member of dir item @item in @eb. */
1380static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383{
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385}
1386
/*
 * Generated accessors for the objectid, offset and type fields of
 * struct btrfs_disk_key (BTRFS_SETGET_STACK_FUNCS variants).
 */
1387/* struct btrfs_disk_key */
1388BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
1393static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395{
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399}
1400
1401static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403{
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407}
1408
1409static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411{
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415}
1416
1417static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419{
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423}
1424
1425static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428{
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432}
1433
1434
1435static inline u8 btrfs_key_type(struct btrfs_key *key)
1436{
1437 return key->type;
1438}
1439
1440static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441{
1442 key->type = val;
1443}
1444
/*
 * Generated accessors for struct btrfs_header fields.  The flag helpers
 * below call the results as btrfs_header_flags(eb) and
 * btrfs_set_header_flags(eb, val), i.e. with no separate struct pointer.
 */
1445/* struct btrfs_header */
1446BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
1454static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455{
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457}
1458
1459static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460{
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464}
1465
1466static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467{
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471}
1472
1473static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474{
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477}
1478
1479static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480{
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483}
1484
1485static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486{
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489}
1490
1491static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492{
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495}
1496
/*
 * The three helpers below ignore @eb and unconditionally return NULL.
 * NOTE(review): presumably the NULL serves as a zero base for the
 * offset-as-pointer convention used by the accessors -- confirm with callers.
 */
1497static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1498{
1499 return NULL;
1500}
1501
1502static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1503{
1504 return NULL;
1505}
1506
1507static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1508{
1509 return NULL;
1510}
1511
/* Return 1 when the header level of @eb is 0, otherwise 0. */
static inline int btrfs_is_leaf(struct extent_buffer *eb)
{
	return !btrfs_header_level(eb);
}
1516
/*
 * Generated accessors for struct btrfs_root_item.  The disk_root_* names come
 * from BTRFS_SETGET_FUNCS and the root_* names from BTRFS_SETGET_STACK_FUNCS.
 * Note that root_used and root_limit map to the bytes_used and byte_limit
 * members.
 */
1517/* struct btrfs_root_item */
1518BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
/*
 * Generated accessors for struct btrfs_super_block; each is called with a
 * struct btrfs_super_block pointer (see btrfs_super_csum_size() below).
 * Accessor names do not always match the member name: super_sys_array_size
 * maps to sys_chunk_array_size and super_root_dir to root_dir_objectid.
 */
1536/* struct btrfs_super_block */
1537
1538BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
1577BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_flags, 64);
/* Generated super block accessors for incompat_flags and csum_type. */
1579BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
1584static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585{
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589}
1590
1591static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592{
1593 return offsetof(struct btrfs_leaf, items);
1594}
1595
/* Generated accessor for the 8-bit 'type' field of a file extent item. */
1596/* struct btrfs_file_extent_item */
1597BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
1599static inline unsigned long
1600btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601{
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605}
1606
1607static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608{
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610}
1611
/*
 * Generated accessors for the remaining struct btrfs_file_extent_item
 * fields; btrfs_file_extent_inline_len() below is built on the ram_bytes one.
 */
1612BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631/* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size
1633 */
1634static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636{
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638}
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
1652
1653static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654{
1655 return sb->s_fs_info;
1656}
1657
1658static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660{
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672}
1673
1674static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675{
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679}
1680
/*
 * btrfs_item_ptr(leaf, slot, type): add btrfs_leaf_data(leaf) to the offset
 * recorded for item @slot and cast the sum to (type *).
 */
1681/* helper function to cast into the data area of the leaf. */
1682#define btrfs_item_ptr(leaf, slot, type) \
1683 ((type *)(btrfs_leaf_data(leaf) + \
1684 btrfs_item_offset_nr(leaf, slot)))
1685
/*
 * btrfs_item_ptr_offset(leaf, slot): same sum as btrfs_item_ptr(), but
 * yielded as an unsigned long instead of a typed pointer.
 */
1686#define btrfs_item_ptr_offset(leaf, slot) \
1687 ((unsigned long)(btrfs_leaf_data(leaf) + \
1688 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-tree.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
2003/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
2004#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
2005#define ClearPageChecked ClearPageFsMisc
2006#define SetPageChecked SetPageFsMisc
2007#define PageChecked PageFsMisc
2008#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98 * FIXME: at some point we should handle xattr's that are larger than
99 * what we can fit in our leaf. We set location to NULL b/c we arent
100 * pointing at anything else, that will change if we store the xattr
101 * data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* MARKER */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return 0;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio; /* presumably the bio whose completion is deferred -- TODO confirm */
53 bio_end_io_t *end_io; /* presumably the bio's original end_io callback -- TODO confirm */
54 void *private; /* presumably the bio's original private pointer -- TODO confirm */
55 struct btrfs_fs_info *info;
56 int error; /* NOTE(review): looks like the IO completion status -- confirm */
57 int metadata; /* NOTE(review): looks like a metadata-vs-data flag -- confirm */
58 struct list_head list;
59 struct btrfs_work work; /* NOTE(review): end_workqueue_fn() above is presumably its handler -- confirm */
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode;
69 struct bio *bio; /* presumably the bio being checksummed and submitted -- TODO confirm */
70 struct list_head list;
71 extent_submit_bio_hook_t *submit_bio_start; /* NOTE(review): presumably run first, to do the checksumming -- confirm */
72 extent_submit_bio_hook_t *submit_bio_done; /* NOTE(review): presumably run afterwards, to send the bio down -- confirm */
73 int rw;
74 int mirror_num;
75 unsigned long bio_flags;
76 struct btrfs_work work; /* work item for the worker threads mentioned above */
77};
78
79/*
80 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state)
353{
354 struct extent_io_tree *tree;
355 u64 found_start;
356 int found_level;
357 unsigned long len;
358 struct extent_buffer *eb;
359 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
360 int ret = 0;
361
362 tree = &BTRFS_I(page->mapping->host)->io_tree;
363 if (page->private == EXTENT_PAGE_PRIVATE)
364 goto out;
365 if (!page->private)
366 goto out;
367
368 len = page->private >> 2;
369 WARN_ON(len == 0);
370
371 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
372
373 found_start = btrfs_header_bytenr(eb);
374 if (found_start != start) {
375 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
376 (unsigned long long)found_start,
377 (unsigned long long)eb->start);
378 ret = -EIO;
379 goto err;
380 }
381 if (eb->first_page != page) {
382 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
383 eb->first_page->index, page->index);
384 WARN_ON(1);
385 ret = -EIO;
386 goto err;
387 }
388 if (check_tree_block_fsid(root, eb)) {
389 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
390 (unsigned long long)eb->start);
391 ret = -EIO;
392 goto err;
393 }
394 found_level = btrfs_header_level(eb);
395
396 ret = csum_tree_block(root, eb, 1);
397 if (ret)
398 ret = -EIO;
399
400 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
401 end = eb->start + end - 1;
402err:
403 free_extent_buffer(eb);
404out:
405 return ret;
406}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
510int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
511 int rw, struct bio *bio, int mirror_num,
512 unsigned long bio_flags,
513 extent_submit_bio_hook_t *submit_bio_start,
514 extent_submit_bio_hook_t *submit_bio_done)
515{
516 struct async_submit_bio *async;
517
518 async = kmalloc(sizeof(*async), GFP_NOFS);
519 if (!async)
520 return -ENOMEM;
521
522 async->inode = inode;
523 async->rw = rw;
524 async->bio = bio;
525 async->mirror_num = mirror_num;
526 async->submit_bio_start = submit_bio_start;
527 async->submit_bio_done = submit_bio_done;
528
529 async->work.func = run_one_async_start;
530 async->work.ordered_func = run_one_async_done;
531 async->work.ordered_free = run_one_async_free;
532
533 async->work.flags = 0;
534 async->bio_flags = bio_flags;
535
536 atomic_inc(&fs_info->nr_async_submits);
537 btrfs_queue_worker(&fs_info->workers, &async->work);
538#if 0
539 int limit = btrfs_async_submit_limit(fs_info);
540 if (atomic_read(&fs_info->nr_async_submits) > limit) {
541 wait_event_timeout(fs_info->async_submit_wait,
542 (atomic_read(&fs_info->nr_async_submits) < limit),
543 HZ/10);
544
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_bios) < limit),
547 HZ/10);
548 }
549#endif
550 while (atomic_read(&fs_info->async_submit_draining) &&
551 atomic_read(&fs_info->nr_async_submits)) {
552 wait_event(fs_info->async_submit_wait,
553 (atomic_read(&fs_info->nr_async_submits) == 0));
554 }
555
556 return 0;
557}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
/*
 * First stage of an async btree submission: checksum every page in the
 * bio.  Only the bio argument is used.  Always returns 0.
 */
static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	/* the mapping to the devices is done by __btree_submit_bio_done */
	btree_csum_one_bio(bio);

	return 0;
}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
#if 0
/*
 * buffer_head based writepage, compiled out.  The live btree_writepage is
 * defined earlier in this file.
 */
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct buffer_head *head;
	struct buffer_head *bh;

	if (!page_has_buffers(page))
		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
				     (1 << BH_Dirty)|(1 << BH_Uptodate));

	/* checksum every dirty buffer on the page */
	head = page_buffers(page);
	bh = head;
	do {
		if (buffer_dirty(bh))
			csum_tree_block(root, bh, 0);
		bh = bh->b_this_page;
	} while (bh != head);

	return block_write_full_page(page, btree_get_block, wbc);
}
#endif
726
727static struct address_space_operations btree_aops = {
728 .readpage = btree_readpage,
729 .writepage = btree_writepage,
730 .writepages = btree_writepages,
731 .releasepage = btree_releasepage,
732 .invalidatepage = btree_invalidatepage,
733 .sync_page = block_sync_page,
734};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
823static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
824 u32 stripesize, struct btrfs_root *root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid)
827{
828 root->node = NULL;
829 root->commit_root = NULL;
830 root->ref_tree = NULL;
831 root->sectorsize = sectorsize;
832 root->nodesize = nodesize;
833 root->leafsize = leafsize;
834 root->stripesize = stripesize;
835 root->ref_cows = 0;
836 root->track_dirty = 0;
837
838 root->fs_info = fs_info;
839 root->objectid = objectid;
840 root->last_trans = 0;
841 root->highest_inode = 0;
842 root->last_inode_alloc = 0;
843 root->name = NULL;
844 root->in_sysfs = 0;
845
846 INIT_LIST_HEAD(&root->dirty_list);
847 INIT_LIST_HEAD(&root->orphan_list);
848 INIT_LIST_HEAD(&root->dead_list);
849 spin_lock_init(&root->node_lock);
850 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex);
853 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS);
855
856 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
857 root->ref_tree = &root->ref_tree_struct;
858
859 memset(&root->root_key, 0, sizeof(root->root_key));
860 memset(&root->root_item, 0, sizeof(root->root_item));
861 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
862 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
863 root->defrag_trans_start = fs_info->generation;
864 init_completion(&root->kobj_unregister);
865 root->defrag_running = 0;
866 root->defrag_level = 0;
867 root->root_key.objectid = objectid;
868 root->anon_super.s_root = NULL;
869 root->anon_super.s_dev = 0;
870 INIT_LIST_HEAD(&root->anon_super.s_list);
871 INIT_LIST_HEAD(&root->anon_super.s_instances);
872 init_rwsem(&root->anon_super.s_umount);
873
874 return 0;
875}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box, and it is only used when page
1162 * is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
1182static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1183{
1184 struct inode *inode;
1185 struct extent_map_tree *em_tree;
1186 struct extent_map *em;
1187 struct address_space *mapping;
1188 u64 offset;
1189
1190 /* the generic O_DIRECT read code does this */
1191 if (1 || !page) {
1192 __unplug_io_fn(bdi, page);
1193 return;
1194 }
1195
1196 /*
1197 * page->mapping may change at any time. Get a consistent copy
1198 * and use that for everything below
1199 */
1200 smp_mb();
1201 mapping = page->mapping;
1202 if (!mapping)
1203 return;
1204
1205 inode = mapping->host;
1206
1207 /*
1208 * don't do the expensive searching for a small number of
1209 * devices
1210 */
1211 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 offset = page_offset(page);
1217
1218 em_tree = &BTRFS_I(inode)->extent_tree;
1219 spin_lock(&em_tree->lock);
1220 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1221 spin_unlock(&em_tree->lock);
1222 if (!em) {
1223 __unplug_io_fn(bdi, page);
1224 return;
1225 }
1226
1227 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 free_extent_map(em);
1229 __unplug_io_fn(bdi, page);
1230 return;
1231 }
1232 offset = offset - em->start;
1233 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1234 em->block_start + offset, page);
1235 free_extent_map(em);
1236}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285 if (ret == 1)
1286 return ret;
1287 return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
1323static int cleaner_kthread(void *arg)
1324{
1325 struct btrfs_root *root = arg;
1326
1327 do {
1328 smp_mb();
1329 if (root->fs_info->closing)
1330 break;
1331
1332 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1333 mutex_lock(&root->fs_info->cleaner_mutex);
1334 btrfs_clean_old_snapshots(root);
1335 mutex_unlock(&root->fs_info->cleaner_mutex);
1336
1337 if (freezing(current)) {
1338 refrigerator();
1339 } else {
1340 smp_mb();
1341 if (root->fs_info->closing)
1342 break;
1343 set_current_state(TASK_INTERRUPTIBLE);
1344 schedule();
1345 __set_current_state(TASK_RUNNING);
1346 }
1347 } while (!kthread_should_stop());
1348 return 0;
1349}
1350
1351static int transaction_kthread(void *arg)
1352{
1353 struct btrfs_root *root = arg;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_transaction *cur;
1356 unsigned long now;
1357 unsigned long delay;
1358 int ret;
1359
1360 do {
1361 smp_mb();
1362 if (root->fs_info->closing)
1363 break;
1364
1365 delay = HZ * 30;
1366 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1367 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1368
1369 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1370 printk(KERN_INFO "btrfs: total reference cache "
1371 "size %llu\n",
1372 root->fs_info->total_ref_cache_size);
1373 }
1374
1375 mutex_lock(&root->fs_info->trans_mutex);
1376 cur = root->fs_info->running_transaction;
1377 if (!cur) {
1378 mutex_unlock(&root->fs_info->trans_mutex);
1379 goto sleep;
1380 }
1381
1382 now = get_seconds();
1383 if (now < cur->start_time || now - cur->start_time < 30) {
1384 mutex_unlock(&root->fs_info->trans_mutex);
1385 delay = HZ * 5;
1386 goto sleep;
1387 }
1388 mutex_unlock(&root->fs_info->trans_mutex);
1389 trans = btrfs_start_transaction(root, 1);
1390 ret = btrfs_commit_transaction(trans, root);
1391sleep:
1392 wake_up_process(root->fs_info->cleaner_kthread);
1393 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1394
1395 if (freezing(current)) {
1396 refrigerator();
1397 } else {
1398 if (root->fs_info->closing)
1399 break;
1400 set_current_state(TASK_INTERRUPTIBLE);
1401 schedule_timeout(delay);
1402 __set_current_state(TASK_RUNNING);
1403 }
1404 } while (!kthread_should_stop());
1405 return 0;
1406}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489 * we set the i_size on the btree inode to the max possible int.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590 * queue work function gets called at interrupt time, and so it
1591 * cannot dynamically grow.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604 * likely that bios will be send down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857 /* note, we dont' set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877 * So, we need to add a special mount option to scan for
1878 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
/*
 * Write the super block copies for the filesystem that @root belongs to.
 *
 * @trans is not used; the work is done by write_all_supers(), whose
 * return value is passed back unchanged.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	return write_all_supers(root, max_mirrors);
}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279 * this code, they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
/*
 * File handle sizes, counted in 32-bit words (the unit of the u32 *fh buffer
 * and of *max_len in btrfs_encode_fh()).  Each size is the offset of the
 * first struct btrfs_fid field that the given handle flavour leaves out.
 */
#define BTRFS_FID_SIZE_NON_CONNECTABLE \
	(offsetof(struct btrfs_fid, parent_objectid) / sizeof(u32))
#define BTRFS_FID_SIZE_CONNECTABLE \
	(offsetof(struct btrfs_fid, parent_root_objectid) / sizeof(u32))
#define BTRFS_FID_SIZE_CONNECTABLE_ROOT \
	(sizeof(struct btrfs_fid) / sizeof(u32))
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
/*
 * Payload of a btrfs file handle.  The structure is packed; the handle
 * size macros in export.c are derived from the offsets of its fields, so
 * a shorter handle is simply a prefix of this layout.
 */
8struct btrfs_fid {
 /* the object itself: inode objectid, objectid of its root, i_generation */
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
 /* parent directory, only filled in for handles that carry a parent */
13 u64 parent_objectid;
14 u32 parent_gen;
15
 /* root of the parent, only filled in when it differs from root_objectid */
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..ec43fa526d77
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5990 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
33#include "compat.h"
34
/*
 * Kinds of deferred extent tree operation.
 * NOTE(review): presumably the values stored in pending_extent_op.type;
 * the code that sets it is not visible here -- confirm.
 */
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
/*
 * One deferred operation on the extent tree, queued on a list and applied
 * in batches by insert_extents(), update_backrefs() and free_extents().
 */
39struct pending_extent_op {
 /* NOTE(review): presumably one of the PENDING_* constants above */
40 int type;
 /* start and length of the extent; also the range unlocked in extent_ins */
41 u64 bytenr;
42 u64 num_bytes;
 /* key offset of the backref to create / to move the backref to */
43 u64 parent;
 /* key offset under which the existing backref is looked up */
44 u64 orig_parent;
 /* generation written into the backref */
45 u64 generation;
 /* generation the existing backref is expected to carry */
46 u64 orig_generation;
 /* stored as, and compared against, the backref's objectid field */
47 int level;
 /* links the op into an insert/update/delete list */
48 struct list_head list;
 /* insert_extents() counts inserted items here; the op is freed at 2 */
49 int del;
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains teh given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references held by the parent node (always 1 for tree blocks)
441 *
442 * Btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is being deleted, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of the parent extent. If an extent is a tree root, the key offset is
491 * set to the key objectid.
492 */
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
 * Apply every pending backref update queued on @update_list for the
 * extent_root.
 *
 * For each op the existing EXTENT_REF item keyed (op->bytenr,
 * op->orig_parent) is located, checked against the extent root's objectid,
 * op->orig_generation and op->level, re-keyed to (op->bytenr, op->parent)
 * and stamped with op->generation.  The op is then unlinked, its range in
 * info->extent_ins is unlocked and the op is freed.
 *
 * The first list entry is used without an emptiness check, so the caller
 * has to pass a non-empty list.  Any lookup failure or mismatch ends in
 * BUG(); the function itself always returns 0.
 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
 /* full tree search for the current op's backref under its old parent */
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
 /* entered with path->slots[0] on the ref item that belongs to op */
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
 /* the item must be the reference we expect; anything else is fatal */
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
 /* move the item to the new parent and record the new generation */
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
 /* step to the next op before the current one is freed */
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
 /* list exhausted: flush the leaf and finish */
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
 /*
  * Try to find the next op's ref further along in the same leaf before
  * paying for a new tree search.  Only bytenr and key type are compared
  * here; the checks at "loop" catch a wrong item.
  */
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
 /* not in this leaf: release it and search from the top again */
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
623
/*
 * Insert the extent items and backrefs for all ops on @insert_list.
 *
 * Two keys are built per op, in list order: an EXTENT_ITEM keyed
 * (bytenr, num_bytes) followed by an EXTENT_REF keyed (bytenr, parent), so
 * 2 * @nr keys in total.  They are passed to btrfs_insert_some_items() in
 * batches; its return value is used as the number of items that went in.
 * Each inserted extent item gets a ref count of 1, each inserted backref is
 * filled in with the extent root's objectid, op->generation, op->level and
 * a ref count of 1.  op->del counts how many of an op's two items exist;
 * at 2 the op's range in info->extent_ins is unlocked and the op is freed.
 *
 * NOTE(review): @nr is presumably the number of ops on @insert_list, and
 * the list presumably arrives in key order -- confirm against the caller.
 *
 * Returns 0 when done (immediately if @nr is 0) or -ENOMEM when the key or
 * size array cannot be allocated; an insertion error trips BUG_ON.
 */
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
 /* lay out the keys: extent item first, then its backref, for every op */
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
 /* when inserting the tail key out of order only one item may go in */
674 if (last && ret > 1)
675 BUG();
676
 /* fill in the ret items that were just created, starting at slots[0] */
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682 * if the first item we inserted was a backref, then
683 * the EXTENT_ITEM will be the odd c's, else it will
684 * be the even c's
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708 * using del to see when its ok to free up the
709 * pending_extent_op. In the case where we insert the
710 * last item on the list in order to help do batching
711 * we need to not free the extent op until we actually
712 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731 * Ok backref's and items usually go right next to eachother,
732 * but if we could only insert 1 item that means that we
733 * inserted on the end of a leaf, and we have no idea what may
734 * be on the next leaf so we just play it safe. In order to
735 * try and help this case we insert the last thing on our
736 * insert list so hopefully it will end up being the last
737 * thing on the leaf and everything else will be before it,
738 * which will let us insert a whole bunch of items at the same
739 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763 * decrement total so we dont overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
 /* normal case: continue after the items that were just inserted */
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
#ifdef BIO_RW_DISCARD
/*
 * Issue a discard for @len bytes at byte offset @start on @bdev.  Both
 * values are shifted right by 9 before being handed to the block layer.
 * Kernels from 2.6.28 on take an additional allocation flags argument.
 */
static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	u64 first_sector = start >> 9;
	u64 nr_sectors = len >> 9;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
	blkdev_issue_discard(bdev, first_sector, nr_sectors, GFP_KERNEL);
#else
	blkdev_issue_discard(bdev, first_sector, nr_sectors);
#endif
}
#endif
879
880static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
881 u64 num_bytes)
882{
883#ifdef BIO_RW_DISCARD
884 int ret;
885 u64 map_length = num_bytes;
886 struct btrfs_multi_bio *multi = NULL;
887
888 /* Tell the block device(s) that the sectors can be discarded */
889 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
890 bytenr, &map_length, &multi, 0);
891 if (!ret) {
892 struct btrfs_bio_stripe *stripe = multi->stripes;
893 int i;
894
895 if (map_length > num_bytes)
896 map_length = num_bytes;
897
898 for (i = 0; i < multi->num_stripes; i++, stripe++) {
899 btrfs_issue_discard(stripe->dev->bdev,
900 stripe->physical,
901 map_length);
902 }
903 kfree(multi);
904 }
905
906 return ret;
907#else
908 return 0;
909#endif
910}
911
/*
 * Process every pending_extent_op queued on @del_list against the extent
 * tree.
 *
 * For each op the matching backref is looked up and the reference count in
 * the extent item is decremented.  When the count reaches zero the bytes are
 * pinned, the super block / root "used" counters are reduced, the extent item
 * and its backref are deleted (together with any directly following
 * single-ref extent+backref pairs that match the next ops on the list), and
 * update_block_group() is called for each extent removed.  Each op that is
 * finished is unlinked from the list, its range in info->extent_ins is
 * unlocked, and the op is freed.
 *
 * Returns -ENOMEM if no path can be allocated, the lookup_extent_backref()
 * result if a backref cannot be found, and 0 otherwise (all other failures
 * hit BUG_ON).
 *
 * NOTE(review): the first op is read from del_list->next without an emptiness
 * check, so this appears to assume a non-empty list on entry - confirm with
 * the callers.
 * NOTE(review): when the backref lookup fails the function jumps to "out"
 * without unlocking or freeing the ops still on @del_list.
 */
912 static noinline int free_extents(struct btrfs_trans_handle *trans,
913 struct btrfs_root *extent_root,
914 struct list_head *del_list)
915 {
916 struct btrfs_fs_info *info = extent_root->fs_info;
917 struct btrfs_path *path;
918 struct btrfs_key key, found_key;
919 struct extent_buffer *leaf;
920 struct list_head *cur;
921 struct pending_extent_op *op;
922 struct btrfs_extent_item *ei;
923 int ret, num_to_del, extent_slot = 0, found_extent = 0;
924 u32 refs;
925 u64 bytes_freed = 0;
926
927 path = btrfs_alloc_path();
928 if (!path)
929 return -ENOMEM;
930 path->reada = 1;
931
932search:
933 /* search for the backref for the current ref we want to delete */
934 cur = del_list->next;
935 op = list_entry(cur, struct pending_extent_op, list);
936 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
937 op->orig_parent,
938 extent_root->root_key.objectid,
939 op->orig_generation, op->level, 1);
940 if (ret) {
941 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
942 "root %llu gen %llu owner %u\n",
943 (unsigned long long)op->bytenr,
944 (unsigned long long)extent_root->root_key.objectid,
945 (unsigned long long)op->orig_generation, op->level);
946 btrfs_print_leaf(extent_root, path->nodes[0]);
947 WARN_ON(1);
948 goto out;
949 }
950
951 extent_slot = path->slots[0];
952 num_to_del = 1;
953 found_extent = 0;
954
955 /*
956 * if we aren't the first item on the leaf we can move back one and see
957 * if our ref is right next to our extent item
958 */
959 if (likely(extent_slot)) {
960 extent_slot--;
961 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
962 extent_slot);
963 if (found_key.objectid == op->bytenr &&
964 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
965 found_key.offset == op->num_bytes) {
966 num_to_del++;
967 found_extent = 1;
968 }
969 }
970
971 /*
972 * if we didn't find the extent we need to delete the backref and then
973 * search for the extent item key so we can update its ref count
974 */
975 if (!found_extent) {
976 key.objectid = op->bytenr;
977 key.type = BTRFS_EXTENT_ITEM_KEY;
978 key.offset = op->num_bytes;
979
980 ret = remove_extent_backref(trans, extent_root, path);
981 BUG_ON(ret);
982 btrfs_release_path(extent_root, path);
983 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
984 BUG_ON(ret);
985 extent_slot = path->slots[0];
986 }
987
988 /* this is where we update the ref count for the extent */
989 leaf = path->nodes[0];
990 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
991 refs = btrfs_extent_refs(leaf, ei);
992 BUG_ON(refs == 0);
993 refs--;
994 btrfs_set_extent_refs(leaf, ei, refs);
995
996 btrfs_mark_buffer_dirty(leaf);
997
998 /*
999 * This extent needs deleting. The reason cur_slot is extent_slot +
1000 * num_to_del is because extent_slot points to the slot where the extent
1001 * is, and if the backref was not right next to the extent we will be
1002 * deleting at least 1 item, and will want to start searching at the
1003 * slot directly next to extent_slot. However if we did find the
1004 * backref next to the extent item then we will be deleting at least 2
1005 * items and will want to start searching directly after the ref slot
1006 */
1007 if (!refs) {
1008 struct list_head *pos, *n, *end;
1009 int cur_slot = extent_slot+num_to_del;
1010 u64 super_used;
1011 u64 root_used;
1012
1013 path->slots[0] = extent_slot;
1014 bytes_freed = op->num_bytes;
1015
1016 mutex_lock(&info->pinned_mutex);
1017 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1018 op->num_bytes, op->level >=
1019 BTRFS_FIRST_FREE_OBJECTID);
1020 mutex_unlock(&info->pinned_mutex);
1021 BUG_ON(ret < 0);
1022 op->del = ret;
1023
1024 /*
1025 * we need to see if we can delete multiple things at once, so
1026 * start looping through the list of extents we are wanting to
1027 * delete and see if their extent/backrefs are right next to
1028 * each other and the extents only have 1 ref
1029 */
1030 for (pos = cur->next; pos != del_list; pos = pos->next) {
1031 struct pending_extent_op *tmp;
1032
1033 tmp = list_entry(pos, struct pending_extent_op, list);
1034
1035 /* we only want to delete extent+ref at this stage */
1036 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1037 break;
1038
1039 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1040 if (found_key.objectid != tmp->bytenr ||
1041 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1042 found_key.offset != tmp->num_bytes)
1043 break;
1044
1045 /* check to make sure this extent only has one ref */
1046 ei = btrfs_item_ptr(leaf, cur_slot,
1047 struct btrfs_extent_item);
1048 if (btrfs_extent_refs(leaf, ei) != 1)
1049 break;
1050
1051 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1052 if (found_key.objectid != tmp->bytenr ||
1053 found_key.type != BTRFS_EXTENT_REF_KEY ||
1054 found_key.offset != tmp->orig_parent)
1055 break;
1056
1057 /*
1058 * the ref is right next to the extent, we can set the
1059 * ref count to 0 since we will delete them both now
1060 */
1061 btrfs_set_extent_refs(leaf, ei, 0);
1062
1063 /* pin down the bytes for this extent */
1064 mutex_lock(&info->pinned_mutex);
1065 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1066 tmp->num_bytes, tmp->level >=
1067 BTRFS_FIRST_FREE_OBJECTID);
1068 mutex_unlock(&info->pinned_mutex);
1069 BUG_ON(ret < 0);
1070
1071 /*
1072 * use the del field to tell if we need to go ahead and
1073 * free up the extent when we delete the item or not.
1074 */
1075 tmp->del = ret;
1076 bytes_freed += tmp->num_bytes;
1077
1078 num_to_del += 2;
1079 cur_slot += 2;
1080 }
/* "end" is the first list entry that was NOT folded into this batch */
1081 end = pos;
1082
1083 /* update the free space counters */
1084 spin_lock(&info->delalloc_lock);
1085 super_used = btrfs_super_bytes_used(&info->super_copy);
1086 btrfs_set_super_bytes_used(&info->super_copy,
1087 super_used - bytes_freed);
1088
1089 root_used = btrfs_root_used(&extent_root->root_item);
1090 btrfs_set_root_used(&extent_root->root_item,
1091 root_used - bytes_freed);
1092 spin_unlock(&info->delalloc_lock);
1093
1094 /* delete the items */
1095 ret = btrfs_del_items(trans, extent_root, path,
1096 path->slots[0], num_to_del);
1097 BUG_ON(ret);
1098
1099 /*
1100 * loop through the extents we deleted and do the cleanup work
1101 * on them
1102 */
1103 for (pos = cur, n = pos->next; pos != end;
1104 pos = n, n = pos->next) {
1105 struct pending_extent_op *tmp;
1106 tmp = list_entry(pos, struct pending_extent_op, list);
1107
1108 /*
1109 * remember tmp->del tells us whether or not we pinned
1110 * down the extent
1111 */
1112 ret = update_block_group(trans, extent_root,
1113 tmp->bytenr, tmp->num_bytes, 0,
1114 tmp->del);
1115 BUG_ON(ret);
1116
1117 list_del_init(&tmp->list);
1118 unlock_extent(&info->extent_ins, tmp->bytenr,
1119 tmp->bytenr + tmp->num_bytes - 1,
1120 GFP_NOFS);
1121 kfree(tmp);
1122 }
1123 } else if (refs && found_extent) {
1124 /*
1125 * the ref and extent were right next to each other, but the
1126 * extent still has a ref, so just free the backref and keep
1127 * going
1128 */
1129 ret = remove_extent_backref(trans, extent_root, path);
1130 BUG_ON(ret);
1131
1132 list_del_init(&op->list);
1133 unlock_extent(&info->extent_ins, op->bytenr,
1134 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1135 kfree(op);
1136 } else {
1137 /*
1138 * the extent has multiple refs and the backref we were looking
1139 * for was not right next to it, so just unlock and go next,
1140 * we're good to go
1141 */
1142 list_del_init(&op->list);
1143 unlock_extent(&info->extent_ins, op->bytenr,
1144 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1145 kfree(op);
1146 }
1147
/* start over from the head of the list until every op has been handled */
1148 btrfs_release_path(extent_root, path);
1149 if (!list_empty(del_list))
1150 goto search;
1151
1152out:
1153 btrfs_free_path(path);
1154 return ret;
1155 }
1156
1157static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1158 struct btrfs_root *root, u64 bytenr,
1159 u64 orig_parent, u64 parent,
1160 u64 orig_root, u64 ref_root,
1161 u64 orig_generation, u64 ref_generation,
1162 u64 owner_objectid)
1163{
1164 int ret;
1165 struct btrfs_root *extent_root = root->fs_info->extent_root;
1166 struct btrfs_path *path;
1167
1168 if (root == root->fs_info->extent_root) {
1169 struct pending_extent_op *extent_op;
1170 u64 num_bytes;
1171
1172 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1173 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1174 mutex_lock(&root->fs_info->extent_ins_mutex);
1175 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1176 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1177 u64 priv;
1178 ret = get_state_private(&root->fs_info->extent_ins,
1179 bytenr, &priv);
1180 BUG_ON(ret);
1181 extent_op = (struct pending_extent_op *)
1182 (unsigned long)priv;
1183 BUG_ON(extent_op->parent != orig_parent);
1184 BUG_ON(extent_op->generation != orig_generation);
1185
1186 extent_op->parent = parent;
1187 extent_op->generation = ref_generation;
1188 } else {
1189 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1190 BUG_ON(!extent_op);
1191
1192 extent_op->type = PENDING_BACKREF_UPDATE;
1193 extent_op->bytenr = bytenr;
1194 extent_op->num_bytes = num_bytes;
1195 extent_op->parent = parent;
1196 extent_op->orig_parent = orig_parent;
1197 extent_op->generation = ref_generation;
1198 extent_op->orig_generation = orig_generation;
1199 extent_op->level = (int)owner_objectid;
1200 INIT_LIST_HEAD(&extent_op->list);
1201 extent_op->del = 0;
1202
1203 set_extent_bits(&root->fs_info->extent_ins,
1204 bytenr, bytenr + num_bytes - 1,
1205 EXTENT_WRITEBACK, GFP_NOFS);
1206 set_state_private(&root->fs_info->extent_ins,
1207 bytenr, (unsigned long)extent_op);
1208 }
1209 mutex_unlock(&root->fs_info->extent_ins_mutex);
1210 return 0;
1211 }
1212
1213 path = btrfs_alloc_path();
1214 if (!path)
1215 return -ENOMEM;
1216 ret = lookup_extent_backref(trans, extent_root, path,
1217 bytenr, orig_parent, orig_root,
1218 orig_generation, owner_objectid, 1);
1219 if (ret)
1220 goto out;
1221 ret = remove_extent_backref(trans, extent_root, path);
1222 if (ret)
1223 goto out;
1224 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1225 parent, ref_root, ref_generation,
1226 owner_objectid);
1227 BUG_ON(ret);
1228 finish_current_insert(trans, extent_root, 0);
1229 del_pending_extents(trans, extent_root, 0);
1230out:
1231 btrfs_free_path(path);
1232 return ret;
1233}
1234
1235int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1236 struct btrfs_root *root, u64 bytenr,
1237 u64 orig_parent, u64 parent,
1238 u64 ref_root, u64 ref_generation,
1239 u64 owner_objectid)
1240{
1241 int ret;
1242 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1243 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1244 return 0;
1245 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1246 parent, ref_root, ref_root,
1247 ref_generation, ref_generation,
1248 owner_objectid);
1249 return ret;
1250}
1251
1252static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1253 struct btrfs_root *root, u64 bytenr,
1254 u64 orig_parent, u64 parent,
1255 u64 orig_root, u64 ref_root,
1256 u64 orig_generation, u64 ref_generation,
1257 u64 owner_objectid)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 struct btrfs_key key;
1262 struct extent_buffer *l;
1263 struct btrfs_extent_item *item;
1264 u32 refs;
1265
1266 path = btrfs_alloc_path();
1267 if (!path)
1268 return -ENOMEM;
1269
1270 path->reada = 1;
1271 key.objectid = bytenr;
1272 key.type = BTRFS_EXTENT_ITEM_KEY;
1273 key.offset = (u64)-1;
1274
1275 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1276 0, 1);
1277 if (ret < 0)
1278 return ret;
1279 BUG_ON(ret == 0 || path->slots[0] == 0);
1280
1281 path->slots[0]--;
1282 l = path->nodes[0];
1283
1284 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1285 if (key.objectid != bytenr) {
1286 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1287 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1288 (unsigned long long)bytenr,
1289 (unsigned long long)key.objectid);
1290 BUG();
1291 }
1292 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1293
1294 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1295 refs = btrfs_extent_refs(l, item);
1296 btrfs_set_extent_refs(l, item, refs + 1);
1297 btrfs_mark_buffer_dirty(path->nodes[0]);
1298
1299 btrfs_release_path(root->fs_info->extent_root, path);
1300
1301 path->reada = 1;
1302 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1303 path, bytenr, parent,
1304 ref_root, ref_generation,
1305 owner_objectid);
1306 BUG_ON(ret);
1307 finish_current_insert(trans, root->fs_info->extent_root, 0);
1308 del_pending_extents(trans, root->fs_info->extent_root, 0);
1309
1310 btrfs_free_path(path);
1311 return 0;
1312}
1313
1314int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1315 struct btrfs_root *root,
1316 u64 bytenr, u64 num_bytes, u64 parent,
1317 u64 ref_root, u64 ref_generation,
1318 u64 owner_objectid)
1319{
1320 int ret;
1321 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1322 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1323 return 0;
1324 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1325 0, ref_root, 0, ref_generation,
1326 owner_objectid);
1327 return ret;
1328}
1329
1330int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1331 struct btrfs_root *root)
1332{
1333 finish_current_insert(trans, root->fs_info->extent_root, 1);
1334 del_pending_extents(trans, root->fs_info->extent_root, 1);
1335 return 0;
1336}
1337
1338int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1339 struct btrfs_root *root, u64 bytenr,
1340 u64 num_bytes, u32 *refs)
1341{
1342 struct btrfs_path *path;
1343 int ret;
1344 struct btrfs_key key;
1345 struct extent_buffer *l;
1346 struct btrfs_extent_item *item;
1347
1348 WARN_ON(num_bytes < root->sectorsize);
1349 path = btrfs_alloc_path();
1350 path->reada = 1;
1351 key.objectid = bytenr;
1352 key.offset = num_bytes;
1353 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1354 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1355 0, 0);
1356 if (ret < 0)
1357 goto out;
1358 if (ret != 0) {
1359 btrfs_print_leaf(root, path->nodes[0]);
1360 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1361 (unsigned long long)bytenr);
1362 BUG();
1363 }
1364 l = path->nodes[0];
1365 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1366 *refs = btrfs_extent_refs(l, item);
1367out:
1368 btrfs_free_path(path);
1369 return 0;
1370}
1371
1372int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1373 struct btrfs_root *root, u64 objectid, u64 bytenr)
1374{
1375 struct btrfs_root *extent_root = root->fs_info->extent_root;
1376 struct btrfs_path *path;
1377 struct extent_buffer *leaf;
1378 struct btrfs_extent_ref *ref_item;
1379 struct btrfs_key key;
1380 struct btrfs_key found_key;
1381 u64 ref_root;
1382 u64 last_snapshot;
1383 u32 nritems;
1384 int ret;
1385
1386 key.objectid = bytenr;
1387 key.offset = (u64)-1;
1388 key.type = BTRFS_EXTENT_ITEM_KEY;
1389
1390 path = btrfs_alloc_path();
1391 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1392 if (ret < 0)
1393 goto out;
1394 BUG_ON(ret == 0);
1395
1396 ret = -ENOENT;
1397 if (path->slots[0] == 0)
1398 goto out;
1399
1400 path->slots[0]--;
1401 leaf = path->nodes[0];
1402 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1403
1404 if (found_key.objectid != bytenr ||
1405 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1406 goto out;
1407
1408 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1409 while (1) {
1410 leaf = path->nodes[0];
1411 nritems = btrfs_header_nritems(leaf);
1412 if (path->slots[0] >= nritems) {
1413 ret = btrfs_next_leaf(extent_root, path);
1414 if (ret < 0)
1415 goto out;
1416 if (ret == 0)
1417 continue;
1418 break;
1419 }
1420 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1421 if (found_key.objectid != bytenr)
1422 break;
1423
1424 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1425 path->slots[0]++;
1426 continue;
1427 }
1428
1429 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1430 struct btrfs_extent_ref);
1431 ref_root = btrfs_ref_root(leaf, ref_item);
1432 if ((ref_root != root->root_key.objectid &&
1433 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1434 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1435 ret = 1;
1436 goto out;
1437 }
1438 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1439 ret = 1;
1440 goto out;
1441 }
1442
1443 path->slots[0]++;
1444 }
1445 ret = 0;
1446out:
1447 btrfs_free_path(path);
1448 return ret;
1449}
1450
1451int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1452 struct extent_buffer *buf, u32 nr_extents)
1453{
1454 struct btrfs_key key;
1455 struct btrfs_file_extent_item *fi;
1456 u64 root_gen;
1457 u32 nritems;
1458 int i;
1459 int level;
1460 int ret = 0;
1461 int shared = 0;
1462
1463 if (!root->ref_cows)
1464 return 0;
1465
1466 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1467 shared = 0;
1468 root_gen = root->root_key.offset;
1469 } else {
1470 shared = 1;
1471 root_gen = trans->transid - 1;
1472 }
1473
1474 level = btrfs_header_level(buf);
1475 nritems = btrfs_header_nritems(buf);
1476
1477 if (level == 0) {
1478 struct btrfs_leaf_ref *ref;
1479 struct btrfs_extent_info *info;
1480
1481 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1482 if (!ref) {
1483 ret = -ENOMEM;
1484 goto out;
1485 }
1486
1487 ref->root_gen = root_gen;
1488 ref->bytenr = buf->start;
1489 ref->owner = btrfs_header_owner(buf);
1490 ref->generation = btrfs_header_generation(buf);
1491 ref->nritems = nr_extents;
1492 info = ref->extents;
1493
1494 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1495 u64 disk_bytenr;
1496 btrfs_item_key_to_cpu(buf, &key, i);
1497 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1498 continue;
1499 fi = btrfs_item_ptr(buf, i,
1500 struct btrfs_file_extent_item);
1501 if (btrfs_file_extent_type(buf, fi) ==
1502 BTRFS_FILE_EXTENT_INLINE)
1503 continue;
1504 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1505 if (disk_bytenr == 0)
1506 continue;
1507
1508 info->bytenr = disk_bytenr;
1509 info->num_bytes =
1510 btrfs_file_extent_disk_num_bytes(buf, fi);
1511 info->objectid = key.objectid;
1512 info->offset = key.offset;
1513 info++;
1514 }
1515
1516 ret = btrfs_add_leaf_ref(root, ref, shared);
1517 if (ret == -EEXIST && shared) {
1518 struct btrfs_leaf_ref *old;
1519 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1520 BUG_ON(!old);
1521 btrfs_remove_leaf_ref(root, old);
1522 btrfs_free_leaf_ref(root, old);
1523 ret = btrfs_add_leaf_ref(root, ref, shared);
1524 }
1525 WARN_ON(ret);
1526 btrfs_free_leaf_ref(root, ref);
1527 }
1528out:
1529 return ret;
1530}
1531
1532int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1533 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1534 u32 *nr_extents)
1535{
1536 u64 bytenr;
1537 u64 ref_root;
1538 u64 orig_root;
1539 u64 ref_generation;
1540 u64 orig_generation;
1541 u32 nritems;
1542 u32 nr_file_extents = 0;
1543 struct btrfs_key key;
1544 struct btrfs_file_extent_item *fi;
1545 int i;
1546 int level;
1547 int ret = 0;
1548 int faili = 0;
1549 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1550 u64, u64, u64, u64, u64, u64, u64, u64);
1551
1552 ref_root = btrfs_header_owner(buf);
1553 ref_generation = btrfs_header_generation(buf);
1554 orig_root = btrfs_header_owner(orig_buf);
1555 orig_generation = btrfs_header_generation(orig_buf);
1556
1557 nritems = btrfs_header_nritems(buf);
1558 level = btrfs_header_level(buf);
1559
1560 if (root->ref_cows) {
1561 process_func = __btrfs_inc_extent_ref;
1562 } else {
1563 if (level == 0 &&
1564 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1565 goto out;
1566 if (level != 0 &&
1567 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1568 goto out;
1569 process_func = __btrfs_update_extent_ref;
1570 }
1571
1572 for (i = 0; i < nritems; i++) {
1573 cond_resched();
1574 if (level == 0) {
1575 btrfs_item_key_to_cpu(buf, &key, i);
1576 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1577 continue;
1578 fi = btrfs_item_ptr(buf, i,
1579 struct btrfs_file_extent_item);
1580 if (btrfs_file_extent_type(buf, fi) ==
1581 BTRFS_FILE_EXTENT_INLINE)
1582 continue;
1583 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1584 if (bytenr == 0)
1585 continue;
1586
1587 nr_file_extents++;
1588
1589 ret = process_func(trans, root, bytenr,
1590 orig_buf->start, buf->start,
1591 orig_root, ref_root,
1592 orig_generation, ref_generation,
1593 key.objectid);
1594
1595 if (ret) {
1596 faili = i;
1597 WARN_ON(1);
1598 goto fail;
1599 }
1600 } else {
1601 bytenr = btrfs_node_blockptr(buf, i);
1602 ret = process_func(trans, root, bytenr,
1603 orig_buf->start, buf->start,
1604 orig_root, ref_root,
1605 orig_generation, ref_generation,
1606 level - 1);
1607 if (ret) {
1608 faili = i;
1609 WARN_ON(1);
1610 goto fail;
1611 }
1612 }
1613 }
1614out:
1615 if (nr_extents) {
1616 if (level == 0)
1617 *nr_extents = nr_file_extents;
1618 else
1619 *nr_extents = nritems;
1620 }
1621 return 0;
1622fail:
1623 WARN_ON(1);
1624 return ret;
1625}
1626
1627int btrfs_update_ref(struct btrfs_trans_handle *trans,
1628 struct btrfs_root *root, struct extent_buffer *orig_buf,
1629 struct extent_buffer *buf, int start_slot, int nr)
1630
1631{
1632 u64 bytenr;
1633 u64 ref_root;
1634 u64 orig_root;
1635 u64 ref_generation;
1636 u64 orig_generation;
1637 struct btrfs_key key;
1638 struct btrfs_file_extent_item *fi;
1639 int i;
1640 int ret;
1641 int slot;
1642 int level;
1643
1644 BUG_ON(start_slot < 0);
1645 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1646
1647 ref_root = btrfs_header_owner(buf);
1648 ref_generation = btrfs_header_generation(buf);
1649 orig_root = btrfs_header_owner(orig_buf);
1650 orig_generation = btrfs_header_generation(orig_buf);
1651 level = btrfs_header_level(buf);
1652
1653 if (!root->ref_cows) {
1654 if (level == 0 &&
1655 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1656 return 0;
1657 if (level != 0 &&
1658 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1659 return 0;
1660 }
1661
1662 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1663 cond_resched();
1664 if (level == 0) {
1665 btrfs_item_key_to_cpu(buf, &key, slot);
1666 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1667 continue;
1668 fi = btrfs_item_ptr(buf, slot,
1669 struct btrfs_file_extent_item);
1670 if (btrfs_file_extent_type(buf, fi) ==
1671 BTRFS_FILE_EXTENT_INLINE)
1672 continue;
1673 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1674 if (bytenr == 0)
1675 continue;
1676 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1677 orig_buf->start, buf->start,
1678 orig_root, ref_root,
1679 orig_generation, ref_generation,
1680 key.objectid);
1681 if (ret)
1682 goto fail;
1683 } else {
1684 bytenr = btrfs_node_blockptr(buf, slot);
1685 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1686 orig_buf->start, buf->start,
1687 orig_root, ref_root,
1688 orig_generation, ref_generation,
1689 level - 1);
1690 if (ret)
1691 goto fail;
1692 }
1693 }
1694 return 0;
1695fail:
1696 WARN_ON(1);
1697 return -1;
1698}
1699
1700static int write_one_cache_group(struct btrfs_trans_handle *trans,
1701 struct btrfs_root *root,
1702 struct btrfs_path *path,
1703 struct btrfs_block_group_cache *cache)
1704{
1705 int ret;
1706 int pending_ret;
1707 struct btrfs_root *extent_root = root->fs_info->extent_root;
1708 unsigned long bi;
1709 struct extent_buffer *leaf;
1710
1711 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1712 if (ret < 0)
1713 goto fail;
1714 BUG_ON(ret);
1715
1716 leaf = path->nodes[0];
1717 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1718 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1719 btrfs_mark_buffer_dirty(leaf);
1720 btrfs_release_path(extent_root, path);
1721fail:
1722 finish_current_insert(trans, extent_root, 0);
1723 pending_ret = del_pending_extents(trans, extent_root, 0);
1724 if (ret)
1725 return ret;
1726 if (pending_ret)
1727 return pending_ret;
1728 return 0;
1729
1730}
1731
1732int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1733 struct btrfs_root *root)
1734{
1735 struct btrfs_block_group_cache *cache, *entry;
1736 struct rb_node *n;
1737 int err = 0;
1738 int werr = 0;
1739 struct btrfs_path *path;
1740 u64 last = 0;
1741
1742 path = btrfs_alloc_path();
1743 if (!path)
1744 return -ENOMEM;
1745
1746 while (1) {
1747 cache = NULL;
1748 spin_lock(&root->fs_info->block_group_cache_lock);
1749 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1750 n; n = rb_next(n)) {
1751 entry = rb_entry(n, struct btrfs_block_group_cache,
1752 cache_node);
1753 if (entry->dirty) {
1754 cache = entry;
1755 break;
1756 }
1757 }
1758 spin_unlock(&root->fs_info->block_group_cache_lock);
1759
1760 if (!cache)
1761 break;
1762
1763 cache->dirty = 0;
1764 last += cache->key.offset;
1765
1766 err = write_one_cache_group(trans, root,
1767 path, cache);
1768 /*
1769 * if we fail to write the cache group, we want
1770 * to keep it marked dirty in hopes that a later
1771 * write will work
1772 */
1773 if (err) {
1774 werr = err;
1775 continue;
1776 }
1777 }
1778 btrfs_free_path(path);
1779 return werr;
1780}
1781
1782int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1783{
1784 struct btrfs_block_group_cache *block_group;
1785 int readonly = 0;
1786
1787 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1788 if (!block_group || block_group->ro)
1789 readonly = 1;
1790 if (block_group)
1791 put_block_group(block_group);
1792 return readonly;
1793}
1794
1795static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1796 u64 total_bytes, u64 bytes_used,
1797 struct btrfs_space_info **space_info)
1798{
1799 struct btrfs_space_info *found;
1800
1801 found = __find_space_info(info, flags);
1802 if (found) {
1803 spin_lock(&found->lock);
1804 found->total_bytes += total_bytes;
1805 found->bytes_used += bytes_used;
1806 found->full = 0;
1807 spin_unlock(&found->lock);
1808 *space_info = found;
1809 return 0;
1810 }
1811 found = kzalloc(sizeof(*found), GFP_NOFS);
1812 if (!found)
1813 return -ENOMEM;
1814
1815 list_add(&found->list, &info->space_info);
1816 INIT_LIST_HEAD(&found->block_groups);
1817 init_rwsem(&found->groups_sem);
1818 spin_lock_init(&found->lock);
1819 found->flags = flags;
1820 found->total_bytes = total_bytes;
1821 found->bytes_used = bytes_used;
1822 found->bytes_pinned = 0;
1823 found->bytes_reserved = 0;
1824 found->bytes_readonly = 0;
1825 found->full = 0;
1826 found->force_alloc = 0;
1827 *space_info = found;
1828 return 0;
1829}
1830
1831static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1832{
1833 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1834 BTRFS_BLOCK_GROUP_RAID1 |
1835 BTRFS_BLOCK_GROUP_RAID10 |
1836 BTRFS_BLOCK_GROUP_DUP);
1837 if (extra_flags) {
1838 if (flags & BTRFS_BLOCK_GROUP_DATA)
1839 fs_info->avail_data_alloc_bits |= extra_flags;
1840 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1841 fs_info->avail_metadata_alloc_bits |= extra_flags;
1842 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1843 fs_info->avail_system_alloc_bits |= extra_flags;
1844 }
1845}
1846
1847static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1848{
1849 spin_lock(&cache->space_info->lock);
1850 spin_lock(&cache->lock);
1851 if (!cache->ro) {
1852 cache->space_info->bytes_readonly += cache->key.offset -
1853 btrfs_block_group_used(&cache->item);
1854 cache->ro = 1;
1855 }
1856 spin_unlock(&cache->lock);
1857 spin_unlock(&cache->space_info->lock);
1858}
1859
1860u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1861{
1862 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1863
1864 if (num_devices == 1)
1865 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1866 if (num_devices < 4)
1867 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1868
1869 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1870 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1871 BTRFS_BLOCK_GROUP_RAID10))) {
1872 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1873 }
1874
1875 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1876 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1877 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1878 }
1879
1880 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1881 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1882 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1883 (flags & BTRFS_BLOCK_GROUP_DUP)))
1884 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1885 return flags;
1886}
1887
1888static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1889 struct btrfs_root *extent_root, u64 alloc_bytes,
1890 u64 flags, int force)
1891{
1892 struct btrfs_space_info *space_info;
1893 u64 thresh;
1894 int ret = 0;
1895
1896 mutex_lock(&extent_root->fs_info->chunk_mutex);
1897
1898 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1899
1900 space_info = __find_space_info(extent_root->fs_info, flags);
1901 if (!space_info) {
1902 ret = update_space_info(extent_root->fs_info, flags,
1903 0, 0, &space_info);
1904 BUG_ON(ret);
1905 }
1906 BUG_ON(!space_info);
1907
1908 spin_lock(&space_info->lock);
1909 if (space_info->force_alloc) {
1910 force = 1;
1911 space_info->force_alloc = 0;
1912 }
1913 if (space_info->full) {
1914 spin_unlock(&space_info->lock);
1915 goto out;
1916 }
1917
1918 thresh = space_info->total_bytes - space_info->bytes_readonly;
1919 thresh = div_factor(thresh, 6);
1920 if (!force &&
1921 (space_info->bytes_used + space_info->bytes_pinned +
1922 space_info->bytes_reserved + alloc_bytes) < thresh) {
1923 spin_unlock(&space_info->lock);
1924 goto out;
1925 }
1926 spin_unlock(&space_info->lock);
1927
1928 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1929 if (ret)
1930 space_info->full = 1;
1931out:
1932 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1933 return ret;
1934}
1935
1936static int update_block_group(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 u64 bytenr, u64 num_bytes, int alloc,
1939 int mark_free)
1940{
1941 struct btrfs_block_group_cache *cache;
1942 struct btrfs_fs_info *info = root->fs_info;
1943 u64 total = num_bytes;
1944 u64 old_val;
1945 u64 byte_in_group;
1946
1947 while (total) {
1948 cache = btrfs_lookup_block_group(info, bytenr);
1949 if (!cache)
1950 return -1;
1951 byte_in_group = bytenr - cache->key.objectid;
1952 WARN_ON(byte_in_group > cache->key.offset);
1953
1954 spin_lock(&cache->space_info->lock);
1955 spin_lock(&cache->lock);
1956 cache->dirty = 1;
1957 old_val = btrfs_block_group_used(&cache->item);
1958 num_bytes = min(total, cache->key.offset - byte_in_group);
1959 if (alloc) {
1960 old_val += num_bytes;
1961 cache->space_info->bytes_used += num_bytes;
1962 if (cache->ro)
1963 cache->space_info->bytes_readonly -= num_bytes;
1964 btrfs_set_block_group_used(&cache->item, old_val);
1965 spin_unlock(&cache->lock);
1966 spin_unlock(&cache->space_info->lock);
1967 } else {
1968 old_val -= num_bytes;
1969 cache->space_info->bytes_used -= num_bytes;
1970 if (cache->ro)
1971 cache->space_info->bytes_readonly += num_bytes;
1972 btrfs_set_block_group_used(&cache->item, old_val);
1973 spin_unlock(&cache->lock);
1974 spin_unlock(&cache->space_info->lock);
1975 if (mark_free) {
1976 int ret;
1977
1978 ret = btrfs_discard_extent(root, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981
1982 ret = btrfs_add_free_space(cache, bytenr,
1983 num_bytes);
1984 WARN_ON(ret);
1985 }
1986 }
1987 put_block_group(cache);
1988 total -= num_bytes;
1989 bytenr += num_bytes;
1990 }
1991 return 0;
1992}
1993
1994static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1995{
1996 struct btrfs_block_group_cache *cache;
1997 u64 bytenr;
1998
1999 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2000 if (!cache)
2001 return 0;
2002
2003 bytenr = cache->key.objectid;
2004 put_block_group(cache);
2005
2006 return bytenr;
2007}
2008
2009int btrfs_update_pinned_extents(struct btrfs_root *root,
2010 u64 bytenr, u64 num, int pin)
2011{
2012 u64 len;
2013 struct btrfs_block_group_cache *cache;
2014 struct btrfs_fs_info *fs_info = root->fs_info;
2015
2016 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2017 if (pin) {
2018 set_extent_dirty(&fs_info->pinned_extents,
2019 bytenr, bytenr + num - 1, GFP_NOFS);
2020 } else {
2021 clear_extent_dirty(&fs_info->pinned_extents,
2022 bytenr, bytenr + num - 1, GFP_NOFS);
2023 }
2024 while (num > 0) {
2025 cache = btrfs_lookup_block_group(fs_info, bytenr);
2026 BUG_ON(!cache);
2027 len = min(num, cache->key.offset -
2028 (bytenr - cache->key.objectid));
2029 if (pin) {
2030 spin_lock(&cache->space_info->lock);
2031 spin_lock(&cache->lock);
2032 cache->pinned += len;
2033 cache->space_info->bytes_pinned += len;
2034 spin_unlock(&cache->lock);
2035 spin_unlock(&cache->space_info->lock);
2036 fs_info->total_pinned += len;
2037 } else {
2038 spin_lock(&cache->space_info->lock);
2039 spin_lock(&cache->lock);
2040 cache->pinned -= len;
2041 cache->space_info->bytes_pinned -= len;
2042 spin_unlock(&cache->lock);
2043 spin_unlock(&cache->space_info->lock);
2044 fs_info->total_pinned -= len;
2045 if (cache->cached)
2046 btrfs_add_free_space(cache, bytenr, len);
2047 }
2048 put_block_group(cache);
2049 bytenr += len;
2050 num -= len;
2051 }
2052 return 0;
2053}
2054
2055static int update_reserved_extents(struct btrfs_root *root,
2056 u64 bytenr, u64 num, int reserve)
2057{
2058 u64 len;
2059 struct btrfs_block_group_cache *cache;
2060 struct btrfs_fs_info *fs_info = root->fs_info;
2061
2062 while (num > 0) {
2063 cache = btrfs_lookup_block_group(fs_info, bytenr);
2064 BUG_ON(!cache);
2065 len = min(num, cache->key.offset -
2066 (bytenr - cache->key.objectid));
2067
2068 spin_lock(&cache->space_info->lock);
2069 spin_lock(&cache->lock);
2070 if (reserve) {
2071 cache->reserved += len;
2072 cache->space_info->bytes_reserved += len;
2073 } else {
2074 cache->reserved -= len;
2075 cache->space_info->bytes_reserved -= len;
2076 }
2077 spin_unlock(&cache->lock);
2078 spin_unlock(&cache->space_info->lock);
2079 put_block_group(cache);
2080 bytenr += len;
2081 num -= len;
2082 }
2083 return 0;
2084}
2085
2086int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2087{
2088 u64 last = 0;
2089 u64 start;
2090 u64 end;
2091 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2092 int ret;
2093
2094 mutex_lock(&root->fs_info->pinned_mutex);
2095 while (1) {
2096 ret = find_first_extent_bit(pinned_extents, last,
2097 &start, &end, EXTENT_DIRTY);
2098 if (ret)
2099 break;
2100 set_extent_dirty(copy, start, end, GFP_NOFS);
2101 last = end + 1;
2102 }
2103 mutex_unlock(&root->fs_info->pinned_mutex);
2104 return 0;
2105}
2106
2107int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2108 struct btrfs_root *root,
2109 struct extent_io_tree *unpin)
2110{
2111 u64 start;
2112 u64 end;
2113 int ret;
2114
2115 mutex_lock(&root->fs_info->pinned_mutex);
2116 while (1) {
2117 ret = find_first_extent_bit(unpin, 0, &start, &end,
2118 EXTENT_DIRTY);
2119 if (ret)
2120 break;
2121
2122 ret = btrfs_discard_extent(root, start, end + 1 - start);
2123
2124 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2125 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2126
2127 if (need_resched()) {
2128 mutex_unlock(&root->fs_info->pinned_mutex);
2129 cond_resched();
2130 mutex_lock(&root->fs_info->pinned_mutex);
2131 }
2132 }
2133 mutex_unlock(&root->fs_info->pinned_mutex);
2134 return ret;
2135}
2136
/*
 * Process the pending operations recorded in fs_info->extent_ins (ranges
 * carrying EXTENT_WRITEBACK whose private value points to a
 * struct pending_extent_op).
 *
 * Each range is try-locked; ranges that cannot be locked are skipped.
 * Locked ops are sorted by type into an insert list (at most max_inserts
 * per pass) and an update list, which are then handed to insert_extents()
 * and update_backrefs().  Ops already flagged ->del are dropped instead of
 * being applied.
 *
 * When 'all' is non-zero the scan is restarted from offset 0 as long as
 * some range had to be skipped.  Returns 0.
 *
 * NOTE(review): the result of btrfs_alloc_path() is not checked for NULL
 * before 'path' is passed to update_backrefs()/insert_extents().
 */
2137static int finish_current_insert(struct btrfs_trans_handle *trans,
2138 struct btrfs_root *extent_root, int all)
2139{
2140 u64 start;
2141 u64 end;
2142 u64 priv;
2143 u64 search = 0;
2144 u64 skipped = 0;
2145 struct btrfs_fs_info *info = extent_root->fs_info;
2146 struct btrfs_path *path;
2147 struct pending_extent_op *extent_op, *tmp;
2148 struct list_head insert_list, update_list;
2149 int ret;
2150 int num_inserts = 0, max_inserts;
2151
2152 path = btrfs_alloc_path();
2153 INIT_LIST_HEAD(&insert_list);
2154 INIT_LIST_HEAD(&update_list);
2155
/* cap on inserts per pass: leafsize divided by the per-insert size below */
2156 max_inserts = extent_root->leafsize /
2157 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2158 sizeof(struct btrfs_extent_ref) +
2159 sizeof(struct btrfs_extent_item));
2160again:
2161 mutex_lock(&info->extent_ins_mutex);
2162 while (1) {
2163 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2164 &end, EXTENT_WRITEBACK);
2165 if (ret) {
/* nothing more found: rescan from 0 if ranges were skipped and nothing is queued */
2166 if (skipped && all && !num_inserts) {
2167 skipped = 0;
2168 search = 0;
2169 continue;
2170 }
2171 mutex_unlock(&info->extent_ins_mutex);
2172 break;
2173 }
2174
2175 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2176 if (!ret) {
/* could not lock this range: remember that and move past it */
2177 skipped = 1;
2178 search = end + 1;
2179 if (need_resched()) {
2180 mutex_unlock(&info->extent_ins_mutex);
2181 cond_resched();
2182 mutex_lock(&info->extent_ins_mutex);
2183 }
2184 continue;
2185 }
2186
2187 ret = get_state_private(&info->extent_ins, start, &priv);
2188 BUG_ON(ret);
2189 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2190
2191 if (extent_op->type == PENDING_EXTENT_INSERT) {
2192 num_inserts++;
2193 list_add_tail(&extent_op->list, &insert_list);
2194 search = end + 1;
2195 if (num_inserts == max_inserts) {
2196 mutex_unlock(&info->extent_ins_mutex);
2197 break;
2198 }
2199 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2200 list_add_tail(&extent_op->list, &update_list);
2201 search = end + 1;
2202 } else {
2203 BUG();
2204 }
2205 }
2206
2207 /*
2208 * process the update list, clear the writeback bit for it, and if
2209 * somebody marked this thing for deletion then just unlock it and be
2210 * done, the free_extents will handle it
2211 */
2212 mutex_lock(&info->extent_ins_mutex);
2213 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2214 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2215 extent_op->bytenr + extent_op->num_bytes - 1,
2216 EXTENT_WRITEBACK, GFP_NOFS);
2217 if (extent_op->del) {
2218 list_del_init(&extent_op->list);
2219 unlock_extent(&info->extent_ins, extent_op->bytenr,
2220 extent_op->bytenr + extent_op->num_bytes
2221 - 1, GFP_NOFS);
2222 kfree(extent_op);
2223 }
2224 }
2225 mutex_unlock(&info->extent_ins_mutex);
2226
2227 /*
2228 * still have things left on the update list, go ahead and update
2229 * everything
2230 */
2231 if (!list_empty(&update_list)) {
2232 ret = update_backrefs(trans, extent_root, path, &update_list);
2233 BUG_ON(ret);
2234 }
2235
2236 /*
2237 * if no inserts need to be done, but we skipped some extents and we
2238 * need to make sure everything is cleaned then reset everything and
2239 * go back to the beginning
2240 */
2241 if (!num_inserts && all && skipped) {
2242 search = 0;
2243 skipped = 0;
2244 INIT_LIST_HEAD(&update_list);
2245 INIT_LIST_HEAD(&insert_list);
2246 goto again;
2247 } else if (!num_inserts) {
2248 goto out;
2249 }
2250
2251 /*
2252 * process the insert extents list. Again if we are deleting this
2253 * extent, then just unlock it, pin down the bytes if need be, and be
2254 * done with it. Saves us from having to actually insert the extent
2255 * into the tree and then subsequently come along and delete it
2256 */
2257 mutex_lock(&info->extent_ins_mutex);
2258 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2259 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2260 extent_op->bytenr + extent_op->num_bytes - 1,
2261 EXTENT_WRITEBACK, GFP_NOFS);
2262 if (extent_op->del) {
2263 u64 used;
2264 list_del_init(&extent_op->list);
2265 unlock_extent(&info->extent_ins, extent_op->bytenr,
2266 extent_op->bytenr + extent_op->num_bytes
2267 - 1, GFP_NOFS);
2268
2269 mutex_lock(&extent_root->fs_info->pinned_mutex);
2270 ret = pin_down_bytes(trans, extent_root,
2271 extent_op->bytenr,
2272 extent_op->num_bytes, 0);
2273 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2274
/* subtract the never-inserted extent from the super block and root item counters */
2275 spin_lock(&info->delalloc_lock);
2276 used = btrfs_super_bytes_used(&info->super_copy);
2277 btrfs_set_super_bytes_used(&info->super_copy,
2278 used - extent_op->num_bytes);
2279 used = btrfs_root_used(&extent_root->root_item);
2280 btrfs_set_root_used(&extent_root->root_item,
2281 used - extent_op->num_bytes);
2282 spin_unlock(&info->delalloc_lock);
2283
/* ret > 0 from pin_down_bytes() is passed on as the mark_free argument */
2284 ret = update_block_group(trans, extent_root,
2285 extent_op->bytenr,
2286 extent_op->num_bytes,
2287 0, ret > 0);
2288 BUG_ON(ret);
2289 kfree(extent_op);
2290 num_inserts--;
2291 }
2292 }
2293 mutex_unlock(&info->extent_ins_mutex);
2294
2295 ret = insert_extents(trans, extent_root, path, &insert_list,
2296 num_inserts);
2297 BUG_ON(ret);
2298
2299 /*
2300 * if we broke out of the loop in order to insert stuff because we hit
2301 * the maximum number of inserts at a time we can handle, then loop
2302 * back and pick up where we left off
2303 */
2304 if (num_inserts == max_inserts) {
2305 INIT_LIST_HEAD(&insert_list);
2306 INIT_LIST_HEAD(&update_list);
2307 num_inserts = 0;
2308 goto again;
2309 }
2310
2311 /*
2312 * again, if we need to make absolutely sure there are no more pending
2313 * extent operations left and we know that we skipped some, go back to
2314 * the beginning and do it all again
2315 */
2316 if (all && skipped) {
2317 INIT_LIST_HEAD(&insert_list);
2318 INIT_LIST_HEAD(&update_list);
2319 search = 0;
2320 skipped = 0;
2321 num_inserts = 0;
2322 goto again;
2323 }
2324out:
2325 btrfs_free_path(path);
2326 return 0;
2327}
2328
2329static int pin_down_bytes(struct btrfs_trans_handle *trans,
2330 struct btrfs_root *root,
2331 u64 bytenr, u64 num_bytes, int is_data)
2332{
2333 int err = 0;
2334 struct extent_buffer *buf;
2335
2336 if (is_data)
2337 goto pinit;
2338
2339 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2340 if (!buf)
2341 goto pinit;
2342
2343 /* we can reuse a block if it hasn't been written
2344 * and it is from this transaction. We can't
2345 * reuse anything from the tree log root because
2346 * it has tiny sub-transactions.
2347 */
2348 if (btrfs_buffer_uptodate(buf, 0) &&
2349 btrfs_try_tree_lock(buf)) {
2350 u64 header_owner = btrfs_header_owner(buf);
2351 u64 header_transid = btrfs_header_generation(buf);
2352 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2353 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2354 header_transid == trans->transid &&
2355 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2356 clean_tree_block(NULL, root, buf);
2357 btrfs_tree_unlock(buf);
2358 free_extent_buffer(buf);
2359 return 1;
2360 }
2361 btrfs_tree_unlock(buf);
2362 }
2363 free_extent_buffer(buf);
2364pinit:
2365 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2366
2367 BUG_ON(err < 0);
2368 return 0;
2369}
2370
2371/*
2372 * remove an extent from the root, returns 0 on success
 *
 * Drops one reference from the extent item keyed by (bytenr, num_bytes) in
 * the extent root and removes the matching back reference.  When the count
 * reaches zero the extent item is deleted, the bytes are optionally pinned
 * ('pin'), super block / root item / block group usage is reduced and, for
 * owners >= BTRFS_FIRST_FREE_OBJECTID, btrfs_del_csums() is called.
 * 'mark_free' is forwarded to update_block_group() and is forced to 1 when
 * pin_down_bytes() returns > 0.
 *
 * Returns -ENOMEM if no path can be allocated; most other failures hit
 * BUG_ON.
2373 */
2374static int __free_extent(struct btrfs_trans_handle *trans,
2375 struct btrfs_root *root,
2376 u64 bytenr, u64 num_bytes, u64 parent,
2377 u64 root_objectid, u64 ref_generation,
2378 u64 owner_objectid, int pin, int mark_free)
2379{
2380 struct btrfs_path *path;
2381 struct btrfs_key key;
2382 struct btrfs_fs_info *info = root->fs_info;
2383 struct btrfs_root *extent_root = info->extent_root;
2384 struct extent_buffer *leaf;
2385 int ret;
2386 int extent_slot = 0;
2387 int found_extent = 0;
2388 int num_to_del = 1;
2389 struct btrfs_extent_item *ei;
2390 u32 refs;
2391
2392 key.objectid = bytenr;
2393 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2394 key.offset = num_bytes;
2395 path = btrfs_alloc_path();
2396 if (!path)
2397 return -ENOMEM;
2398
2399 path->reada = 1;
2400 ret = lookup_extent_backref(trans, extent_root, path,
2401 bytenr, parent, root_objectid,
2402 ref_generation, owner_objectid, 1);
2403 if (ret == 0) {
2404 struct btrfs_key found_key;
/* scan a few slots backwards from the back ref for the extent item in the same leaf */
2405 extent_slot = path->slots[0];
2406 while (extent_slot > 0) {
2407 extent_slot--;
2408 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2409 extent_slot);
2410 if (found_key.objectid != bytenr)
2411 break;
2412 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2413 found_key.offset == num_bytes) {
2414 found_extent = 1;
2415 break;
2416 }
/* give up once more than 5 slots away from the back ref */
2417 if (path->slots[0] - extent_slot > 5)
2418 break;
2419 }
2420 if (!found_extent) {
/* extent item not nearby: remove the back ref now, then search for the extent item by key */
2421 ret = remove_extent_backref(trans, extent_root, path);
2422 BUG_ON(ret);
2423 btrfs_release_path(extent_root, path);
2424 ret = btrfs_search_slot(trans, extent_root,
2425 &key, path, -1, 1);
2426 if (ret) {
2427 printk(KERN_ERR "umm, got %d back from search"
2428 ", was looking for %llu\n", ret,
2429 (unsigned long long)bytenr);
2430 btrfs_print_leaf(extent_root, path->nodes[0]);
2431 }
2432 BUG_ON(ret);
2433 extent_slot = path->slots[0];
2434 }
2435 } else {
/*
 * NOTE(review): the back ref lookup failed, yet execution continues
 * below with extent_slot still 0 and whatever leaf the path holds -
 * confirm this is intended.
 */
2436 btrfs_print_leaf(extent_root, path->nodes[0]);
2437 WARN_ON(1);
2438 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2439 "root %llu gen %llu owner %llu\n",
2440 (unsigned long long)bytenr,
2441 (unsigned long long)root_objectid,
2442 (unsigned long long)ref_generation,
2443 (unsigned long long)owner_objectid);
2444 }
2445
/* drop one reference from the extent item */
2446 leaf = path->nodes[0];
2447 ei = btrfs_item_ptr(leaf, extent_slot,
2448 struct btrfs_extent_item);
2449 refs = btrfs_extent_refs(leaf, ei);
2450 BUG_ON(refs == 0);
2451 refs -= 1;
2452 btrfs_set_extent_refs(leaf, ei, refs);
2453
2454 btrfs_mark_buffer_dirty(leaf);
2455
2456 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2457 struct btrfs_extent_ref *ref;
2458 ref = btrfs_item_ptr(leaf, path->slots[0],
2459 struct btrfs_extent_ref);
2460 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2461 /* if the back ref and the extent are next to each other
2462 * they get deleted below in one shot
2463 */
2464 path->slots[0] = extent_slot;
2465 num_to_del = 2;
2466 } else if (found_extent) {
2467 /* otherwise delete the extent back ref */
2468 ret = remove_extent_backref(trans, extent_root, path);
2469 BUG_ON(ret);
2470 /* if refs are 0, we need to setup the path for deletion */
2471 if (refs == 0) {
2472 btrfs_release_path(extent_root, path);
2473 ret = btrfs_search_slot(trans, extent_root, &key, path,
2474 -1, 1);
2475 BUG_ON(ret);
2476 }
2477 }
2478
2479 if (refs == 0) {
2480 u64 super_used;
2481 u64 root_used;
2482
2483 if (pin) {
2484 mutex_lock(&root->fs_info->pinned_mutex);
2485 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2486 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2487 mutex_unlock(&root->fs_info->pinned_mutex);
/* a positive return forces mark_free for update_block_group() below */
2488 if (ret > 0)
2489 mark_free = 1;
2490 BUG_ON(ret < 0);
2491 }
2492 /* block accounting for super block */
2493 spin_lock(&info->delalloc_lock);
2494 super_used = btrfs_super_bytes_used(&info->super_copy);
2495 btrfs_set_super_bytes_used(&info->super_copy,
2496 super_used - num_bytes);
2497
2498 /* block accounting for root item */
2499 root_used = btrfs_root_used(&root->root_item);
2500 btrfs_set_root_used(&root->root_item,
2501 root_used - num_bytes);
2502 spin_unlock(&info->delalloc_lock);
2503 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2504 num_to_del);
2505 BUG_ON(ret);
2506 btrfs_release_path(extent_root, path);
2507
2508 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2509 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2510 BUG_ON(ret);
2511 }
2512
2513 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2514 mark_free);
2515 BUG_ON(ret);
2516 }
2517 btrfs_free_path(path);
/* process pending extent-tree operations; the return value is ignored here */
2518 finish_current_insert(trans, extent_root, 0);
2519 return ret;
2520}
2521
2522/*
2523 * find all the blocks marked as pending in the radix tree and remove
2524 * them from the extent map
2525 */
2526static int del_pending_extents(struct btrfs_trans_handle *trans,
2527 struct btrfs_root *extent_root, int all)
2528{
2529 int ret;
2530 int err = 0;
2531 u64 start;
2532 u64 end;
2533 u64 priv;
2534 u64 search = 0;
2535 int nr = 0, skipped = 0;
2536 struct extent_io_tree *pending_del;
2537 struct extent_io_tree *extent_ins;
2538 struct pending_extent_op *extent_op;
2539 struct btrfs_fs_info *info = extent_root->fs_info;
2540 struct list_head delete_list;
2541
2542 INIT_LIST_HEAD(&delete_list);
2543 extent_ins = &extent_root->fs_info->extent_ins;
2544 pending_del = &extent_root->fs_info->pending_del;
2545
2546again:
2547 mutex_lock(&info->extent_ins_mutex);
2548 while (1) {
2549 ret = find_first_extent_bit(pending_del, search, &start, &end,
2550 EXTENT_WRITEBACK);
2551 if (ret) {
2552 if (all && skipped && !nr) {
2553 search = 0;
2554 continue;
2555 }
2556 mutex_unlock(&info->extent_ins_mutex);
2557 break;
2558 }
2559
2560 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2561 if (!ret) {
2562 search = end+1;
2563 skipped = 1;
2564
2565 if (need_resched()) {
2566 mutex_unlock(&info->extent_ins_mutex);
2567 cond_resched();
2568 mutex_lock(&info->extent_ins_mutex);
2569 }
2570
2571 continue;
2572 }
2573 BUG_ON(ret < 0);
2574
2575 ret = get_state_private(pending_del, start, &priv);
2576 BUG_ON(ret);
2577 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2578
2579 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2580 GFP_NOFS);
2581 if (!test_range_bit(extent_ins, start, end,
2582 EXTENT_WRITEBACK, 0)) {
2583 list_add_tail(&extent_op->list, &delete_list);
2584 nr++;
2585 } else {
2586 kfree(extent_op);
2587
2588 ret = get_state_private(&info->extent_ins, start,
2589 &priv);
2590 BUG_ON(ret);
2591 extent_op = (struct pending_extent_op *)
2592 (unsigned long)priv;
2593
2594 clear_extent_bits(&info->extent_ins, start, end,
2595 EXTENT_WRITEBACK, GFP_NOFS);
2596
2597 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2598 list_add_tail(&extent_op->list, &delete_list);
2599 search = end + 1;
2600 nr++;
2601 continue;
2602 }
2603
2604 mutex_lock(&extent_root->fs_info->pinned_mutex);
2605 ret = pin_down_bytes(trans, extent_root, start,
2606 end + 1 - start, 0);
2607 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2608
2609 ret = update_block_group(trans, extent_root, start,
2610 end + 1 - start, 0, ret > 0);
2611
2612 unlock_extent(extent_ins, start, end, GFP_NOFS);
2613 BUG_ON(ret);
2614 kfree(extent_op);
2615 }
2616 if (ret)
2617 err = ret;
2618
2619 search = end + 1;
2620
2621 if (need_resched()) {
2622 mutex_unlock(&info->extent_ins_mutex);
2623 cond_resched();
2624 mutex_lock(&info->extent_ins_mutex);
2625 }
2626 }
2627
2628 if (nr) {
2629 ret = free_extents(trans, extent_root, &delete_list);
2630 BUG_ON(ret);
2631 }
2632
2633 if (all && skipped) {
2634 INIT_LIST_HEAD(&delete_list);
2635 search = 0;
2636 nr = 0;
2637 goto again;
2638 }
2639
2640 return err;
2641}
2642
2643/*
2644 * remove an extent from the root, returns 0 on success
 *
 * Three paths are visible here:
 *  - root is the extent root: the free is not executed but queued as a
 *    PENDING_EXTENT_DELETE op in fs_info->pending_del (or, if an insert
 *    for the same range is still pending in extent_ins, that insert is
 *    just flagged ->del);
 *  - metadata owned by the tree log root: the range goes straight back
 *    through btrfs_add_free_space() and its reservation is dropped;
 *  - everything else: __free_extent() is called, then pending inserts and
 *    deletes are processed with all == 0.
2645 */
2646static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2647 struct btrfs_root *root,
2648 u64 bytenr, u64 num_bytes, u64 parent,
2649 u64 root_objectid, u64 ref_generation,
2650 u64 owner_objectid, int pin)
2651{
2652 struct btrfs_root *extent_root = root->fs_info->extent_root;
2653 int pending_ret;
2654 int ret;
2655
2656 WARN_ON(num_bytes < root->sectorsize);
2657 if (root == extent_root) {
2658 struct pending_extent_op *extent_op = NULL;
2659
2660 mutex_lock(&root->fs_info->extent_ins_mutex);
2661 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2662 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2663 u64 priv;
2664 ret = get_state_private(&root->fs_info->extent_ins,
2665 bytenr, &priv);
2666 BUG_ON(ret);
2667 extent_op = (struct pending_extent_op *)
2668 (unsigned long)priv;
2669
/* flag the pending op as deleted; a pending insert needs nothing more */
2670 extent_op->del = 1;
2671 if (extent_op->type == PENDING_EXTENT_INSERT) {
2672 mutex_unlock(&root->fs_info->extent_ins_mutex);
2673 return 0;
2674 }
2675 }
2676
/* a pending non-insert op exists: delete using its original generation/parent */
2677 if (extent_op) {
2678 ref_generation = extent_op->orig_generation;
2679 parent = extent_op->orig_parent;
2680 }
2681
2682 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2683 BUG_ON(!extent_op);
2684
2685 extent_op->type = PENDING_EXTENT_DELETE;
2686 extent_op->bytenr = bytenr;
2687 extent_op->num_bytes = num_bytes;
2688 extent_op->parent = parent;
2689 extent_op->orig_parent = parent;
2690 extent_op->generation = ref_generation;
2691 extent_op->orig_generation = ref_generation;
2692 extent_op->level = (int)owner_objectid;
2693 INIT_LIST_HEAD(&extent_op->list);
2694 extent_op->del = 0;
2695
/* queue the delete: mark the range in pending_del and attach the op as its private value */
2696 set_extent_bits(&root->fs_info->pending_del,
2697 bytenr, bytenr + num_bytes - 1,
2698 EXTENT_WRITEBACK, GFP_NOFS);
2699 set_state_private(&root->fs_info->pending_del,
2700 bytenr, (unsigned long)extent_op);
2701 mutex_unlock(&root->fs_info->extent_ins_mutex);
2702 return 0;
2703 }
2704 /* if metadata always pin */
2705 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2706 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2707 struct btrfs_block_group_cache *cache;
2708
2709 /* btrfs_free_reserved_extent */
2710 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2711 BUG_ON(!cache);
2712 btrfs_add_free_space(cache, bytenr, num_bytes);
2713 put_block_group(cache);
2714 update_reserved_extents(root, bytenr, num_bytes, 0);
2715 return 0;
2716 }
2717 pin = 1;
2718 }
2719
2720 /* if data pin when any transaction has committed this */
2721 if (ref_generation != trans->transid)
2722 pin = 1;
2723
/* the last argument is mark_free: set only when the extent is not pinned */
2724 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2725 root_objectid, ref_generation,
2726 owner_objectid, pin, pin == 0);
2727
2728 finish_current_insert(trans, root->fs_info->extent_root, 0);
2729 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
/* an error from __free_extent() takes precedence over the pending-delete status */
2730 return ret ? ret : pending_ret;
2731}
2732
2733int btrfs_free_extent(struct btrfs_trans_handle *trans,
2734 struct btrfs_root *root,
2735 u64 bytenr, u64 num_bytes, u64 parent,
2736 u64 root_objectid, u64 ref_generation,
2737 u64 owner_objectid, int pin)
2738{
2739 int ret;
2740
2741 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2742 root_objectid, ref_generation,
2743 owner_objectid, pin);
2744 return ret;
2745}
2746
2747static u64 stripe_align(struct btrfs_root *root, u64 val)
2748{
2749 u64 mask = ((u64)root->stripesize - 1);
2750 u64 ret = (val + mask) & ~mask;
2751 return ret;
2752}
2753
2754/*
2755 * walks the block groups of the matching space_info and finds a free
2756 * range of the given size. The key ins is changed to record the hole:
2757 * ins->objectid == block start
2758 * ins->type == BTRFS_EXTENT_ITEM_KEY
2759 * ins->offset == number of bytes (num_bytes)
2760 * Any available blocks before search_start are skipped.
 *
 * Returns 0 when a range was found and removed from the free space of its
 * block group, -ENOSPC when nothing was found, or the non-zero result of
 * cache_block_group() / do_chunk_alloc() if one of those ended the search.
2761 */
2762static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2763 struct btrfs_root *orig_root,
2764 u64 num_bytes, u64 empty_size,
2765 u64 search_start, u64 search_end,
2766 u64 hint_byte, struct btrfs_key *ins,
2767 u64 exclude_start, u64 exclude_nr,
2768 int data)
2769{
2770 int ret = 0;
2771 struct btrfs_root *root = orig_root->fs_info->extent_root;
2772 u64 total_needed = num_bytes;
2773 u64 *last_ptr = NULL;
2774 u64 last_wanted = 0;
2775 struct btrfs_block_group_cache *block_group = NULL;
2776 int chunk_alloc_done = 0;
2777 int empty_cluster = 2 * 1024 * 1024;
2778 int allowed_chunk_alloc = 0;
2779 struct list_head *head = NULL, *cur = NULL;
2780 int loop = 0;
2781 int extra_loop = 0;
2782 struct btrfs_space_info *space_info;
2783
2784 WARN_ON(num_bytes < root->sectorsize);
2785 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2786 ins->objectid = 0;
2787 ins->offset = 0;
2788
2789 if (orig_root->ref_cows || empty_size)
2790 allowed_chunk_alloc = 1;
2791
/* metadata allocations continue from fs_info->last_alloc, with a 64K cluster */
2792 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2793 last_ptr = &root->fs_info->last_alloc;
2794 empty_cluster = 64 * 1024;
2795 }
2796
/* with the SSD option, data allocations continue from last_data_alloc */
2797 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2798 last_ptr = &root->fs_info->last_data_alloc;
2799
2800 if (last_ptr) {
2801 if (*last_ptr) {
2802 hint_byte = *last_ptr;
2803 last_wanted = *last_ptr;
2804 } else
2805 empty_size += empty_cluster;
2806 } else {
2807 empty_cluster = 0;
2808 }
2809 search_start = max(search_start, first_logical_byte(root, 0));
2810 search_start = max(search_start, hint_byte);
2811
/* the start was moved away from the remembered position: stop comparing, ask for a cluster */
2812 if (last_wanted && search_start != last_wanted) {
2813 last_wanted = 0;
2814 empty_size += empty_cluster;
2815 }
2816
2817 total_needed += empty_size;
2818 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2819 if (!block_group)
2820 block_group = btrfs_lookup_first_block_group(root->fs_info,
2821 search_start);
2822 space_info = __find_space_info(root->fs_info, data);
2823
2824 down_read(&space_info->groups_sem);
2825 while (1) {
2826 struct btrfs_free_space *free_space;
2827 /*
2828 * the only way this happens is if our hint points to a block
2829 * group that's not of the proper type, while looping this
2830 * should never happen
2831 */
2832 if (empty_size)
2833 extra_loop = 1;
2834
2835 if (!block_group)
2836 goto new_group_no_lock;
2837
2838 if (unlikely(!block_group->cached)) {
2839 mutex_lock(&block_group->cache_mutex);
2840 ret = cache_block_group(root, block_group);
2841 mutex_unlock(&block_group->cache_mutex);
2842 if (ret)
2843 break;
2844 }
2845
2846 mutex_lock(&block_group->alloc_mutex);
2847 if (unlikely(!block_group_bits(block_group, data)))
2848 goto new_group;
2849
2850 if (unlikely(block_group->ro))
2851 goto new_group;
2852
2853 free_space = btrfs_find_free_space(block_group, search_start,
2854 total_needed);
2855 if (free_space) {
2856 u64 start = block_group->key.objectid;
2857 u64 end = block_group->key.objectid +
2858 block_group->key.offset;
2859
2860 search_start = stripe_align(root, free_space->offset);
2861
2862 /* move on to the next group */
2863 if (search_start + num_bytes >= search_end)
2864 goto new_group;
2865
2866 /* move on to the next group */
2867 if (search_start + num_bytes > end)
2868 goto new_group;
2869
2870 if (last_wanted && search_start != last_wanted) {
2871 total_needed += empty_cluster;
2872 empty_size += empty_cluster;
2873 last_wanted = 0;
2874 /*
2875 * if search_start is still in this block group
2876 * then we just re-search this block group
2877 */
2878 if (search_start >= start &&
2879 search_start < end) {
2880 mutex_unlock(&block_group->alloc_mutex);
2881 continue;
2882 }
2883
2884 /* else we go to the next block group */
2885 goto new_group;
2886 }
2887
/* candidate overlaps the caller's excluded range: restart just past it */
2888 if (exclude_nr > 0 &&
2889 (search_start + num_bytes > exclude_start &&
2890 search_start < exclude_start + exclude_nr)) {
2891 search_start = exclude_start + exclude_nr;
2892 /*
2893 * if search_start is still in this block group
2894 * then we just re-search this block group
2895 */
2896 if (search_start >= start &&
2897 search_start < end) {
2898 mutex_unlock(&block_group->alloc_mutex);
2899 last_wanted = 0;
2900 continue;
2901 }
2902
2903 /* else we go to the next block group */
2904 goto new_group;
2905 }
2906
2907 ins->objectid = search_start;
2908 ins->offset = num_bytes;
2909
2910 btrfs_remove_free_space_lock(block_group, search_start,
2911 num_bytes);
2912 /* we are all good, lets return */
2913 mutex_unlock(&block_group->alloc_mutex);
2914 break;
2915 }
2916new_group:
2917 mutex_unlock(&block_group->alloc_mutex);
2918 put_block_group(block_group);
2919 block_group = NULL;
2920new_group_no_lock:
2921 /* don't try to compare new allocations against the
2922 * last allocation any more
2923 */
2924 last_wanted = 0;
2925
2926 /*
2927 * Here's how this works.
2928 * loop == 0: we were searching a block group via a hint
2929 * and didn't find anything, so we start at
2930 * the head of the block groups and keep searching
2931 * loop == 1: we're searching through all of the block groups
2932 * if we hit the head again we have searched
2933 * all of the block groups for this space and we
2934 * need to try and allocate, if we can't, error out.
2935 * loop == 2: we allocated more space and are looping through
2936 * all of the block groups again.
2937 */
2938 if (loop == 0) {
2939 head = &space_info->block_groups;
2940 cur = head->next;
2941 loop++;
2942 } else if (loop == 1 && cur == head) {
2943 int keep_going;
2944
2945 /* at this point we give up on the empty_size
2946 * allocations and just try to allocate the min
2947 * space.
2948 *
2949 * The extra_loop field was set if an empty_size
2950 * allocation was attempted above, and if this
2951 * is true we need to try the loop again without
2952 * the additional empty_size.
2953 */
2954 total_needed -= empty_size;
2955 empty_size = 0;
2956 keep_going = extra_loop;
2957 loop++;
2958
2959 if (allowed_chunk_alloc && !chunk_alloc_done) {
/* groups_sem is dropped around do_chunk_alloc() and retaken afterwards */
2960 up_read(&space_info->groups_sem);
2961 ret = do_chunk_alloc(trans, root, num_bytes +
2962 2 * 1024 * 1024, data, 1);
2963 down_read(&space_info->groups_sem);
2964 if (ret < 0)
2965 goto loop_check;
2966 head = &space_info->block_groups;
2967 /*
2968 * we've allocated a new chunk, keep
2969 * trying
2970 */
2971 keep_going = 1;
2972 chunk_alloc_done = 1;
2973 } else if (!allowed_chunk_alloc) {
2974 space_info->force_alloc = 1;
2975 }
2976loop_check:
2977 if (keep_going) {
2978 cur = head->next;
2979 extra_loop = 0;
2980 } else {
2981 break;
2982 }
2983 } else if (cur == head) {
2984 break;
2985 }
2986
/* take a reference on the next block group in the list and search from its start */
2987 block_group = list_entry(cur, struct btrfs_block_group_cache,
2988 list);
2989 atomic_inc(&block_group->count);
2990
2991 search_start = block_group->key.objectid;
2992 cur = cur->next;
2993 }
2994
2995 /* we found what we needed */
2996 if (ins->objectid) {
2997 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2998 trans->block_group = block_group->key.objectid;
2999
/* remember the end of this allocation as the next starting hint */
3000 if (last_ptr)
3001 *last_ptr = ins->objectid + ins->offset;
3002 ret = 0;
3003 } else if (!ret) {
3004 printk(KERN_ERR "btrfs searching for %llu bytes, "
3005 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3006 (unsigned long long)total_needed,
3007 (unsigned long long)num_bytes,
3008 loop, allowed_chunk_alloc);
3009 ret = -ENOSPC;
3010 }
3011 if (block_group)
3012 put_block_group(block_group);
3013
3014 up_read(&space_info->groups_sem);
3015 return ret;
3016}
3017
3018static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3019{
3020 struct btrfs_block_group_cache *cache;
3021 struct list_head *l;
3022
3023 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3024 (unsigned long long)(info->total_bytes - info->bytes_used -
3025 info->bytes_pinned - info->bytes_reserved),
3026 (info->full) ? "" : "not ");
3027
3028 down_read(&info->groups_sem);
3029 list_for_each(l, &info->block_groups) {
3030 cache = list_entry(l, struct btrfs_block_group_cache, list);
3031 spin_lock(&cache->lock);
3032 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3033 "%llu pinned %llu reserved\n",
3034 (unsigned long long)cache->key.objectid,
3035 (unsigned long long)cache->key.offset,
3036 (unsigned long long)btrfs_block_group_used(&cache->item),
3037 (unsigned long long)cache->pinned,
3038 (unsigned long long)cache->reserved);
3039 btrfs_dump_free_space(cache, bytes);
3040 spin_unlock(&cache->lock);
3041 }
3042 up_read(&info->groups_sem);
3043}
3044
3045static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3046 struct btrfs_root *root,
3047 u64 num_bytes, u64 min_alloc_size,
3048 u64 empty_size, u64 hint_byte,
3049 u64 search_end, struct btrfs_key *ins,
3050 u64 data)
3051{
3052 int ret;
3053 u64 search_start = 0;
3054 u64 alloc_profile;
3055 struct btrfs_fs_info *info = root->fs_info;
3056
3057 if (data) {
3058 alloc_profile = info->avail_data_alloc_bits &
3059 info->data_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3061 } else if (root == root->fs_info->chunk_root) {
3062 alloc_profile = info->avail_system_alloc_bits &
3063 info->system_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3065 } else {
3066 alloc_profile = info->avail_metadata_alloc_bits &
3067 info->metadata_alloc_profile;
3068 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3069 }
3070again:
3071 data = btrfs_reduce_alloc_profile(root, data);
3072 /*
3073 * the only place that sets empty_size is btrfs_realloc_node, which
3074 * is not called recursively on allocations
3075 */
3076 if (empty_size || root->ref_cows) {
3077 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3078 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3079 2 * 1024 * 1024,
3080 BTRFS_BLOCK_GROUP_METADATA |
3081 (info->metadata_alloc_profile &
3082 info->avail_metadata_alloc_bits), 0);
3083 }
3084 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3085 num_bytes + 2 * 1024 * 1024, data, 0);
3086 }
3087
3088 WARN_ON(num_bytes < root->sectorsize);
3089 ret = find_free_extent(trans, root, num_bytes, empty_size,
3090 search_start, search_end, hint_byte, ins,
3091 trans->alloc_exclude_start,
3092 trans->alloc_exclude_nr, data);
3093
3094 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3095 num_bytes = num_bytes >> 1;
3096 num_bytes = num_bytes & ~(root->sectorsize - 1);
3097 num_bytes = max(num_bytes, min_alloc_size);
3098 do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes, data, 1);
3100 goto again;
3101 }
3102 if (ret) {
3103 struct btrfs_space_info *sinfo;
3104
3105 sinfo = __find_space_info(root->fs_info, data);
3106 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3107 "wanted %llu\n", (unsigned long long)data,
3108 (unsigned long long)num_bytes);
3109 dump_space_info(sinfo, num_bytes);
3110 BUG();
3111 }
3112
3113 return ret;
3114}
3115
3116int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3117{
3118 struct btrfs_block_group_cache *cache;
3119 int ret = 0;
3120
3121 cache = btrfs_lookup_block_group(root->fs_info, start);
3122 if (!cache) {
3123 printk(KERN_ERR "Unable to find block group for %llu\n",
3124 (unsigned long long)start);
3125 return -ENOSPC;
3126 }
3127
3128 ret = btrfs_discard_extent(root, start, len);
3129
3130 btrfs_add_free_space(cache, start, len);
3131 put_block_group(cache);
3132 update_reserved_extents(root, start, len, 0);
3133
3134 return ret;
3135}
3136
3137int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3138 struct btrfs_root *root,
3139 u64 num_bytes, u64 min_alloc_size,
3140 u64 empty_size, u64 hint_byte,
3141 u64 search_end, struct btrfs_key *ins,
3142 u64 data)
3143{
3144 int ret;
3145 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3146 empty_size, hint_byte, search_end, ins,
3147 data);
3148 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3149 return ret;
3150}
3151
/*
 * Record a reserved extent as allocated.  ins describes the extent:
 * ins->objectid is its start and ins->offset its length in bytes.
 *
 * The used-byte counters in the super block copy and in the root item are
 * increased by the extent length.  When root is the extent root itself the
 * insertion is queued as a PENDING_EXTENT_INSERT op in fs_info->extent_ins;
 * for every other root an extent item plus one extent ref (keyed by parent)
 * is inserted into the extent tree right away.  Finally the block group
 * accounting is updated through update_block_group().
 *
 * A parent of 0 is replaced by ins->objectid.
 *
 * Returns 0, or the non-zero result of del_pending_extents().  Allocation
 * and insertion failures end in BUG_ON()/BUG().
 */
3152static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3153 struct btrfs_root *root, u64 parent,
3154 u64 root_objectid, u64 ref_generation,
3155 u64 owner, struct btrfs_key *ins)
3156{
3157 int ret;
3158 int pending_ret;
3159 u64 super_used;
3160 u64 root_used;
3161 u64 num_bytes = ins->offset;
3162 u32 sizes[2];
3163 struct btrfs_fs_info *info = root->fs_info;
3164 struct btrfs_root *extent_root = info->extent_root;
3165 struct btrfs_extent_item *extent_item;
3166 struct btrfs_extent_ref *ref;
3167 struct btrfs_path *path;
3168 struct btrfs_key keys[2];
3169
3170 if (parent == 0)
3171 parent = ins->objectid;
3172
3173 /* block accounting for super block */
3174 spin_lock(&info->delalloc_lock);
3175 super_used = btrfs_super_bytes_used(&info->super_copy);
3176 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3177
3178 /* block accounting for root item */
3179 root_used = btrfs_root_used(&root->root_item);
3180 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3181 spin_unlock(&info->delalloc_lock);
3182
	/*
	 * allocations for the extent root are not inserted here; they are
	 * recorded in extent_ins and only the block group is updated below
	 */
3183 if (root == extent_root) {
3184 struct pending_extent_op *extent_op;
3185
3186 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3187 BUG_ON(!extent_op);
3188
3189 extent_op->type = PENDING_EXTENT_INSERT;
3190 extent_op->bytenr = ins->objectid;
3191 extent_op->num_bytes = ins->offset;
3192 extent_op->parent = parent;
3193 extent_op->orig_parent = 0;
3194 extent_op->generation = ref_generation;
3195 extent_op->orig_generation = 0;
3196 extent_op->level = (int)owner;
3197 INIT_LIST_HEAD(&extent_op->list);
3198 extent_op->del = 0;
3199
	/* tag the range and hang the op off the state as private data */
3200 mutex_lock(&root->fs_info->extent_ins_mutex);
3201 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3202 ins->objectid + ins->offset - 1,
3203 EXTENT_WRITEBACK, GFP_NOFS);
3204 set_state_private(&root->fs_info->extent_ins,
3205 ins->objectid, (unsigned long)extent_op);
3206 mutex_unlock(&root->fs_info->extent_ins_mutex);
3207 goto update_block;
3208 }
3209
	/* keys[0] is the extent item (a copy of ins), keys[1] its back ref */
3210 memcpy(&keys[0], ins, sizeof(*ins));
3211 keys[1].objectid = ins->objectid;
3212 keys[1].type = BTRFS_EXTENT_REF_KEY;
3213 keys[1].offset = parent;
3214 sizes[0] = sizeof(*extent_item);
3215 sizes[1] = sizeof(*ref);
3216
3217 path = btrfs_alloc_path();
3218 BUG_ON(!path);
3219
3220 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3221 sizes, 2);
3222 BUG_ON(ret);
3223
	/* the two new items sit in adjacent slots of the same leaf */
3224 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3225 struct btrfs_extent_item);
3226 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3227 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3228 struct btrfs_extent_ref);
3229
3230 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3231 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3232 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3233 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3234
3235 btrfs_mark_buffer_dirty(path->nodes[0]);
3236
3237 trans->alloc_exclude_start = 0;
3238 trans->alloc_exclude_nr = 0;
3239 btrfs_free_path(path);
	/* NOTE(review): the return value of finish_current_insert() is ignored */
3240 finish_current_insert(trans, extent_root, 0);
3241 pending_ret = del_pending_extents(trans, extent_root, 0);
3242
	/* ret still holds the insert result that BUG_ON() checked above */
3243 if (ret)
3244 goto out;
3245 if (pending_ret) {
3246 ret = pending_ret;
3247 goto out;
3248 }
3249
3250update_block:
3251 ret = update_block_group(trans, root, ins->objectid,
3252 ins->offset, 1, 0);
3253 if (ret) {
3254 printk(KERN_ERR "btrfs update block group failed for %llu "
3255 "%llu\n", (unsigned long long)ins->objectid,
3256 (unsigned long long)ins->offset);
3257 BUG();
3258 }
3259out:
3260 return ret;
3261}
3262
3263int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3264 struct btrfs_root *root, u64 parent,
3265 u64 root_objectid, u64 ref_generation,
3266 u64 owner, struct btrfs_key *ins)
3267{
3268 int ret;
3269
3270 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3271 return 0;
3272 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3273 ref_generation, owner, ins);
3274 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3275 return ret;
3276}
3277
3278/*
3279 * this is used by the tree logging recovery code. It records that
3280 * an extent has been allocated and makes sure to clear the free
3281 * space cache bits as well
3282 */
3283int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3284 struct btrfs_root *root, u64 parent,
3285 u64 root_objectid, u64 ref_generation,
3286 u64 owner, struct btrfs_key *ins)
3287{
3288 int ret;
3289 struct btrfs_block_group_cache *block_group;
3290
3291 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3292 mutex_lock(&block_group->cache_mutex);
3293 cache_block_group(root, block_group);
3294 mutex_unlock(&block_group->cache_mutex);
3295
3296 ret = btrfs_remove_free_space(block_group, ins->objectid,
3297 ins->offset);
3298 BUG_ON(ret);
3299 put_block_group(block_group);
3300 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3301 ref_generation, owner, ins);
3302 return ret;
3303}
3304
3305/*
3306 * finds a free extent and does all the dirty work required for allocation
3307 * returns the key for the extent through ins, and a tree buffer for
3308 * the first block of the extent through buf.
3309 *
3310 * returns 0 if everything worked, non-zero otherwise.
3311 */
3312int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3313 struct btrfs_root *root,
3314 u64 num_bytes, u64 parent, u64 min_alloc_size,
3315 u64 root_objectid, u64 ref_generation,
3316 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3317 u64 search_end, struct btrfs_key *ins, u64 data)
3318{
3319 int ret;
3320
3321 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3322 min_alloc_size, empty_size, hint_byte,
3323 search_end, ins, data);
3324 BUG_ON(ret);
3325 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3326 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3327 root_objectid, ref_generation,
3328 owner_objectid, ins);
3329 BUG_ON(ret);
3330
3331 } else {
3332 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3333 }
3334 return ret;
3335}
3336
3337struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3338 struct btrfs_root *root,
3339 u64 bytenr, u32 blocksize)
3340{
3341 struct extent_buffer *buf;
3342
3343 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3344 if (!buf)
3345 return ERR_PTR(-ENOMEM);
3346 btrfs_set_header_generation(buf, trans->transid);
3347 btrfs_tree_lock(buf);
3348 clean_tree_block(trans, root, buf);
3349 btrfs_set_buffer_uptodate(buf);
3350 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3351 set_extent_dirty(&root->dirty_log_pages, buf->start,
3352 buf->start + buf->len - 1, GFP_NOFS);
3353 } else {
3354 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3355 buf->start + buf->len - 1, GFP_NOFS);
3356 }
3357 trans->blocks_used++;
3358 return buf;
3359}
3360
3361/*
3362 * helper function to allocate a block for a given tree
3363 * returns the tree buffer or NULL.
3364 */
3365struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3366 struct btrfs_root *root,
3367 u32 blocksize, u64 parent,
3368 u64 root_objectid,
3369 u64 ref_generation,
3370 int level,
3371 u64 hint,
3372 u64 empty_size)
3373{
3374 struct btrfs_key ins;
3375 int ret;
3376 struct extent_buffer *buf;
3377
3378 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3379 root_objectid, ref_generation, level,
3380 empty_size, hint, (u64)-1, &ins, 0);
3381 if (ret) {
3382 BUG_ON(ret > 0);
3383 return ERR_PTR(ret);
3384 }
3385
3386 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3387 return buf;
3388}
3389
3390int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3391 struct btrfs_root *root, struct extent_buffer *leaf)
3392{
3393 u64 leaf_owner;
3394 u64 leaf_generation;
3395 struct btrfs_key key;
3396 struct btrfs_file_extent_item *fi;
3397 int i;
3398 int nritems;
3399 int ret;
3400
3401 BUG_ON(!btrfs_is_leaf(leaf));
3402 nritems = btrfs_header_nritems(leaf);
3403 leaf_owner = btrfs_header_owner(leaf);
3404 leaf_generation = btrfs_header_generation(leaf);
3405
3406 for (i = 0; i < nritems; i++) {
3407 u64 disk_bytenr;
3408 cond_resched();
3409
3410 btrfs_item_key_to_cpu(leaf, &key, i);
3411 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3412 continue;
3413 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3414 if (btrfs_file_extent_type(leaf, fi) ==
3415 BTRFS_FILE_EXTENT_INLINE)
3416 continue;
3417 /*
3418 * FIXME make sure to insert a trans record that
3419 * repeats the snapshot del on crash
3420 */
3421 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3422 if (disk_bytenr == 0)
3423 continue;
3424
3425 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3426 btrfs_file_extent_disk_num_bytes(leaf, fi),
3427 leaf->start, leaf_owner, leaf_generation,
3428 key.objectid, 0);
3429 BUG_ON(ret);
3430
3431 atomic_inc(&root->fs_info->throttle_gen);
3432 wake_up(&root->fs_info->transaction_throttle);
3433 cond_resched();
3434 }
3435 return 0;
3436}
3437
3438static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_leaf_ref *ref)
3441{
3442 int i;
3443 int ret;
3444 struct btrfs_extent_info *info = ref->extents;
3445
3446 for (i = 0; i < ref->nritems; i++) {
3447 ret = __btrfs_free_extent(trans, root, info->bytenr,
3448 info->num_bytes, ref->bytenr,
3449 ref->owner, ref->generation,
3450 info->objectid, 0);
3451
3452 atomic_inc(&root->fs_info->throttle_gen);
3453 wake_up(&root->fs_info->transaction_throttle);
3454 cond_resched();
3455
3456 BUG_ON(ret);
3457 info++;
3458 }
3459
3460 return 0;
3461}
3462
3463static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3464 u64 len, u32 *refs)
3465{
3466 int ret;
3467
3468 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3469 BUG_ON(ret);
3470
3471#if 0 /* some debugging code in case we see problems here */
3472 /* if the refs count is one, it won't get increased again. But
3473 * if the ref count is > 1, someone may be decreasing it at
3474 * the same time we are.
3475 */
3476 if (*refs != 1) {
3477 struct extent_buffer *eb = NULL;
3478 eb = btrfs_find_create_tree_block(root, start, len);
3479 if (eb)
3480 btrfs_tree_lock(eb);
3481
3482 mutex_lock(&root->fs_info->alloc_mutex);
3483 ret = lookup_extent_ref(NULL, root, start, len, refs);
3484 BUG_ON(ret);
3485 mutex_unlock(&root->fs_info->alloc_mutex);
3486
3487 if (eb) {
3488 btrfs_tree_unlock(eb);
3489 free_extent_buffer(eb);
3490 }
3491 if (*refs == 1) {
3492 printk(KERN_ERR "btrfs block %llu went down to one "
3493 "during drop_snap\n", (unsigned long long)start);
3494 }
3495
3496 }
3497#endif
3498
3499 cond_resched();
3500 return ret;
3501}
3502
3503/*
3504 * helper function for drop_snapshot, this walks down the tree dropping ref
3505 * counts as it goes.
 *
 * Starting at path->nodes[*level], the walk follows path->slots[] downwards.
 * Children whose reference count is not 1 only lose one reference and are
 * skipped; children with a single reference are descended into.  At level 0
 * the data extents of the leaf are dropped.  Once the walk cannot go any
 * deeper, the block at *level is freed, its buffer is released from the
 * path and *level is incremented so that the caller can continue with
 * walk_up_tree().
 *
 * Always returns 0; helper failures trigger BUG_ON().
3506 */
3507static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3508 struct btrfs_root *root,
3509 struct btrfs_path *path, int *level)
3510{
3511 u64 root_owner;
3512 u64 root_gen;
3513 u64 bytenr;
3514 u64 ptr_gen;
3515 struct extent_buffer *next;
3516 struct extent_buffer *cur;
3517 struct extent_buffer *parent;
3518 struct btrfs_leaf_ref *ref;
3519 u32 blocksize;
3520 int ret;
3521 u32 refs;
3522
3523 WARN_ON(*level < 0);
3524 WARN_ON(*level >= BTRFS_MAX_LEVEL);
	/* a starting block that is still shared only loses our reference */
3525 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3526 path->nodes[*level]->len, &refs);
3527 BUG_ON(ret);
3528 if (refs > 1)
3529 goto out;
3530
3531 /*
3532 * walk down to the last node level and free all the leaves
3533 */
3534 while (*level >= 0) {
3535 WARN_ON(*level < 0);
3536 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3537 cur = path->nodes[*level];
3538
3539 if (btrfs_header_level(cur) != *level)
3540 WARN_ON(1);
3541
	/* every pointer of this node has been handled */
3542 if (path->slots[*level] >=
3543 btrfs_header_nritems(cur))
3544 break;
3545 if (*level == 0) {
3546 ret = btrfs_drop_leaf_ref(trans, root, cur);
3547 BUG_ON(ret);
3548 break;
3549 }
3550 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3551 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3552 blocksize = btrfs_level_size(root, *level - 1);
3553
3554 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3555 BUG_ON(ret);
	/* child is not singly referenced: drop our ref and move to the next slot */
3556 if (refs != 1) {
3557 parent = path->nodes[*level];
3558 root_owner = btrfs_header_owner(parent);
3559 root_gen = btrfs_header_generation(parent);
3560 path->slots[*level]++;
3561
3562 ret = __btrfs_free_extent(trans, root, bytenr,
3563 blocksize, parent->start,
3564 root_owner, root_gen,
3565 *level - 1, 1);
3566 BUG_ON(ret);
3567
3568 atomic_inc(&root->fs_info->throttle_gen);
3569 wake_up(&root->fs_info->transaction_throttle);
3570 cond_resched();
3571
3572 continue;
3573 }
3574 /*
3575 * at this point, we have a single ref, and since the
3576 * only place referencing this extent is a dead root
3577 * the reference count should never go higher.
3578 * So, we don't need to check it again
3579 */
	/*
	 * for a leaf child, use the cached leaf ref when one with a
	 * matching generation exists, instead of reading the leaf
	 */
3580 if (*level == 1) {
3581 ref = btrfs_lookup_leaf_ref(root, bytenr);
3582 if (ref && ref->generation != ptr_gen) {
3583 btrfs_free_leaf_ref(root, ref);
3584 ref = NULL;
3585 }
3586 if (ref) {
3587 ret = cache_drop_leaf_ref(trans, root, ref);
3588 BUG_ON(ret);
3589 btrfs_remove_leaf_ref(root, ref);
3590 btrfs_free_leaf_ref(root, ref);
3591 *level = 0;
3592 break;
3593 }
3594 }
3595 next = btrfs_find_tree_block(root, bytenr, blocksize);
3596 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3597 free_extent_buffer(next);
3598
3599 next = read_tree_block(root, bytenr, blocksize,
3600 ptr_gen);
3601 cond_resched();
3602#if 0
3603 /*
3604 * this is a debugging check and can go away
3605 * the ref should never go all the way down to 1
3606 * at this point
3607 */
3608 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3609 &refs);
3610 BUG_ON(ret);
3611 WARN_ON(refs != 1);
3612#endif
3613 }
	/*
	 * NOTE(review): next is used below without a NULL check; confirm
	 * whether read_tree_block() can fail here.
	 */
3614 WARN_ON(*level <= 0);
3615 if (path->nodes[*level-1])
3616 free_extent_buffer(path->nodes[*level-1]);
3617 path->nodes[*level-1] = next;
3618 *level = btrfs_header_level(next);
3619 path->slots[*level] = 0;
3620 cond_resched();
3621 }
3622out:
3623 WARN_ON(*level < 0);
3624 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3625
	/*
	 * free the block at *level: the tree root is its own parent, any
	 * other block is located through the pointer in the level above
	 */
3626 if (path->nodes[*level] == root->node) {
3627 parent = path->nodes[*level];
3628 bytenr = path->nodes[*level]->start;
3629 } else {
3630 parent = path->nodes[*level + 1];
3631 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3632 }
3633
3634 blocksize = btrfs_level_size(root, *level);
3635 root_owner = btrfs_header_owner(parent);
3636 root_gen = btrfs_header_generation(parent);
3637
3638 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3639 parent->start, root_owner, root_gen,
3640 *level, 1);
3641 free_extent_buffer(path->nodes[*level]);
3642 path->nodes[*level] = NULL;
3643 *level += 1;
3644 BUG_ON(ret);
3645
3646 cond_resched();
3647 return 0;
3648}
3649
3650/*
3651 * helper function for drop_subtree, this function is similar to
3652 * walk_down_tree. The main difference is that it checks reference
3653 * counts while tree blocks are locked.
 *
 * Children with more than one reference lose one reference and are skipped;
 * the others are locked, stored in the path (path->locks[] set) and
 * descended into.  When the walk stops, the block at *level is freed,
 * unlocked if this path holds its lock, released from the path and *level
 * is incremented.
 *
 * Always returns 0; helper failures trigger BUG_ON().
3654 */
3655static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3656 struct btrfs_root *root,
3657 struct btrfs_path *path, int *level)
3658{
3659 struct extent_buffer *next;
3660 struct extent_buffer *cur;
3661 struct extent_buffer *parent;
3662 u64 bytenr;
3663 u64 ptr_gen;
3664 u32 blocksize;
3665 u32 refs;
3666 int ret;
3667
3668 cur = path->nodes[*level];
3669 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3670 &refs);
3671 BUG_ON(ret);
	/* a shared starting block only loses our reference */
3672 if (refs > 1)
3673 goto out;
3674
3675 while (*level >= 0) {
3676 cur = path->nodes[*level];
3677 if (*level == 0) {
3678 ret = btrfs_drop_leaf_ref(trans, root, cur);
3679 BUG_ON(ret);
3680 clean_tree_block(trans, root, cur);
3681 break;
3682 }
	/* all pointers of this node are done */
3683 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3684 clean_tree_block(trans, root, cur);
3685 break;
3686 }
3687
3688 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3689 blocksize = btrfs_level_size(root, *level - 1);
3690 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3691
	/*
	 * NOTE(review): next is locked without a NULL check; confirm
	 * whether read_tree_block() can fail here.
	 */
3692 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3693 btrfs_tree_lock(next);
3694
3695 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3696 &refs);
3697 BUG_ON(ret);
	/* shared child: drop one ref, skip it and let go of the buffer */
3698 if (refs > 1) {
3699 parent = path->nodes[*level];
3700 ret = btrfs_free_extent(trans, root, bytenr,
3701 blocksize, parent->start,
3702 btrfs_header_owner(parent),
3703 btrfs_header_generation(parent),
3704 *level - 1, 1);
3705 BUG_ON(ret);
3706 path->slots[*level]++;
3707 btrfs_tree_unlock(next);
3708 free_extent_buffer(next);
3709 continue;
3710 }
3711
	/* descend; the child stays locked and the path records that */
3712 *level = btrfs_header_level(next);
3713 path->nodes[*level] = next;
3714 path->slots[*level] = 0;
3715 path->locks[*level] = 1;
3716 cond_resched();
3717 }
3718out:
3719 parent = path->nodes[*level + 1];
3720 bytenr = path->nodes[*level]->start;
3721 blocksize = path->nodes[*level]->len;
3722
3723 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3724 parent->start, btrfs_header_owner(parent),
3725 btrfs_header_generation(parent), *level, 1);
3726 BUG_ON(ret);
3727
3728 if (path->locks[*level]) {
3729 btrfs_tree_unlock(path->nodes[*level]);
3730 path->locks[*level] = 0;
3731 }
3732 free_extent_buffer(path->nodes[*level]);
3733 path->nodes[*level] = NULL;
3734 *level += 1;
3735 cond_resched();
3736 return 0;
3737}
3738
3739/*
3740 * helper for dropping snapshots. This walks back up the tree in the path
3741 * to find the first node higher up where we haven't yet gone through
3742 * all the slots
 *
 * Levels from *level up to (but not including) max_level are examined as
 * long as the path holds a node for them.  When a node with slots left is
 * found, its slot is advanced, *level is set to that level, and the key of
 * the new slot plus the level are stored in the root item's drop_progress
 * and drop_level.  Fully processed nodes are cleaned, freed, unlocked if
 * the path holds their lock, and released from the path.
 *
 * Returns 0 when there is more work at *level, 1 when the walk ran out of
 * nodes.
3743 */
3744static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root,
3746 struct btrfs_path *path,
3747 int *level, int max_level)
3748{
3749 u64 root_owner;
3750 u64 root_gen;
3751 struct btrfs_root_item *root_item = &root->root_item;
3752 int i;
3753 int slot;
3754 int ret;
3755
3756 for (i = *level; i < max_level && path->nodes[i]; i++) {
3757 slot = path->slots[i];
3758 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3759 struct extent_buffer *node;
3760 struct btrfs_disk_key disk_key;
3761 node = path->nodes[i];
3762 path->slots[i]++;
3763 *level = i;
3764 WARN_ON(*level == 0);
	/* remember where to resume, see btrfs_drop_snapshot() */
3765 btrfs_node_key(node, &disk_key, path->slots[i]);
3766 memcpy(&root_item->drop_progress,
3767 &disk_key, sizeof(disk_key));
3768 root_item->drop_level = i;
3769 return 0;
3770 } else {
3771 struct extent_buffer *parent;
	/* the tree root is treated as its own parent */
3772 if (path->nodes[*level] == root->node)
3773 parent = path->nodes[*level];
3774 else
3775 parent = path->nodes[*level + 1];
3776
3777 root_owner = btrfs_header_owner(parent);
3778 root_gen = btrfs_header_generation(parent);
3779
3780 clean_tree_block(trans, root, path->nodes[*level]);
3781 ret = btrfs_free_extent(trans, root,
3782 path->nodes[*level]->start,
3783 path->nodes[*level]->len,
3784 parent->start, root_owner,
3785 root_gen, *level, 1);
3786 BUG_ON(ret);
3787 if (path->locks[*level]) {
3788 btrfs_tree_unlock(path->nodes[*level]);
3789 path->locks[*level] = 0;
3790 }
3791 free_extent_buffer(path->nodes[*level]);
3792 path->nodes[*level] = NULL;
3793 *level = i + 1;
3794 }
3795 }
3796 return 1;
3797}
3798
3799/*
3800 * drop the reference count on the tree rooted at 'snap'. This traverses
3801 * the tree freeing any blocks that have a ref count of zero after being
3802 * decremented.
 *
 * The caller is expected to hold fs_info->drop_mutex (checked with
 * WARN_ON).  When the root item's drop_progress key has objectid 0 the walk
 * starts at the root node; otherwise it resumes at the recorded key and
 * drop_level.  The walk alternates walk_down_tree() and walk_up_tree()
 * until one of them returns a positive value.
 *
 * Returns 0 when the walk finished, -EAGAIN when it stopped because the
 * transaction is committing, or a negative error from btrfs_search_slot().
3803 */
3804int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3805 *root)
3806{
3807 int ret = 0;
3808 int wret;
3809 int level;
3810 struct btrfs_path *path;
3811 int i;
3812 int orig_level;
3813 struct btrfs_root_item *root_item = &root->root_item;
3814
3815 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3816 path = btrfs_alloc_path();
3817 BUG_ON(!path);
3818
3819 level = btrfs_header_level(root->node);
3820 orig_level = level;
3821 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
	/* fresh start: take our own reference on the root node */
3822 path->nodes[level] = root->node;
3823 extent_buffer_get(root->node);
3824 path->slots[level] = 0;
3825 } else {
3826 struct btrfs_key key;
3827 struct btrfs_disk_key found_key;
3828 struct extent_buffer *node;
3829
	/* resume: search down to drop_level for the saved key */
3830 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3831 level = root_item->drop_level;
3832 path->lowest_level = level;
3833 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3834 if (wret < 0) {
3835 ret = wret;
3836 goto out;
3837 }
3838 node = path->nodes[level];
3839 btrfs_node_key(node, &found_key, path->slots[level]);
3840 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3841 sizeof(found_key)));
3842 /*
3843 * unlock our path, this is safe because only this
3844 * function is allowed to delete this snapshot
3845 */
3846 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3847 if (path->nodes[i] && path->locks[i]) {
3848 path->locks[i] = 0;
3849 btrfs_tree_unlock(path->nodes[i]);
3850 }
3851 }
3852 }
3853 while (1) {
3854 wret = walk_down_tree(trans, root, path, &level);
3855 if (wret > 0)
3856 break;
3857 if (wret < 0)
3858 ret = wret;
3859
3860 wret = walk_up_tree(trans, root, path, &level,
3861 BTRFS_MAX_LEVEL);
3862 if (wret > 0)
3863 break;
3864 if (wret < 0)
3865 ret = wret;
	/* stop early when the transaction is committing */
3866 if (trans->transaction->in_commit) {
3867 ret = -EAGAIN;
3868 break;
3869 }
3870 atomic_inc(&root->fs_info->throttle_gen);
3871 wake_up(&root->fs_info->transaction_throttle);
3872 }
	/* release whatever buffers the walk left in the path */
3873 for (i = 0; i <= orig_level; i++) {
3874 if (path->nodes[i]) {
3875 free_extent_buffer(path->nodes[i]);
3876 path->nodes[i] = NULL;
3877 }
3878 }
3879out:
3880 btrfs_free_path(path);
3881 return ret;
3882}
3883
3884int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3885 struct btrfs_root *root,
3886 struct extent_buffer *node,
3887 struct extent_buffer *parent)
3888{
3889 struct btrfs_path *path;
3890 int level;
3891 int parent_level;
3892 int ret = 0;
3893 int wret;
3894
3895 path = btrfs_alloc_path();
3896 BUG_ON(!path);
3897
3898 BUG_ON(!btrfs_tree_locked(parent));
3899 parent_level = btrfs_header_level(parent);
3900 extent_buffer_get(parent);
3901 path->nodes[parent_level] = parent;
3902 path->slots[parent_level] = btrfs_header_nritems(parent);
3903
3904 BUG_ON(!btrfs_tree_locked(node));
3905 level = btrfs_header_level(node);
3906 extent_buffer_get(node);
3907 path->nodes[level] = node;
3908 path->slots[level] = 0;
3909
3910 while (1) {
3911 wret = walk_down_subtree(trans, root, path, &level);
3912 if (wret < 0)
3913 ret = wret;
3914 if (wret != 0)
3915 break;
3916
3917 wret = walk_up_tree(trans, root, path, &level, parent_level);
3918 if (wret < 0)
3919 ret = wret;
3920 if (wret != 0)
3921 break;
3922 }
3923
3924 btrfs_free_path(path);
3925 return ret;
3926}
3927
/*
 * Return the last index of a readahead window of nr entries that begins at
 * start, clamped so that it never goes past last.
 */
static unsigned long calc_ra(unsigned long start, unsigned long last,
			     unsigned long nr)
{
	unsigned long window_end = start + nr - 1;

	return last < window_end ? last : window_end;
}
3933
3934static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3935 u64 len)
3936{
3937 u64 page_start;
3938 u64 page_end;
3939 unsigned long first_index;
3940 unsigned long last_index;
3941 unsigned long i;
3942 struct page *page;
3943 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3944 struct file_ra_state *ra;
3945 struct btrfs_ordered_extent *ordered;
3946 unsigned int total_read = 0;
3947 unsigned int total_dirty = 0;
3948 int ret = 0;
3949
3950 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3951
3952 mutex_lock(&inode->i_mutex);
3953 first_index = start >> PAGE_CACHE_SHIFT;
3954 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3955
3956 /* make sure the dirty trick played by the caller work */
3957 ret = invalidate_inode_pages2_range(inode->i_mapping,
3958 first_index, last_index);
3959 if (ret)
3960 goto out_unlock;
3961
3962 file_ra_state_init(ra, inode->i_mapping);
3963
3964 for (i = first_index ; i <= last_index; i++) {
3965 if (total_read % ra->ra_pages == 0) {
3966 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3967 calc_ra(i, last_index, ra->ra_pages));
3968 }
3969 total_read++;
3970again:
3971 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3972 BUG_ON(1);
3973 page = grab_cache_page(inode->i_mapping, i);
3974 if (!page) {
3975 ret = -ENOMEM;
3976 goto out_unlock;
3977 }
3978 if (!PageUptodate(page)) {
3979 btrfs_readpage(NULL, page);
3980 lock_page(page);
3981 if (!PageUptodate(page)) {
3982 unlock_page(page);
3983 page_cache_release(page);
3984 ret = -EIO;
3985 goto out_unlock;
3986 }
3987 }
3988 wait_on_page_writeback(page);
3989
3990 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3991 page_end = page_start + PAGE_CACHE_SIZE - 1;
3992 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993
3994 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3995 if (ordered) {
3996 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3997 unlock_page(page);
3998 page_cache_release(page);
3999 btrfs_start_ordered_extent(inode, ordered, 1);
4000 btrfs_put_ordered_extent(ordered);
4001 goto again;
4002 }
4003 set_page_extent_mapped(page);
4004
4005 if (i == first_index)
4006 set_extent_bits(io_tree, page_start, page_end,
4007 EXTENT_BOUNDARY, GFP_NOFS);
4008 btrfs_set_extent_delalloc(inode, page_start, page_end);
4009
4010 set_page_dirty(page);
4011 total_dirty++;
4012
4013 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4014 unlock_page(page);
4015 page_cache_release(page);
4016 }
4017
4018out_unlock:
4019 kfree(ra);
4020 mutex_unlock(&inode->i_mutex);
4021 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4022 return ret;
4023}
4024
4025static noinline int relocate_data_extent(struct inode *reloc_inode,
4026 struct btrfs_key *extent_key,
4027 u64 offset)
4028{
4029 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4030 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4031 struct extent_map *em;
4032 u64 start = extent_key->objectid - offset;
4033 u64 end = start + extent_key->offset - 1;
4034
4035 em = alloc_extent_map(GFP_NOFS);
4036 BUG_ON(!em || IS_ERR(em));
4037
4038 em->start = start;
4039 em->len = extent_key->offset;
4040 em->block_len = extent_key->offset;
4041 em->block_start = extent_key->objectid;
4042 em->bdev = root->fs_info->fs_devices->latest_bdev;
4043 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4044
4045 /* setup extent map to cheat btrfs_readpage */
4046 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4047 while (1) {
4048 int ret;
4049 spin_lock(&em_tree->lock);
4050 ret = add_extent_mapping(em_tree, em);
4051 spin_unlock(&em_tree->lock);
4052 if (ret != -EEXIST) {
4053 free_extent_map(em);
4054 break;
4055 }
4056 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4057 }
4058 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4059
4060 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4061}
4062
/*
 * State kept by __next_ref_path() while it follows the back references of
 * an extent from one tree level to the next.
 */
4063struct btrfs_ref_path {
	/* bytenr looked up when the walk is below level 0 */
4064 u64 extent_start;
	/* bytenr of the block at each level; 0 means "not set" */
4065 u64 nodes[BTRFS_MAX_LEVEL];
	/* root id read from the extent ref by __next_ref_path() */
4066 u64 root_objectid;
4067 u64 root_generation;
	/* owner objectid and ref count taken from the ref at lowest_level */
4068 u64 owner_objectid;
4069 u32 num_refs;
	/* all three levels are reset to -1 on the first walk */
4070 int lowest_level;
4071 int current_level;
4072 int shared_level;
4073
	/* NOTE(review): node_keys/new_nodes are not used in the visible code */
4074 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4075 u64 new_nodes[BTRFS_MAX_LEVEL];
4076};
4077
/*
 * In-memory description of one on-disk extent.
 * NOTE(review): the field names suggest this mirrors the fields of a
 * btrfs_file_extent_item; no user is visible here, confirm against the
 * code that fills it in.
 */
4078struct disk_extent {
4079 u64 ram_bytes;
4080 u64 disk_bytenr;
4081 u64 disk_num_bytes;
4082 u64 offset;
4083 u64 num_bytes;
4084 u8 compression;
4085 u8 encryption;
4086 u16 other_encoding;
4087};
4088
4089static int is_cowonly_root(u64 root_objectid)
4090{
4091 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4092 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4093 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4094 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4095 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4096 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4097 return 1;
4098 return 0;
4099}
4100
4101static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4102 struct btrfs_root *extent_root,
4103 struct btrfs_ref_path *ref_path,
4104 int first_time)
4105{
4106 struct extent_buffer *leaf;
4107 struct btrfs_path *path;
4108 struct btrfs_extent_ref *ref;
4109 struct btrfs_key key;
4110 struct btrfs_key found_key;
4111 u64 bytenr;
4112 u32 nritems;
4113 int level;
4114 int ret = 1;
4115
4116 path = btrfs_alloc_path();
4117 if (!path)
4118 return -ENOMEM;
4119
4120 if (first_time) {
4121 ref_path->lowest_level = -1;
4122 ref_path->current_level = -1;
4123 ref_path->shared_level = -1;
4124 goto walk_up;
4125 }
4126walk_down:
4127 level = ref_path->current_level - 1;
4128 while (level >= -1) {
4129 u64 parent;
4130 if (level < ref_path->lowest_level)
4131 break;
4132
4133 if (level >= 0)
4134 bytenr = ref_path->nodes[level];
4135 else
4136 bytenr = ref_path->extent_start;
4137 BUG_ON(bytenr == 0);
4138
4139 parent = ref_path->nodes[level + 1];
4140 ref_path->nodes[level + 1] = 0;
4141 ref_path->current_level = level;
4142 BUG_ON(parent == 0);
4143
4144 key.objectid = bytenr;
4145 key.offset = parent + 1;
4146 key.type = BTRFS_EXTENT_REF_KEY;
4147
4148 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4149 if (ret < 0)
4150 goto out;
4151 BUG_ON(ret == 0);
4152
4153 leaf = path->nodes[0];
4154 nritems = btrfs_header_nritems(leaf);
4155 if (path->slots[0] >= nritems) {
4156 ret = btrfs_next_leaf(extent_root, path);
4157 if (ret < 0)
4158 goto out;
4159 if (ret > 0)
4160 goto next;
4161 leaf = path->nodes[0];
4162 }
4163
4164 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4165 if (found_key.objectid == bytenr &&
4166 found_key.type == BTRFS_EXTENT_REF_KEY) {
4167 if (level < ref_path->shared_level)
4168 ref_path->shared_level = level;
4169 goto found;
4170 }
4171next:
4172 level--;
4173 btrfs_release_path(extent_root, path);
4174 cond_resched();
4175 }
4176 /* reached lowest level */
4177 ret = 1;
4178 goto out;
4179walk_up:
4180 level = ref_path->current_level;
4181 while (level < BTRFS_MAX_LEVEL - 1) {
4182 u64 ref_objectid;
4183
4184 if (level >= 0)
4185 bytenr = ref_path->nodes[level];
4186 else
4187 bytenr = ref_path->extent_start;
4188
4189 BUG_ON(bytenr == 0);
4190
4191 key.objectid = bytenr;
4192 key.offset = 0;
4193 key.type = BTRFS_EXTENT_REF_KEY;
4194
4195 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4196 if (ret < 0)
4197 goto out;
4198
4199 leaf = path->nodes[0];
4200 nritems = btrfs_header_nritems(leaf);
4201 if (path->slots[0] >= nritems) {
4202 ret = btrfs_next_leaf(extent_root, path);
4203 if (ret < 0)
4204 goto out;
4205 if (ret > 0) {
4206 /* the extent was freed by someone */
4207 if (ref_path->lowest_level == level)
4208 goto out;
4209 btrfs_release_path(extent_root, path);
4210 goto walk_down;
4211 }
4212 leaf = path->nodes[0];
4213 }
4214
4215 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4216 if (found_key.objectid != bytenr ||
4217 found_key.type != BTRFS_EXTENT_REF_KEY) {
4218 /* the extent was freed by someone */
4219 if (ref_path->lowest_level == level) {
4220 ret = 1;
4221 goto out;
4222 }
4223 btrfs_release_path(extent_root, path);
4224 goto walk_down;
4225 }
4226found:
4227 ref = btrfs_item_ptr(leaf, path->slots[0],
4228 struct btrfs_extent_ref);
4229 ref_objectid = btrfs_ref_objectid(leaf, ref);
4230 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4231 if (first_time) {
4232 level = (int)ref_objectid;
4233 BUG_ON(level >= BTRFS_MAX_LEVEL);
4234 ref_path->lowest_level = level;
4235 ref_path->current_level = level;
4236 ref_path->nodes[level] = bytenr;
4237 } else {
4238 WARN_ON(ref_objectid != level);
4239 }
4240 } else {
4241 WARN_ON(level != -1);
4242 }
4243 first_time = 0;
4244
4245 if (ref_path->lowest_level == level) {
4246 ref_path->owner_objectid = ref_objectid;
4247 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4248 }
4249
4250 /*
4251 * the block is tree root or the block isn't in reference
4252 * counted tree.
4253 */
4254 if (found_key.objectid == found_key.offset ||
4255 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4256 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4257 ref_path->root_generation =
4258 btrfs_ref_generation(leaf, ref);
4259 if (level < 0) {
4260 /* special reference from the tree log */
4261 ref_path->nodes[0] = found_key.offset;
4262 ref_path->current_level = 0;
4263 }
4264 ret = 0;
4265 goto out;
4266 }
4267
4268 level++;
4269 BUG_ON(ref_path->nodes[level] != 0);
4270 ref_path->nodes[level] = found_key.offset;
4271 ref_path->current_level = level;
4272
4273 /*
4274 * the reference was created in the running transaction,
4275 * no need to continue walking up.
4276 */
4277 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4278 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4279 ref_path->root_generation =
4280 btrfs_ref_generation(leaf, ref);
4281 ret = 0;
4282 goto out;
4283 }
4284
4285 btrfs_release_path(extent_root, path);
4286 cond_resched();
4287 }
4288 /* reached max tree level, but no tree root found. */
4289 BUG();
4290out:
4291 btrfs_free_path(path);
4292 return ret;
4293}
4294
4295static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4296 struct btrfs_root *extent_root,
4297 struct btrfs_ref_path *ref_path,
4298 u64 extent_start)
4299{
4300 memset(ref_path, 0, sizeof(*ref_path));
4301 ref_path->extent_start = extent_start;
4302
4303 return __next_ref_path(trans, extent_root, ref_path, 1);
4304}
4305
/*
 * Advance @ref_path to the next reference path of the extent it was set up
 * for.  Thin wrapper that calls __next_ref_path() with first_time == 0 and
 * hands its return value back unchanged.
 */
static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct btrfs_ref_path *ref_path)
{
	int result;

	result = __next_ref_path(trans, extent_root, ref_path, 0);
	return result;
}
4312
4313static noinline int get_new_locations(struct inode *reloc_inode,
4314 struct btrfs_key *extent_key,
4315 u64 offset, int no_fragment,
4316 struct disk_extent **extents,
4317 int *nr_extents)
4318{
4319 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4320 struct btrfs_path *path;
4321 struct btrfs_file_extent_item *fi;
4322 struct extent_buffer *leaf;
4323 struct disk_extent *exts = *extents;
4324 struct btrfs_key found_key;
4325 u64 cur_pos;
4326 u64 last_byte;
4327 u32 nritems;
4328 int nr = 0;
4329 int max = *nr_extents;
4330 int ret;
4331
4332 WARN_ON(!no_fragment && *extents);
4333 if (!exts) {
4334 max = 1;
4335 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4336 if (!exts)
4337 return -ENOMEM;
4338 }
4339
4340 path = btrfs_alloc_path();
4341 BUG_ON(!path);
4342
4343 cur_pos = extent_key->objectid - offset;
4344 last_byte = extent_key->objectid + extent_key->offset;
4345 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4346 cur_pos, 0);
4347 if (ret < 0)
4348 goto out;
4349 if (ret > 0) {
4350 ret = -ENOENT;
4351 goto out;
4352 }
4353
4354 while (1) {
4355 leaf = path->nodes[0];
4356 nritems = btrfs_header_nritems(leaf);
4357 if (path->slots[0] >= nritems) {
4358 ret = btrfs_next_leaf(root, path);
4359 if (ret < 0)
4360 goto out;
4361 if (ret > 0)
4362 break;
4363 leaf = path->nodes[0];
4364 }
4365
4366 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4367 if (found_key.offset != cur_pos ||
4368 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4369 found_key.objectid != reloc_inode->i_ino)
4370 break;
4371
4372 fi = btrfs_item_ptr(leaf, path->slots[0],
4373 struct btrfs_file_extent_item);
4374 if (btrfs_file_extent_type(leaf, fi) !=
4375 BTRFS_FILE_EXTENT_REG ||
4376 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4377 break;
4378
4379 if (nr == max) {
4380 struct disk_extent *old = exts;
4381 max *= 2;
4382 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4383 memcpy(exts, old, sizeof(*exts) * nr);
4384 if (old != *extents)
4385 kfree(old);
4386 }
4387
4388 exts[nr].disk_bytenr =
4389 btrfs_file_extent_disk_bytenr(leaf, fi);
4390 exts[nr].disk_num_bytes =
4391 btrfs_file_extent_disk_num_bytes(leaf, fi);
4392 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4393 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4394 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4395 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4396 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4397 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4398 fi);
4399 BUG_ON(exts[nr].offset > 0);
4400 BUG_ON(exts[nr].compression || exts[nr].encryption);
4401 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4402
4403 cur_pos += exts[nr].num_bytes;
4404 nr++;
4405
4406 if (cur_pos + offset >= last_byte)
4407 break;
4408
4409 if (no_fragment) {
4410 ret = 1;
4411 goto out;
4412 }
4413 path->slots[0]++;
4414 }
4415
4416 BUG_ON(cur_pos + offset > last_byte);
4417 if (cur_pos + offset < last_byte) {
4418 ret = -ENOENT;
4419 goto out;
4420 }
4421 ret = 0;
4422out:
4423 btrfs_free_path(path);
4424 if (ret) {
4425 if (exts != *extents)
4426 kfree(exts);
4427 } else {
4428 *extents = exts;
4429 *nr_extents = nr;
4430 }
4431 return ret;
4432}
4433
/*
 * Scan file extent items in @root, starting from @leaf_key, and repoint every
 * REG/PREALLOC item whose disk bytenr equals extent_key->objectid at
 * new_extents[0]: the item is updated in place, a reference is added to the
 * new extent and one is dropped from the old extent.
 *
 * Only nr_extents == 1 is handled; any other value hits BUG_ON(1) (the
 * multi-extent code is compiled out under #if 0).
 *
 * The owning inode is looked up and its i_mutex taken with mutex_trylock();
 * the file range is locked in the inode's io_tree before the item is
 * modified, and the tree is searched again after the range lock is taken.
 *
 * Returns 0 when the scan finishes, or a negative value from
 * btrfs_search_slot()/btrfs_next_leaf().
 */
4434static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4435 struct btrfs_root *root,
4436 struct btrfs_path *path,
4437 struct btrfs_key *extent_key,
4438 struct btrfs_key *leaf_key,
4439 struct btrfs_ref_path *ref_path,
4440 struct disk_extent *new_extents,
4441 int nr_extents)
4442{
4443 struct extent_buffer *leaf;
4444 struct btrfs_file_extent_item *fi;
4445 struct inode *inode = NULL;
4446 struct btrfs_key key;
4447 u64 lock_start = 0;
4448 u64 lock_end = 0;
4449 u64 num_bytes;
4450 u64 ext_offset;
4451 u64 first_pos;
4452 u32 nritems;
4453 int nr_scaned = 0;
4454 int extent_locked = 0;
4455 int extent_type;
4456 int ret;
4457
4458 memcpy(&key, leaf_key, sizeof(key));
4459 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
/*
 * with a single owner, never start the scan before that owner's first
 * possible EXTENT_DATA key
 */
4460 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4461 if (key.objectid < ref_path->owner_objectid ||
4462 (key.objectid == ref_path->owner_objectid &&
4463 key.type < BTRFS_EXTENT_DATA_KEY)) {
4464 key.objectid = ref_path->owner_objectid;
4465 key.type = BTRFS_EXTENT_DATA_KEY;
4466 key.offset = 0;
4467 }
4468 }
4469
4470 while (1) {
4471 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4472 if (ret < 0)
4473 goto out;
4474
4475 leaf = path->nodes[0];
4476 nritems = btrfs_header_nritems(leaf);
/* re-entered via goto with ret == 1 after stepping to the next slot */
4477next:
4478 if (extent_locked && ret > 0) {
4479 /*
4480 * the file extent item was modified by someone
4481 * before the extent got locked.
4482 */
4483 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4484 lock_end, GFP_NOFS);
4485 extent_locked = 0;
4486 }
4487
4488 if (path->slots[0] >= nritems) {
/* give up the third time the end of a leaf is reached */
4489 if (++nr_scaned > 2)
4490 break;
4491
4492 BUG_ON(extent_locked);
4493 ret = btrfs_next_leaf(root, path);
4494 if (ret < 0)
4495 goto out;
4496 if (ret > 0)
4497 break;
4498 leaf = path->nodes[0];
4499 nritems = btrfs_header_nritems(leaf);
4500 }
4501
4502 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4503
4504 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4505 if ((key.objectid > ref_path->owner_objectid) ||
4506 (key.objectid == ref_path->owner_objectid &&
4507 key.type > BTRFS_EXTENT_DATA_KEY) ||
4508 (key.offset >= first_pos + extent_key->offset))
4509 break;
4510 }
4511
/* moved on to a different inode: drop the one currently held and retry */
4512 if (inode && key.objectid != inode->i_ino) {
4513 BUG_ON(extent_locked);
4514 btrfs_release_path(root, path);
4515 mutex_unlock(&inode->i_mutex);
4516 iput(inode);
4517 inode = NULL;
4518 continue;
4519 }
4520
4521 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4522 path->slots[0]++;
4523 ret = 1;
4524 goto next;
4525 }
4526 fi = btrfs_item_ptr(leaf, path->slots[0],
4527 struct btrfs_file_extent_item);
4528 extent_type = btrfs_file_extent_type(leaf, fi);
/* skip items that are not REG/PREALLOC or do not point at the old extent */
4529 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4530 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4531 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4532 extent_key->objectid)) {
4533 path->slots[0]++;
4534 ret = 1;
4535 goto next;
4536 }
4537
4538 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4539 ext_offset = btrfs_file_extent_offset(leaf, fi);
4540
4541 if (first_pos > key.offset - ext_offset)
4542 first_pos = key.offset - ext_offset;
4543
4544 if (!extent_locked) {
4545 lock_start = key.offset;
4546 lock_end = lock_start + num_bytes - 1;
4547 } else {
/* the locked range no longer covers this item: unlock it */
4548 if (lock_start > key.offset ||
4549 lock_end + 1 < key.offset + num_bytes) {
4550 unlock_extent(&BTRFS_I(inode)->io_tree,
4551 lock_start, lock_end, GFP_NOFS);
4552 extent_locked = 0;
4553 }
4554 }
4555
4556 if (!inode) {
4557 btrfs_release_path(root, path);
4558
/*
 * NOTE(review): the result of btrfs_iget_locked() is dereferenced
 * below without a NULL check - confirm it cannot return NULL.
 */
4559 inode = btrfs_iget_locked(root->fs_info->sb,
4560 key.objectid, root);
4561 if (inode->i_state & I_NEW) {
4562 BTRFS_I(inode)->root = root;
4563 BTRFS_I(inode)->location.objectid =
4564 key.objectid;
4565 BTRFS_I(inode)->location.type =
4566 BTRFS_INODE_ITEM_KEY;
4567 BTRFS_I(inode)->location.offset = 0;
4568 btrfs_read_locked_inode(inode);
4569 unlock_new_inode(inode);
4570 }
4571 /*
4572 * some code call btrfs_commit_transaction while
4573 * holding the i_mutex, so we can't use mutex_lock
4574 * here.
4575 */
4576 if (is_bad_inode(inode) ||
4577 !mutex_trylock(&inode->i_mutex)) {
4578 iput(inode);
4579 inode = NULL;
4580 key.offset = (u64)-1;
4581 goto skip;
4582 }
4583 }
4584
4585 if (!extent_locked) {
4586 struct btrfs_ordered_extent *ordered;
4587
4588 btrfs_release_path(root, path);
4589
4590 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4591 lock_end, GFP_NOFS);
4592 ordered = btrfs_lookup_first_ordered_extent(inode,
4593 lock_end);
/* an ordered extent overlaps the range: start it and skip this item */
4594 if (ordered &&
4595 ordered->file_offset <= lock_end &&
4596 ordered->file_offset + ordered->len > lock_start) {
4597 unlock_extent(&BTRFS_I(inode)->io_tree,
4598 lock_start, lock_end, GFP_NOFS);
4599 btrfs_start_ordered_extent(inode, ordered, 1);
4600 btrfs_put_ordered_extent(ordered);
4601 key.offset += num_bytes;
4602 goto skip;
4603 }
4604 if (ordered)
4605 btrfs_put_ordered_extent(ordered);
4606
/* the path was released above, so search again with the range locked */
4607 extent_locked = 1;
4608 continue;
4609 }
4610
4611 if (nr_extents == 1) {
4612 /* update extent pointer in place */
4613 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4614 new_extents[0].disk_bytenr);
4615 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4616 new_extents[0].disk_num_bytes);
4617 btrfs_mark_buffer_dirty(leaf);
4618
4619 btrfs_drop_extent_cache(inode, key.offset,
4620 key.offset + num_bytes - 1, 0);
4621
4622 ret = btrfs_inc_extent_ref(trans, root,
4623 new_extents[0].disk_bytenr,
4624 new_extents[0].disk_num_bytes,
4625 leaf->start,
4626 root->root_key.objectid,
4627 trans->transid,
4628 key.objectid);
4629 BUG_ON(ret);
4630
4631 ret = btrfs_free_extent(trans, root,
4632 extent_key->objectid,
4633 extent_key->offset,
4634 leaf->start,
4635 btrfs_header_owner(leaf),
4636 btrfs_header_generation(leaf),
4637 key.objectid, 0);
4638 BUG_ON(ret);
4639
4640 btrfs_release_path(root, path);
4641 key.offset += num_bytes;
4642 } else {
4643 BUG_ON(1);
4644#if 0
4645 u64 alloc_hint;
4646 u64 extent_len;
4647 int i;
4648 /*
4649 * drop old extent pointer at first, then insert the
4650 * new pointers one by one
4651 */
4652 btrfs_release_path(root, path);
4653 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4654 key.offset + num_bytes,
4655 key.offset, &alloc_hint);
4656 BUG_ON(ret);
4657
4658 for (i = 0; i < nr_extents; i++) {
4659 if (ext_offset >= new_extents[i].num_bytes) {
4660 ext_offset -= new_extents[i].num_bytes;
4661 continue;
4662 }
4663 extent_len = min(new_extents[i].num_bytes -
4664 ext_offset, num_bytes);
4665
4666 ret = btrfs_insert_empty_item(trans, root,
4667 path, &key,
4668 sizeof(*fi));
4669 BUG_ON(ret);
4670
4671 leaf = path->nodes[0];
4672 fi = btrfs_item_ptr(leaf, path->slots[0],
4673 struct btrfs_file_extent_item);
4674 btrfs_set_file_extent_generation(leaf, fi,
4675 trans->transid);
4676 btrfs_set_file_extent_type(leaf, fi,
4677 BTRFS_FILE_EXTENT_REG);
4678 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4679 new_extents[i].disk_bytenr);
4680 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4681 new_extents[i].disk_num_bytes);
4682 btrfs_set_file_extent_ram_bytes(leaf, fi,
4683 new_extents[i].ram_bytes);
4684
4685 btrfs_set_file_extent_compression(leaf, fi,
4686 new_extents[i].compression);
4687 btrfs_set_file_extent_encryption(leaf, fi,
4688 new_extents[i].encryption);
4689 btrfs_set_file_extent_other_encoding(leaf, fi,
4690 new_extents[i].other_encoding);
4691
4692 btrfs_set_file_extent_num_bytes(leaf, fi,
4693 extent_len);
4694 ext_offset += new_extents[i].offset;
4695 btrfs_set_file_extent_offset(leaf, fi,
4696 ext_offset);
4697 btrfs_mark_buffer_dirty(leaf);
4698
4699 btrfs_drop_extent_cache(inode, key.offset,
4700 key.offset + extent_len - 1, 0);
4701
4702 ret = btrfs_inc_extent_ref(trans, root,
4703 new_extents[i].disk_bytenr,
4704 new_extents[i].disk_num_bytes,
4705 leaf->start,
4706 root->root_key.objectid,
4707 trans->transid, key.objectid);
4708 BUG_ON(ret);
4709 btrfs_release_path(root, path);
4710
4711 inode_add_bytes(inode, extent_len);
4712
4713 ext_offset = 0;
4714 num_bytes -= extent_len;
4715 key.offset += extent_len;
4716
4717 if (num_bytes == 0)
4718 break;
4719 }
4720 BUG_ON(i >= nr_extents);
4721#endif
4722 }
4723
4724 if (extent_locked) {
4725 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4726 lock_end, GFP_NOFS);
4727 extent_locked = 0;
4728 }
4729skip:
/* single owner: stop once the scan is past the end of the old extent */
4730 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4731 key.offset >= first_pos + extent_key->offset)
4732 break;
4733
4734 cond_resched();
4735 }
4736 ret = 0;
4737out:
4738 btrfs_release_path(root, path);
/* a non-NULL inode here always has its i_mutex held (trylock succeeded) */
4739 if (inode) {
4740 mutex_unlock(&inode->i_mutex);
4741 if (extent_locked) {
4742 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4743 lock_end, GFP_NOFS);
4744 }
4745 iput(inode);
4746 }
4747 return ret;
4748}
4749
4750int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4751 struct btrfs_root *root,
4752 struct extent_buffer *buf, u64 orig_start)
4753{
4754 int level;
4755 int ret;
4756
4757 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4758 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4759
4760 level = btrfs_header_level(buf);
4761 if (level == 0) {
4762 struct btrfs_leaf_ref *ref;
4763 struct btrfs_leaf_ref *orig_ref;
4764
4765 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4766 if (!orig_ref)
4767 return -ENOENT;
4768
4769 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4770 if (!ref) {
4771 btrfs_free_leaf_ref(root, orig_ref);
4772 return -ENOMEM;
4773 }
4774
4775 ref->nritems = orig_ref->nritems;
4776 memcpy(ref->extents, orig_ref->extents,
4777 sizeof(ref->extents[0]) * ref->nritems);
4778
4779 btrfs_free_leaf_ref(root, orig_ref);
4780
4781 ref->root_gen = trans->transid;
4782 ref->bytenr = buf->start;
4783 ref->owner = btrfs_header_owner(buf);
4784 ref->generation = btrfs_header_generation(buf);
4785 ret = btrfs_add_leaf_ref(root, ref, 0);
4786 WARN_ON(ret);
4787 btrfs_free_leaf_ref(root, ref);
4788 }
4789 return 0;
4790}
4791
4792static noinline int invalidate_extent_cache(struct btrfs_root *root,
4793 struct extent_buffer *leaf,
4794 struct btrfs_block_group_cache *group,
4795 struct btrfs_root *target_root)
4796{
4797 struct btrfs_key key;
4798 struct inode *inode = NULL;
4799 struct btrfs_file_extent_item *fi;
4800 u64 num_bytes;
4801 u64 skip_objectid = 0;
4802 u32 nritems;
4803 u32 i;
4804
4805 nritems = btrfs_header_nritems(leaf);
4806 for (i = 0; i < nritems; i++) {
4807 btrfs_item_key_to_cpu(leaf, &key, i);
4808 if (key.objectid == skip_objectid ||
4809 key.type != BTRFS_EXTENT_DATA_KEY)
4810 continue;
4811 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4812 if (btrfs_file_extent_type(leaf, fi) ==
4813 BTRFS_FILE_EXTENT_INLINE)
4814 continue;
4815 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4816 continue;
4817 if (!inode || inode->i_ino != key.objectid) {
4818 iput(inode);
4819 inode = btrfs_ilookup(target_root->fs_info->sb,
4820 key.objectid, target_root, 1);
4821 }
4822 if (!inode) {
4823 skip_objectid = key.objectid;
4824 continue;
4825 }
4826 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4827
4828 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 btrfs_drop_extent_cache(inode, key.offset,
4831 key.offset + num_bytes - 1, 1);
4832 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4833 key.offset + num_bytes - 1, GFP_NOFS);
4834 cond_resched();
4835 }
4836 iput(inode);
4837 return 0;
4838}
4839
4840static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4841 struct btrfs_root *root,
4842 struct extent_buffer *leaf,
4843 struct btrfs_block_group_cache *group,
4844 struct inode *reloc_inode)
4845{
4846 struct btrfs_key key;
4847 struct btrfs_key extent_key;
4848 struct btrfs_file_extent_item *fi;
4849 struct btrfs_leaf_ref *ref;
4850 struct disk_extent *new_extent;
4851 u64 bytenr;
4852 u64 num_bytes;
4853 u32 nritems;
4854 u32 i;
4855 int ext_index;
4856 int nr_extent;
4857 int ret;
4858
4859 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4860 BUG_ON(!new_extent);
4861
4862 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4863 BUG_ON(!ref);
4864
4865 ext_index = -1;
4866 nritems = btrfs_header_nritems(leaf);
4867 for (i = 0; i < nritems; i++) {
4868 btrfs_item_key_to_cpu(leaf, &key, i);
4869 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4870 continue;
4871 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4872 if (btrfs_file_extent_type(leaf, fi) ==
4873 BTRFS_FILE_EXTENT_INLINE)
4874 continue;
4875 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4876 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4877 if (bytenr == 0)
4878 continue;
4879
4880 ext_index++;
4881 if (bytenr >= group->key.objectid + group->key.offset ||
4882 bytenr + num_bytes <= group->key.objectid)
4883 continue;
4884
4885 extent_key.objectid = bytenr;
4886 extent_key.offset = num_bytes;
4887 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4888 nr_extent = 1;
4889 ret = get_new_locations(reloc_inode, &extent_key,
4890 group->key.objectid, 1,
4891 &new_extent, &nr_extent);
4892 if (ret > 0)
4893 continue;
4894 BUG_ON(ret < 0);
4895
4896 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4897 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4898 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4899 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4900
4901 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4902 new_extent->disk_bytenr);
4903 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4904 new_extent->disk_num_bytes);
4905 btrfs_mark_buffer_dirty(leaf);
4906
4907 ret = btrfs_inc_extent_ref(trans, root,
4908 new_extent->disk_bytenr,
4909 new_extent->disk_num_bytes,
4910 leaf->start,
4911 root->root_key.objectid,
4912 trans->transid, key.objectid);
4913 BUG_ON(ret);
4914 ret = btrfs_free_extent(trans, root,
4915 bytenr, num_bytes, leaf->start,
4916 btrfs_header_owner(leaf),
4917 btrfs_header_generation(leaf),
4918 key.objectid, 0);
4919 BUG_ON(ret);
4920 cond_resched();
4921 }
4922 kfree(new_extent);
4923 BUG_ON(ext_index + 1 != ref->nritems);
4924 btrfs_free_leaf_ref(root, ref);
4925 return 0;
4926}
4927
4928int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4929 struct btrfs_root *root)
4930{
4931 struct btrfs_root *reloc_root;
4932 int ret;
4933
4934 if (root->reloc_root) {
4935 reloc_root = root->reloc_root;
4936 root->reloc_root = NULL;
4937 list_add(&reloc_root->dead_list,
4938 &root->fs_info->dead_reloc_roots);
4939
4940 btrfs_set_root_bytenr(&reloc_root->root_item,
4941 reloc_root->node->start);
4942 btrfs_set_root_level(&root->root_item,
4943 btrfs_header_level(reloc_root->node));
4944 memset(&reloc_root->root_item.drop_progress, 0,
4945 sizeof(struct btrfs_disk_key));
4946 reloc_root->root_item.drop_level = 0;
4947
4948 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4949 &reloc_root->root_key,
4950 &reloc_root->root_item);
4951 BUG_ON(ret);
4952 }
4953 return 0;
4954}
4955
4956int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4957{
4958 struct btrfs_trans_handle *trans;
4959 struct btrfs_root *reloc_root;
4960 struct btrfs_root *prev_root = NULL;
4961 struct list_head dead_roots;
4962 int ret;
4963 unsigned long nr;
4964
4965 INIT_LIST_HEAD(&dead_roots);
4966 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4967
4968 while (!list_empty(&dead_roots)) {
4969 reloc_root = list_entry(dead_roots.prev,
4970 struct btrfs_root, dead_list);
4971 list_del_init(&reloc_root->dead_list);
4972
4973 BUG_ON(reloc_root->commit_root != NULL);
4974 while (1) {
4975 trans = btrfs_join_transaction(root, 1);
4976 BUG_ON(!trans);
4977
4978 mutex_lock(&root->fs_info->drop_mutex);
4979 ret = btrfs_drop_snapshot(trans, reloc_root);
4980 if (ret != -EAGAIN)
4981 break;
4982 mutex_unlock(&root->fs_info->drop_mutex);
4983
4984 nr = trans->blocks_used;
4985 ret = btrfs_end_transaction(trans, root);
4986 BUG_ON(ret);
4987 btrfs_btree_balance_dirty(root, nr);
4988 }
4989
4990 free_extent_buffer(reloc_root->node);
4991
4992 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4993 &reloc_root->root_key);
4994 BUG_ON(ret);
4995 mutex_unlock(&root->fs_info->drop_mutex);
4996
4997 nr = trans->blocks_used;
4998 ret = btrfs_end_transaction(trans, root);
4999 BUG_ON(ret);
5000 btrfs_btree_balance_dirty(root, nr);
5001
5002 kfree(prev_root);
5003 prev_root = reloc_root;
5004 }
5005 if (prev_root) {
5006 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5007 kfree(prev_root);
5008 }
5009 return 0;
5010}
5011
5012int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5013{
5014 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5015 return 0;
5016}
5017
5018int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5019{
5020 struct btrfs_root *reloc_root;
5021 struct btrfs_trans_handle *trans;
5022 struct btrfs_key location;
5023 int found;
5024 int ret;
5025
5026 mutex_lock(&root->fs_info->tree_reloc_mutex);
5027 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5028 BUG_ON(ret);
5029 found = !list_empty(&root->fs_info->dead_reloc_roots);
5030 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5031
5032 if (found) {
5033 trans = btrfs_start_transaction(root, 1);
5034 BUG_ON(!trans);
5035 ret = btrfs_commit_transaction(trans, root);
5036 BUG_ON(ret);
5037 }
5038
5039 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5040 location.offset = (u64)-1;
5041 location.type = BTRFS_ROOT_ITEM_KEY;
5042
5043 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5044 BUG_ON(!reloc_root);
5045 btrfs_orphan_cleanup(reloc_root);
5046 return 0;
5047}
5048
5049static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5050 struct btrfs_root *root)
5051{
5052 struct btrfs_root *reloc_root;
5053 struct extent_buffer *eb;
5054 struct btrfs_root_item *root_item;
5055 struct btrfs_key root_key;
5056 int ret;
5057
5058 BUG_ON(!root->ref_cows);
5059 if (root->reloc_root)
5060 return 0;
5061
5062 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5063 BUG_ON(!root_item);
5064
5065 ret = btrfs_copy_root(trans, root, root->commit_root,
5066 &eb, BTRFS_TREE_RELOC_OBJECTID);
5067 BUG_ON(ret);
5068
5069 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5070 root_key.offset = root->root_key.objectid;
5071 root_key.type = BTRFS_ROOT_ITEM_KEY;
5072
5073 memcpy(root_item, &root->root_item, sizeof(root_item));
5074 btrfs_set_root_refs(root_item, 0);
5075 btrfs_set_root_bytenr(root_item, eb->start);
5076 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5077 btrfs_set_root_generation(root_item, trans->transid);
5078
5079 btrfs_tree_unlock(eb);
5080 free_extent_buffer(eb);
5081
5082 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5083 &root_key, root_item);
5084 BUG_ON(ret);
5085 kfree(root_item);
5086
5087 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5088 &root_key);
5089 BUG_ON(!reloc_root);
5090 reloc_root->last_trans = trans->transid;
5091 reloc_root->commit_root = NULL;
5092 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5093
5094 root->reloc_root = reloc_root;
5095 return 0;
5096}
5097
5098/*
5099 * Core function of space balance.
5100 *
5101 * The idea is using reloc trees to relocate tree blocks in reference
5102 * counted roots. There is one reloc tree for each subvol, and all
5103 * reloc trees share same root key objectid. Reloc trees are snapshots
5104 * of the latest committed roots of subvols (root->commit_root).
5105 *
5106 * To relocate a tree block referenced by a subvol, there are two steps.
5107 * COW the block through subvol's reloc tree, then update block pointer
5108 * in the subvol to point to the new block. Since all reloc trees share
5109 * same root key objectid, doing special handling for tree blocks owned
5110 * by them is easy. Once a tree block has been COWed in one reloc tree,
5111 * we can use the resulting new block directly when the same block is
5112 * required to COW again through other reloc trees. This way, relocated
5113 * tree blocks are shared between reloc trees, so they are also shared
5114 * between subvols.
5115 */
/*
 * Relocate the tree blocks along one reference path of @root.
 *
 * Always returns 0; every failure inside is a BUG_ON().
 */
5116static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5117 struct btrfs_root *root,
5118 struct btrfs_path *path,
5119 struct btrfs_key *first_key,
5120 struct btrfs_ref_path *ref_path,
5121 struct btrfs_block_group_cache *group,
5122 struct inode *reloc_inode)
5123{
5124 struct btrfs_root *reloc_root;
5125 struct extent_buffer *eb = NULL;
5126 struct btrfs_key *keys;
5127 u64 *nodes;
5128 int level;
5129 int shared_level;
5130 int lowest_level = 0;
5131 int ret;
5132
/* owner ids below BTRFS_FIRST_FREE_OBJECTID are used as the tree level */
5133 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5134 lowest_level = ref_path->owner_objectid;
5135
/*
 * roots without ref_cows get no reloc tree: one search in the root itself
 * with path->lowest_level set is all that is done for them
 */
5136 if (!root->ref_cows) {
5137 path->lowest_level = lowest_level;
5138 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5139 BUG_ON(ret < 0);
5140 path->lowest_level = 0;
5141 btrfs_release_path(root, path);
5142 return 0;
5143 }
5144
/* tree_reloc_mutex is held from here until just before the final return */
5145 mutex_lock(&root->fs_info->tree_reloc_mutex);
5146 ret = init_reloc_tree(trans, root);
5147 BUG_ON(ret);
5148 reloc_root = root->reloc_root;
5149
5150 shared_level = ref_path->shared_level;
5151 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5152
/* forget the recorded keys and new blocks for levels above shared_level */
5153 keys = ref_path->node_keys;
5154 nodes = ref_path->new_nodes;
5155 memset(&keys[shared_level + 1], 0,
5156 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5157 memset(&nodes[shared_level + 1], 0,
5158 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5159
/*
 * no new block recorded for the lowest level yet: search the reloc tree
 * and record the start and first key of each block on the path, stopping
 * at the reloc tree's root node
 */
5160 if (nodes[lowest_level] == 0) {
5161 path->lowest_level = lowest_level;
5162 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5163 0, 1);
5164 BUG_ON(ret);
5165 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5166 eb = path->nodes[level];
5167 if (!eb || eb == reloc_root->node)
5168 break;
5169 nodes[level] = eb->start;
5170 if (level == 0)
5171 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5172 else
5173 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5174 }
/* data owner with a recorded leaf: rewrite the extent pointers in that leaf */
5175 if (nodes[0] &&
5176 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5177 eb = path->nodes[0];
5178 ret = replace_extents_in_leaf(trans, reloc_root, eb,
5179 group, reloc_inode);
5180 BUG_ON(ret);
5181 }
5182 btrfs_release_path(reloc_root, path);
5183 } else {
/* blocks were recorded by an earlier call: merge them into this reloc tree */
5184 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5185 lowest_level);
5186 BUG_ON(ret);
5187 }
5188
5189 /*
5190 * replace tree blocks in the fs tree with tree blocks in
5191 * the reloc tree.
5192 */
5193 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5194 BUG_ON(ret < 0);
5195
/*
 * data owner: take an extra reference on the reloc tree leaf so it can be
 * used after the path is released, then invalidate cached extents for it
 */
5196 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5197 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5198 0, 0);
5199 BUG_ON(ret);
5200 extent_buffer_get(path->nodes[0]);
5201 eb = path->nodes[0];
5202 btrfs_release_path(reloc_root, path);
5203 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5204 BUG_ON(ret);
5205 free_extent_buffer(eb);
5206 }
5207
5208 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5209 path->lowest_level = 0;
5210 return 0;
5211}
5212
5213static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5214 struct btrfs_root *root,
5215 struct btrfs_path *path,
5216 struct btrfs_key *first_key,
5217 struct btrfs_ref_path *ref_path)
5218{
5219 int ret;
5220
5221 ret = relocate_one_path(trans, root, path, first_key,
5222 ref_path, NULL, NULL);
5223 BUG_ON(ret);
5224
5225 if (root == root->fs_info->extent_root)
5226 btrfs_extent_post_op(trans, root);
5227
5228 return 0;
5229}
5230
5231static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5232 struct btrfs_root *extent_root,
5233 struct btrfs_path *path,
5234 struct btrfs_key *extent_key)
5235{
5236 int ret;
5237
5238 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5239 if (ret)
5240 goto out;
5241 ret = btrfs_del_item(trans, extent_root, path);
5242out:
5243 btrfs_release_path(extent_root, path);
5244 return ret;
5245}
5246
5247static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5248 struct btrfs_ref_path *ref_path)
5249{
5250 struct btrfs_key root_key;
5251
5252 root_key.objectid = ref_path->root_objectid;
5253 root_key.type = BTRFS_ROOT_ITEM_KEY;
5254 if (is_cowonly_root(ref_path->root_objectid))
5255 root_key.offset = 0;
5256 else
5257 root_key.offset = (u64)-1;
5258
5259 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5260}
5261
5262static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5263 struct btrfs_path *path,
5264 struct btrfs_key *extent_key,
5265 struct btrfs_block_group_cache *group,
5266 struct inode *reloc_inode, int pass)
5267{
5268 struct btrfs_trans_handle *trans;
5269 struct btrfs_root *found_root;
5270 struct btrfs_ref_path *ref_path = NULL;
5271 struct disk_extent *new_extents = NULL;
5272 int nr_extents = 0;
5273 int loops;
5274 int ret;
5275 int level;
5276 struct btrfs_key first_key;
5277 u64 prev_block = 0;
5278
5279
5280 trans = btrfs_start_transaction(extent_root, 1);
5281 BUG_ON(!trans);
5282
5283 if (extent_key->objectid == 0) {
5284 ret = del_extent_zero(trans, extent_root, path, extent_key);
5285 goto out;
5286 }
5287
5288 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5289 if (!ref_path) {
5290 ret = -ENOMEM;
5291 goto out;
5292 }
5293
5294 for (loops = 0; ; loops++) {
5295 if (loops == 0) {
5296 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5297 extent_key->objectid);
5298 } else {
5299 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5300 }
5301 if (ret < 0)
5302 goto out;
5303 if (ret > 0)
5304 break;
5305
5306 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5307 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5308 continue;
5309
5310 found_root = read_ref_root(extent_root->fs_info, ref_path);
5311 BUG_ON(!found_root);
5312 /*
5313 * for reference counted tree, only process reference paths
5314 * rooted at the latest committed root.
5315 */
5316 if (found_root->ref_cows &&
5317 ref_path->root_generation != found_root->root_key.offset)
5318 continue;
5319
5320 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5321 if (pass == 0) {
5322 /*
5323 * copy data extents to new locations
5324 */
5325 u64 group_start = group->key.objectid;
5326 ret = relocate_data_extent(reloc_inode,
5327 extent_key,
5328 group_start);
5329 if (ret < 0)
5330 goto out;
5331 break;
5332 }
5333 level = 0;
5334 } else {
5335 level = ref_path->owner_objectid;
5336 }
5337
5338 if (prev_block != ref_path->nodes[level]) {
5339 struct extent_buffer *eb;
5340 u64 block_start = ref_path->nodes[level];
5341 u64 block_size = btrfs_level_size(found_root, level);
5342
5343 eb = read_tree_block(found_root, block_start,
5344 block_size, 0);
5345 btrfs_tree_lock(eb);
5346 BUG_ON(level != btrfs_header_level(eb));
5347
5348 if (level == 0)
5349 btrfs_item_key_to_cpu(eb, &first_key, 0);
5350 else
5351 btrfs_node_key_to_cpu(eb, &first_key, 0);
5352
5353 btrfs_tree_unlock(eb);
5354 free_extent_buffer(eb);
5355 prev_block = block_start;
5356 }
5357
5358 btrfs_record_root_in_trans(found_root);
5359 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5360 /*
5361 * try to update data extent references while
5362 * keeping metadata shared between snapshots.
5363 */
5364 if (pass == 1) {
5365 ret = relocate_one_path(trans, found_root,
5366 path, &first_key, ref_path,
5367 group, reloc_inode);
5368 if (ret < 0)
5369 goto out;
5370 continue;
5371 }
5372 /*
5373 * use fallback method to process the remaining
5374 * references.
5375 */
5376 if (!new_extents) {
5377 u64 group_start = group->key.objectid;
5378 new_extents = kmalloc(sizeof(*new_extents),
5379 GFP_NOFS);
5380 nr_extents = 1;
5381 ret = get_new_locations(reloc_inode,
5382 extent_key,
5383 group_start, 1,
5384 &new_extents,
5385 &nr_extents);
5386 if (ret)
5387 goto out;
5388 }
5389 ret = replace_one_extent(trans, found_root,
5390 path, extent_key,
5391 &first_key, ref_path,
5392 new_extents, nr_extents);
5393 } else {
5394 ret = relocate_tree_block(trans, found_root, path,
5395 &first_key, ref_path);
5396 }
5397 if (ret < 0)
5398 goto out;
5399 }
5400 ret = 0;
5401out:
5402 btrfs_end_transaction(trans, extent_root);
5403 kfree(new_extents);
5404 kfree(ref_path);
5405 return ret;
5406}
5407
5408static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5409{
5410 u64 num_devices;
5411 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5412 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5413
5414 num_devices = root->fs_info->fs_devices->rw_devices;
5415 if (num_devices == 1) {
5416 stripped |= BTRFS_BLOCK_GROUP_DUP;
5417 stripped = flags & ~stripped;
5418
5419 /* turn raid0 into single device chunks */
5420 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5421 return stripped;
5422
5423 /* turn mirroring into duplication */
5424 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5425 BTRFS_BLOCK_GROUP_RAID10))
5426 return stripped | BTRFS_BLOCK_GROUP_DUP;
5427 return flags;
5428 } else {
5429 /* they already had raid on here, just return */
5430 if (flags & stripped)
5431 return flags;
5432
5433 stripped |= BTRFS_BLOCK_GROUP_DUP;
5434 stripped = flags & ~stripped;
5435
5436 /* switch duplicated blocks with raid1 */
5437 if (flags & BTRFS_BLOCK_GROUP_DUP)
5438 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5439
5440 /* turn single device chunks into raid0 */
5441 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5442 }
5443 return flags;
5444}
5445
5446static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5447 struct btrfs_block_group_cache *shrink_block_group,
5448 int force)
5449{
5450 struct btrfs_trans_handle *trans;
5451 u64 new_alloc_flags;
5452 u64 calc;
5453
5454 spin_lock(&shrink_block_group->lock);
5455 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5456 spin_unlock(&shrink_block_group->lock);
5457
5458 trans = btrfs_start_transaction(root, 1);
5459 spin_lock(&shrink_block_group->lock);
5460
5461 new_alloc_flags = update_block_group_flags(root,
5462 shrink_block_group->flags);
5463 if (new_alloc_flags != shrink_block_group->flags) {
5464 calc =
5465 btrfs_block_group_used(&shrink_block_group->item);
5466 } else {
5467 calc = shrink_block_group->key.offset;
5468 }
5469 spin_unlock(&shrink_block_group->lock);
5470
5471 do_chunk_alloc(trans, root->fs_info->extent_root,
5472 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5473
5474 btrfs_end_transaction(trans, root);
5475 } else
5476 spin_unlock(&shrink_block_group->lock);
5477 return 0;
5478}
5479
5480static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5481 struct btrfs_root *root,
5482 u64 objectid, u64 size)
5483{
5484 struct btrfs_path *path;
5485 struct btrfs_inode_item *item;
5486 struct extent_buffer *leaf;
5487 int ret;
5488
5489 path = btrfs_alloc_path();
5490 if (!path)
5491 return -ENOMEM;
5492
5493 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5494 if (ret)
5495 goto out;
5496
5497 leaf = path->nodes[0];
5498 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5499 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5500 btrfs_set_inode_generation(leaf, item, 1);
5501 btrfs_set_inode_size(leaf, item, size);
5502 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5503 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5504 btrfs_mark_buffer_dirty(leaf);
5505 btrfs_release_path(root, path);
5506out:
5507 btrfs_free_path(path);
5508 return ret;
5509}
5510
5511static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5512 struct btrfs_block_group_cache *group)
5513{
5514 struct inode *inode = NULL;
5515 struct btrfs_trans_handle *trans;
5516 struct btrfs_root *root;
5517 struct btrfs_key root_key;
5518 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5519 int err = 0;
5520
5521 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5522 root_key.type = BTRFS_ROOT_ITEM_KEY;
5523 root_key.offset = (u64)-1;
5524 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5525 if (IS_ERR(root))
5526 return ERR_CAST(root);
5527
5528 trans = btrfs_start_transaction(root, 1);
5529 BUG_ON(!trans);
5530
5531 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5532 if (err)
5533 goto out;
5534
5535 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5536 BUG_ON(err);
5537
5538 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5539 group->key.offset, 0, group->key.offset,
5540 0, 0, 0);
5541 BUG_ON(err);
5542
5543 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5544 if (inode->i_state & I_NEW) {
5545 BTRFS_I(inode)->root = root;
5546 BTRFS_I(inode)->location.objectid = objectid;
5547 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5548 BTRFS_I(inode)->location.offset = 0;
5549 btrfs_read_locked_inode(inode);
5550 unlock_new_inode(inode);
5551 BUG_ON(is_bad_inode(inode));
5552 } else {
5553 BUG_ON(1);
5554 }
5555 BTRFS_I(inode)->index_cnt = group->key.objectid;
5556
5557 err = btrfs_orphan_add(trans, inode);
5558out:
5559 btrfs_end_transaction(trans, root);
5560 if (err) {
5561 if (inode)
5562 iput(inode);
5563 inode = ERR_PTR(err);
5564 }
5565 return inode;
5566}
5567
5568int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5569{
5570
5571 struct btrfs_ordered_sum *sums;
5572 struct btrfs_sector_sum *sector_sum;
5573 struct btrfs_ordered_extent *ordered;
5574 struct btrfs_root *root = BTRFS_I(inode)->root;
5575 struct list_head list;
5576 size_t offset;
5577 int ret;
5578 u64 disk_bytenr;
5579
5580 INIT_LIST_HEAD(&list);
5581
5582 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5583 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5584
5585 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5586 ret = btrfs_lookup_csums_range(root, disk_bytenr,
5587 disk_bytenr + len - 1, &list);
5588
5589 while (!list_empty(&list)) {
5590 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5591 list_del_init(&sums->list);
5592
5593 sector_sum = sums->sums;
5594 sums->bytenr = ordered->start;
5595
5596 offset = 0;
5597 while (offset < sums->len) {
5598 sector_sum->bytenr += ordered->start - disk_bytenr;
5599 sector_sum++;
5600 offset += root->sectorsize;
5601 }
5602
5603 btrfs_add_ordered_sum(inode, ordered, sums);
5604 }
5605 btrfs_put_ordered_extent(ordered);
5606 return 0;
5607}
5608
5609int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5610{
5611 struct btrfs_trans_handle *trans;
5612 struct btrfs_path *path;
5613 struct btrfs_fs_info *info = root->fs_info;
5614 struct extent_buffer *leaf;
5615 struct inode *reloc_inode;
5616 struct btrfs_block_group_cache *block_group;
5617 struct btrfs_key key;
5618 u64 skipped;
5619 u64 cur_byte;
5620 u64 total_found;
5621 u32 nritems;
5622 int ret;
5623 int progress;
5624 int pass = 0;
5625
5626 root = root->fs_info->extent_root;
5627
5628 block_group = btrfs_lookup_block_group(info, group_start);
5629 BUG_ON(!block_group);
5630
5631 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5632 (unsigned long long)block_group->key.objectid,
5633 (unsigned long long)block_group->flags);
5634
5635 path = btrfs_alloc_path();
5636 BUG_ON(!path);
5637
5638 reloc_inode = create_reloc_inode(info, block_group);
5639 BUG_ON(IS_ERR(reloc_inode));
5640
5641 __alloc_chunk_for_shrink(root, block_group, 1);
5642 set_block_group_readonly(block_group);
5643
5644 btrfs_start_delalloc_inodes(info->tree_root);
5645 btrfs_wait_ordered_extents(info->tree_root, 0);
5646again:
5647 skipped = 0;
5648 total_found = 0;
5649 progress = 0;
5650 key.objectid = block_group->key.objectid;
5651 key.offset = 0;
5652 key.type = 0;
5653 cur_byte = key.objectid;
5654
5655 trans = btrfs_start_transaction(info->tree_root, 1);
5656 btrfs_commit_transaction(trans, info->tree_root);
5657
5658 mutex_lock(&root->fs_info->cleaner_mutex);
5659 btrfs_clean_old_snapshots(info->tree_root);
5660 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5661 mutex_unlock(&root->fs_info->cleaner_mutex);
5662
5663 while (1) {
5664 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5665 if (ret < 0)
5666 goto out;
5667next:
5668 leaf = path->nodes[0];
5669 nritems = btrfs_header_nritems(leaf);
5670 if (path->slots[0] >= nritems) {
5671 ret = btrfs_next_leaf(root, path);
5672 if (ret < 0)
5673 goto out;
5674 if (ret == 1) {
5675 ret = 0;
5676 break;
5677 }
5678 leaf = path->nodes[0];
5679 nritems = btrfs_header_nritems(leaf);
5680 }
5681
5682 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5683
5684 if (key.objectid >= block_group->key.objectid +
5685 block_group->key.offset)
5686 break;
5687
5688 if (progress && need_resched()) {
5689 btrfs_release_path(root, path);
5690 cond_resched();
5691 progress = 0;
5692 continue;
5693 }
5694 progress = 1;
5695
5696 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5697 key.objectid + key.offset <= cur_byte) {
5698 path->slots[0]++;
5699 goto next;
5700 }
5701
5702 total_found++;
5703 cur_byte = key.objectid + key.offset;
5704 btrfs_release_path(root, path);
5705
5706 __alloc_chunk_for_shrink(root, block_group, 0);
5707 ret = relocate_one_extent(root, path, &key, block_group,
5708 reloc_inode, pass);
5709 BUG_ON(ret < 0);
5710 if (ret > 0)
5711 skipped++;
5712
5713 key.objectid = cur_byte;
5714 key.type = 0;
5715 key.offset = 0;
5716 }
5717
5718 btrfs_release_path(root, path);
5719
5720 if (pass == 0) {
5721 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5722 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5723 }
5724
5725 if (total_found > 0) {
5726 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5727 (unsigned long long)total_found, pass);
5728 pass++;
5729 if (total_found == skipped && pass > 2) {
5730 iput(reloc_inode);
5731 reloc_inode = create_reloc_inode(info, block_group);
5732 pass = 0;
5733 }
5734 goto again;
5735 }
5736
5737 /* delete reloc_inode */
5738 iput(reloc_inode);
5739
5740 /* unpin extents in this range */
5741 trans = btrfs_start_transaction(info->tree_root, 1);
5742 btrfs_commit_transaction(trans, info->tree_root);
5743
5744 spin_lock(&block_group->lock);
5745 WARN_ON(block_group->pinned > 0);
5746 WARN_ON(block_group->reserved > 0);
5747 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5748 spin_unlock(&block_group->lock);
5749 put_block_group(block_group);
5750 ret = 0;
5751out:
5752 btrfs_free_path(path);
5753 return ret;
5754}
5755
5756static int find_first_block_group(struct btrfs_root *root,
5757 struct btrfs_path *path, struct btrfs_key *key)
5758{
5759 int ret = 0;
5760 struct btrfs_key found_key;
5761 struct extent_buffer *leaf;
5762 int slot;
5763
5764 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5765 if (ret < 0)
5766 goto out;
5767
5768 while (1) {
5769 slot = path->slots[0];
5770 leaf = path->nodes[0];
5771 if (slot >= btrfs_header_nritems(leaf)) {
5772 ret = btrfs_next_leaf(root, path);
5773 if (ret == 0)
5774 continue;
5775 if (ret < 0)
5776 goto out;
5777 break;
5778 }
5779 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5780
5781 if (found_key.objectid >= key->objectid &&
5782 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5783 ret = 0;
5784 goto out;
5785 }
5786 path->slots[0]++;
5787 }
5788 ret = -ENOENT;
5789out:
5790 return ret;
5791}
5792
5793int btrfs_free_block_groups(struct btrfs_fs_info *info)
5794{
5795 struct btrfs_block_group_cache *block_group;
5796 struct rb_node *n;
5797
5798 spin_lock(&info->block_group_cache_lock);
5799 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5800 block_group = rb_entry(n, struct btrfs_block_group_cache,
5801 cache_node);
5802 rb_erase(&block_group->cache_node,
5803 &info->block_group_cache_tree);
5804 spin_unlock(&info->block_group_cache_lock);
5805
5806 btrfs_remove_free_space_cache(block_group);
5807 down_write(&block_group->space_info->groups_sem);
5808 list_del(&block_group->list);
5809 up_write(&block_group->space_info->groups_sem);
5810
5811 WARN_ON(atomic_read(&block_group->count) != 1);
5812 kfree(block_group);
5813
5814 spin_lock(&info->block_group_cache_lock);
5815 }
5816 spin_unlock(&info->block_group_cache_lock);
5817 return 0;
5818}
5819
5820int btrfs_read_block_groups(struct btrfs_root *root)
5821{
5822 struct btrfs_path *path;
5823 int ret;
5824 struct btrfs_block_group_cache *cache;
5825 struct btrfs_fs_info *info = root->fs_info;
5826 struct btrfs_space_info *space_info;
5827 struct btrfs_key key;
5828 struct btrfs_key found_key;
5829 struct extent_buffer *leaf;
5830
5831 root = info->extent_root;
5832 key.objectid = 0;
5833 key.offset = 0;
5834 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5835 path = btrfs_alloc_path();
5836 if (!path)
5837 return -ENOMEM;
5838
5839 while (1) {
5840 ret = find_first_block_group(root, path, &key);
5841 if (ret > 0) {
5842 ret = 0;
5843 goto error;
5844 }
5845 if (ret != 0)
5846 goto error;
5847
5848 leaf = path->nodes[0];
5849 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5850 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5851 if (!cache) {
5852 ret = -ENOMEM;
5853 break;
5854 }
5855
5856 atomic_set(&cache->count, 1);
5857 spin_lock_init(&cache->lock);
5858 mutex_init(&cache->alloc_mutex);
5859 mutex_init(&cache->cache_mutex);
5860 INIT_LIST_HEAD(&cache->list);
5861 read_extent_buffer(leaf, &cache->item,
5862 btrfs_item_ptr_offset(leaf, path->slots[0]),
5863 sizeof(cache->item));
5864 memcpy(&cache->key, &found_key, sizeof(found_key));
5865
5866 key.objectid = found_key.objectid + found_key.offset;
5867 btrfs_release_path(root, path);
5868 cache->flags = btrfs_block_group_flags(&cache->item);
5869
5870 ret = update_space_info(info, cache->flags, found_key.offset,
5871 btrfs_block_group_used(&cache->item),
5872 &space_info);
5873 BUG_ON(ret);
5874 cache->space_info = space_info;
5875 down_write(&space_info->groups_sem);
5876 list_add_tail(&cache->list, &space_info->block_groups);
5877 up_write(&space_info->groups_sem);
5878
5879 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5880 BUG_ON(ret);
5881
5882 set_avail_alloc_bits(root->fs_info, cache->flags);
5883 if (btrfs_chunk_readonly(root, cache->key.objectid))
5884 set_block_group_readonly(cache);
5885 }
5886 ret = 0;
5887error:
5888 btrfs_free_path(path);
5889 return ret;
5890}
5891
5892int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5893 struct btrfs_root *root, u64 bytes_used,
5894 u64 type, u64 chunk_objectid, u64 chunk_offset,
5895 u64 size)
5896{
5897 int ret;
5898 struct btrfs_root *extent_root;
5899 struct btrfs_block_group_cache *cache;
5900
5901 extent_root = root->fs_info->extent_root;
5902
5903 root->fs_info->last_trans_new_blockgroup = trans->transid;
5904
5905 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5906 if (!cache)
5907 return -ENOMEM;
5908
5909 cache->key.objectid = chunk_offset;
5910 cache->key.offset = size;
5911 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5912 atomic_set(&cache->count, 1);
5913 spin_lock_init(&cache->lock);
5914 mutex_init(&cache->alloc_mutex);
5915 mutex_init(&cache->cache_mutex);
5916 INIT_LIST_HEAD(&cache->list);
5917
5918 btrfs_set_block_group_used(&cache->item, bytes_used);
5919 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5920 cache->flags = type;
5921 btrfs_set_block_group_flags(&cache->item, type);
5922
5923 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5924 &cache->space_info);
5925 BUG_ON(ret);
5926 down_write(&cache->space_info->groups_sem);
5927 list_add_tail(&cache->list, &cache->space_info->block_groups);
5928 up_write(&cache->space_info->groups_sem);
5929
5930 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5931 BUG_ON(ret);
5932
5933 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5934 sizeof(cache->item));
5935 BUG_ON(ret);
5936
5937 finish_current_insert(trans, extent_root, 0);
5938 ret = del_pending_extents(trans, extent_root, 0);
5939 BUG_ON(ret);
5940 set_avail_alloc_bits(extent_root->fs_info, type);
5941
5942 return 0;
5943}
5944
5945int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5946 struct btrfs_root *root, u64 group_start)
5947{
5948 struct btrfs_path *path;
5949 struct btrfs_block_group_cache *block_group;
5950 struct btrfs_key key;
5951 int ret;
5952
5953 root = root->fs_info->extent_root;
5954
5955 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5956 BUG_ON(!block_group);
5957 BUG_ON(!block_group->ro);
5958
5959 memcpy(&key, &block_group->key, sizeof(key));
5960
5961 path = btrfs_alloc_path();
5962 BUG_ON(!path);
5963
5964 btrfs_remove_free_space_cache(block_group);
5965 rb_erase(&block_group->cache_node,
5966 &root->fs_info->block_group_cache_tree);
5967 down_write(&block_group->space_info->groups_sem);
5968 list_del(&block_group->list);
5969 up_write(&block_group->space_info->groups_sem);
5970
5971 spin_lock(&block_group->space_info->lock);
5972 block_group->space_info->total_bytes -= block_group->key.offset;
5973 block_group->space_info->bytes_readonly -= block_group->key.offset;
5974 spin_unlock(&block_group->space_info->lock);
5975 block_group->space_info->full = 0;
5976
5977 put_block_group(block_group);
5978 put_block_group(block_group);
5979
5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5981 if (ret > 0)
5982 ret = -EIO;
5983 if (ret < 0)
5984 goto out;
5985
5986 ret = btrfs_del_item(trans, root, path);
5987out:
5988 btrfs_free_path(path);
5989 return ret;
5990}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
/*
 * Slab caches for struct extent_state and struct extent_buffer objects.
 * Both are created in extent_io_init() and destroyed in extent_io_exit().
 */
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
/*
 * Leak tracking lists, linked through the leak_list member of each object.
 * extent_io_exit() walks both lists and reports whatever is still on them;
 * alloc_extent_state() and free_extent_state() maintain 'states'.
 */
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
/*
 * NOTE(review): LEAK_DEBUG is defined (as 0) and then tested with #ifdef,
 * which checks whether the macro is defined rather than its value, so the
 * leak tracking code guarded by it is always compiled in.  Confirm whether
 * "#if LEAK_DEBUG" was intended; changing it requires updating every
 * "#ifdef LEAK_DEBUG" user in this file at the same time.
 */
33#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG
/* serialises updates of the leak lists above */
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
/*
 * Generic view of an rb-tree node that covers the inclusive byte range
 * [start, end].  tree_insert() and __etree_search() read tree nodes
 * through this type.
 *
 * NOTE(review): insert_state() and split_state() pass the rb_node embedded
 * in struct extent_state to tree_insert(), so start, end and rb_node
 * presumably have to sit at the same offsets as in that struct -- confirm
 * against extent_io.h before changing either layout.
 */
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
/*
 * Bundle of a bio, an extent_io_tree, a get_extent callback and a locking
 * flag.  NOTE(review): no user is visible in this part of the file;
 * judging by the field comment below it is presumably handed to the
 * writepage helpers -- confirm.
 */
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#ifdef LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#ifdef LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IO in their state field
291 * are not merged because the end_io handlers need to be able to do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (ie for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock state struct between start and end use mask to tell
914 * us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned, < 0 on error
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will returned if
1039 * nothing was found after 'start'
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. start and end are used to return the range,
1071 *
1072 * 1 is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. start and end are used to return the range,
1219 *
1220 * 1 is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, lets avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have a given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
1409
1410#if 0
1411/*
1412 * helper function to lock both pages and extents in the tree.
1413 * pages must be locked first.
1414 */
1415static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1416{
1417 unsigned long index = start >> PAGE_CACHE_SHIFT;
1418 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1419 struct page *page;
1420 int err;
1421
1422 while (index <= end_index) {
1423 page = grab_cache_page(tree->mapping, index);
1424 if (!page) {
1425 err = -ENOMEM;
1426 goto failed;
1427 }
1428 if (IS_ERR(page)) {
1429 err = PTR_ERR(page);
1430 goto failed;
1431 }
1432 index++;
1433 }
1434 lock_extent(tree, start, end, GFP_NOFS);
1435 return 0;
1436
1437failed:
1438 /*
1439 * we failed above in getting the page at 'index', so we undo here
1440 * up to but not including the page at 'index'
1441 */
1442 end_index = index;
1443 index = start >> PAGE_CACHE_SHIFT;
1444 while (index < end_index) {
1445 page = find_get_page(tree->mapping, index);
1446 unlock_page(page);
1447 page_cache_release(page);
1448 index++;
1449 }
1450 return err;
1451}
1452
1453/*
1454 * helper function to unlock both pages and extents in the tree.
1455 */
1456static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1457{
1458 unsigned long index = start >> PAGE_CACHE_SHIFT;
1459 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1460 struct page *page;
1461
1462 while (index <= end_index) {
1463 page = find_get_page(tree->mapping, index);
1464 unlock_page(page);
1465 page_cache_release(page);
1466 index++;
1467 }
1468 unlock_extent(tree, start, end, GFP_NOFS);
1469 return 0;
1470}
1471#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state there already, this does nothing.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the tree
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple, we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
1936
/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 *
 * @tree:       extent io tree tracking the state of the page's byte range
 * @page:       page to fill
 * @get_extent: callback used to look up the extent mapping for each range
 * @bio:        in/out pending bio shared across calls; segments are merged
 *              into it by submit_extent_page and the caller submits what
 *              is left
 * @mirror_num: passed through to submit_extent_page
 * @bio_flags:  in/out flags of the pending bio; updated to the flags used
 *              for the most recently queued segment
 *
 * The page range is locked in @tree and then walked extent by extent.
 * Ranges past i_size, holes and preallocated extents are zero filled and
 * marked uptodate here; ranges already uptodate are just unlocked; the
 * rest is queued for reading with end_bio_extent_readpage as completion.
 *
 * When no IO was queued the page is unlocked here (and set uptodate unless
 * an error was flagged); otherwise the end_io handler does it.
 *
 * Always returns 0; failures are reported via SetPageError on @page.
 */
static int __extent_read_full_page(struct extent_io_tree *tree,
				   struct page *page,
				   get_extent_t *get_extent,
				   struct bio **bio, int mirror_num,
				   unsigned long *bio_flags)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	unsigned long this_bio_flag = 0;

	set_page_extent_mapped(page);

	end = page_end;
	lock_extent(tree, start, end, GFP_NOFS);

	/* page straddling i_size: zero the part of the page beyond EOF */
	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

		if (zero_offset) {
			iosize = PAGE_CACHE_SIZE - zero_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
		}
	}
	while (cur <= end) {
		/* everything from here on is past EOF: zero it and stop */
		if (cur >= last_byte) {
			char *userpage;
			iosize = PAGE_CACHE_SIZE - page_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur,
				end - cur + 1, 0);
		if (IS_ERR(em) || !em) {
			/* no mapping: flag the page and drop the rest of the lock */
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		/*
		 * NOTE(review): this_bio_flag is never reset inside the loop,
		 * so once a compressed extent has been seen, later extents of
		 * the same page are handled as compressed too -- confirm this
		 * is intended.
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			this_bio_flag = EXTENT_BIO_COMPRESSED;

		/* clamp to the extent and the page, then round up to blocksize */
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			/* compressed extents are read from their start, in full */
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		/* preallocated extents are read back as holes */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.  Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			/* pages from this one up to and including the EOF page */
			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
			pnr -= page->index;
			ret = submit_extent_page(READ, tree, page,
					 sector, disk_io_size, page_offset,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
			/* counted even if the submit failed */
			nr++;
			*bio_flags = this_bio_flag;
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
	}
	/* no IO queued: nobody else will finish the page, so do it here */
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * @page: locked page to write
 * @wbc:  writeback control; nr_to_write is reduced by the number of pages
 *        accounted in nr_written
 * @data: struct extent_page_data carrying the tree, the get_extent
 *        callback, the pending bio and the extent_locked flag
 *
 * Unless the caller already holds the extent lock (epd->extent_locked),
 * delalloc ranges covering the page are first handed to
 * tree->ops->fill_delalloc.  If that started the IO itself the function
 * only updates the write accounting.  Otherwise the page range is walked
 * extent by extent and each regular extent is queued for writing through
 * submit_extent_page with end_bio_extent_writepage as completion.
 *
 * Always returns 0; failures are reported via SetPageError on @page.
 */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
			      void *data)
{
	struct inode *inode = page->mapping->host;
	struct extent_page_data *epd = data;
	struct extent_io_tree *tree = epd->tree;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 delalloc_start;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 iosize;
	u64 unlock_start;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t pg_offset = 0;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;
	int page_started;
	int compressed;
	unsigned long nr_written = 0;

	WARN_ON(!PageLocked(page));
	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
	/* page lies entirely past i_size: nothing to write, invalidate it */
	if (page->index > end_index ||
	   (page->index == end_index && !pg_offset)) {
		page->mapping->a_ops->invalidatepage(page, 0);
		unlock_page(page);
		return 0;
	}

	/* page straddling i_size: zero the part beyond EOF before writing */
	if (page->index == end_index) {
		char *userpage;

		userpage = kmap_atomic(page, KM_USER0);
		memset(userpage + pg_offset, 0,
		       PAGE_CACHE_SIZE - pg_offset);
		kunmap_atomic(userpage, KM_USER0);
		flush_dcache_page(page);
	}
	/* from here on pg_offset tracks the position inside the page */
	pg_offset = 0;

	set_page_extent_mapped(page);

	delalloc_start = start;
	delalloc_end = 0;
	page_started = 0;
	if (!epd->extent_locked) {
		/* hand every delalloc range touching this page to fill_delalloc */
		while (delalloc_end < page_end) {
			nr_delalloc = find_lock_delalloc_range(inode, tree,
						       page,
						       &delalloc_start,
						       &delalloc_end,
						       128 * 1024 * 1024);
			if (nr_delalloc == 0) {
				delalloc_start = delalloc_end + 1;
				continue;
			}
			tree->ops->fill_delalloc(inode, page, delalloc_start,
						 delalloc_end, &page_started,
						 &nr_written);
			delalloc_start = delalloc_end + 1;
		}

		/* did the fill delalloc function already unlock and start
		 * the IO?
		 */
		if (page_started) {
			ret = 0;
			goto update_nr_written;
		}
	}
	lock_extent(tree, start, page_end, GFP_NOFS);

	/* start of the part of the page range still locked by us */
	unlock_start = start;

	if (tree->ops && tree->ops->writepage_start_hook) {
		ret = tree->ops->writepage_start_hook(page, start,
						      page_end);
		/* hook asked to retry later: redirty and back out */
		if (ret == -EAGAIN) {
			unlock_extent(tree, start, page_end, GFP_NOFS);
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			ret = 0;
			goto update_nr_written;
		}
	}

	nr_written++;

	end = page_end;
	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");

	/* whole page is at or past i_size: nothing to submit */
	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_extent(tree, start, page_end, GFP_NOFS);
		if (tree->ops && tree->ops->writepage_end_io_hook)
			tree->ops->writepage_end_io_hook(page, start,
							 page_end, NULL, 1);
		unlock_start = page_end + 1;
		goto done;
	}

	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;

	while (cur <= end) {
		/* rest of the page is past i_size: clean it up and stop */
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 page_end, NULL, 1);
			unlock_start = page_end + 1;
			break;
		}
		em = epd->get_extent(inode, page, pg_offset, cur,
				     end - cur + 1, 1);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);
		/* clamp to the extent and the page, then round up to blocksize */
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);

			unlock_extent(tree, unlock_start, cur + iosize - 1,
				      GFP_NOFS);

			/*
			 * end_io notification does not happen here for
			 * compressed extents
			 */
			if (!compressed && tree->ops &&
			    tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 cur + iosize - 1,
							 NULL, 1);
			else if (compressed) {
				/* we don't want to end_page_writeback on
				 * a compressed extent.  this happens
				 * elsewhere
				 */
				nr++;
			}

			cur += iosize;
			pg_offset += iosize;
			unlock_start = cur;
			continue;
		}
		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
				   EXTENT_DIRTY, 0)) {
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
		if (tree->ops && tree->ops->writepage_io_hook) {
			ret = tree->ops->writepage_io_hook(page, cur,
						cur + iosize - 1);
		} else {
			ret = 0;
		}
		if (ret) {
			SetPageError(page);
		} else {
			unsigned long max_nr = end_index + 1;

			set_range_writeback(tree, cur, cur + iosize - 1);
			if (!PageWriteback(page)) {
				printk(KERN_ERR "btrfs warning page %lu not "
				       "writeback, cur %llu end %llu\n",
				       page->index, (unsigned long long)cur,
				       (unsigned long long)end);
			}

			ret = submit_extent_page(WRITE, tree, page, sector,
						 iosize, pg_offset, bdev,
						 &epd->bio, max_nr,
						 end_bio_extent_writepage,
						 0, 0, 0);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		pg_offset += iosize;
		/* counted even when the hook or the submit failed */
		nr++;
	}
done:
	if (nr == 0) {
		/* make sure the mapping tag for page dirty gets cleared */
		set_page_writeback(page);
		end_page_writeback(page);
	}
	/* drop whatever part of the extent lock is still held */
	if (unlock_start <= page_end)
		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
	unlock_page(page);

update_nr_written:
	wbc->nr_to_write -= nr_written;
	/* remember where cyclic / whole-file writeback should resume */
	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
		page->mapping->writeback_index = page->index + nr_written;
	return 0;
}
2349
/**
 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @tree: extent io tree; its ops may supply write_cache_pages_lock_hook,
 *        which is then called instead of lock_page() for each page
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 * @flush_fn: called with @data before waiting on a page that is already
 *            under writeback (only when wbc->sync_mode != WB_SYNC_NONE)
 *
 * If a page is already under I/O, extent_write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * Returns 0 when nothing was attempted because of congestion, otherwise the
 * value returned by the last @writepage call (with AOP_WRITEPAGE_ACTIVATE
 * mapped to 0).
 */
static int extent_write_cache_pages(struct extent_io_tree *tree,
			     struct address_space *mapping,
			     struct writeback_control *wbc,
			     writepage_t writepage, void *data,
			     void (*flush_fn)(void *))
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int range_whole = 0;

	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		/* NOTE(review): range_whole is set here but not read in this function */
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		/* explicit range: never wrap around to the start of the file */
		scanned = 1;
	}
retry:
	while (!done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			      PAGECACHE_TAG_DIRTY, min(end - index,
				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or even
			 * swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
				tree->ops->write_cache_pages_lock_hook(page);
			else
				lock_page(page);

			/* page no longer belongs to this mapping: skip it */
			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			/* past the requested range: finish after this batch */
			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				unlock_page(page);
				continue;
			}

			if (wbc->sync_mode != WB_SYNC_NONE) {
				/* flush first, then wait for the writeback to end */
				if (PageWriteback(page))
					flush_fn(data);
				wait_on_page_writeback(page);
			}

			/* still under IO, or no longer dirty: nothing to do */
			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc, data);

			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				unlock_page(page);
				ret = 0;
			}
			if (ret || wbc->nr_to_write <= 0)
				done = 1;
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				done = 1;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	return ret;
}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em)
2692 goto err;
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
/*
 * Undo a mapping by passing @token and @km to kunmap_atomic().
 * @eb is not used.
 */
void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
{
	kunmap_atomic(token, km);
}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
/*
 * Bits for the extent state.  EXTENT_UPTODATE and EXTENT_BUFFER_FILLED
 * are also stored in extent_buffer.flags.
 */
#define EXTENT_DIRTY		(1 << 0)
#define EXTENT_WRITEBACK	(1 << 1)
#define EXTENT_UPTODATE		(1 << 2)
#define EXTENT_LOCKED		(1 << 3)
#define EXTENT_NEW		(1 << 4)
#define EXTENT_DELALLOC		(1 << 5)
#define EXTENT_DEFRAG		(1 << 6)
#define EXTENT_DEFRAG_DONE	(1 << 7)
#define EXTENT_BUFFER_FILLED	(1 << 8)
#define EXTENT_ORDERED		(1 << 9)
#define EXTENT_ORDERED_METADATA	(1 << 10)
#define EXTENT_BOUNDARY		(1 << 11)
#define EXTENT_NODATASUM	(1 << 12)

/* a range with either of these bits set is locked or under writeback */
#define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)

/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED	1

/*
 * page->private values.  Every page that is controlled by the extent
 * map has page->private set to one.
 */
#define EXTENT_PAGE_PRIVATE		1
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE	3
31
32struct extent_state;
33
34typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 struct bio *bio, int mirror_num,
36 unsigned long bio_flags);
37struct extent_io_ops {
38 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
39 u64 start, u64 end, int *page_started,
40 unsigned long *nr_written);
41 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
42 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
43 extent_submit_bio_hook_t *submit_bio_hook;
44 int (*merge_bio_hook)(struct page *page, unsigned long offset,
45 size_t size, struct bio *bio,
46 unsigned long bio_flags);
47 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
48 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
49 u64 start, u64 end,
50 struct extent_state *state);
51 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
52 u64 start, u64 end,
53 struct extent_state *state);
54 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
55 struct extent_state *state);
56 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
57 struct extent_state *state, int uptodate);
58 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
59 unsigned long old, unsigned long bits);
60 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
61 unsigned long old, unsigned long bits);
62 int (*write_cache_pages_lock_hook)(struct page *page);
63};
64
65struct extent_io_tree {
66 struct rb_root state;
67 struct rb_root buffer;
68 struct address_space *mapping;
69 u64 dirty_bytes;
70 spinlock_t lock;
71 spinlock_t buffer_lock;
72 struct extent_io_ops *ops;
73};
74
75struct extent_state {
76 u64 start;
77 u64 end; /* inclusive */
78 struct rb_node rb_node;
79 struct extent_io_tree *tree;
80 wait_queue_head_t wq;
81 atomic_t refs;
82 unsigned long state;
83
84 /* for use by the FS */
85 u64 private;
86
87 struct list_head leak_list;
88};
89
90struct extent_buffer {
91 u64 start;
92 unsigned long len;
93 char *map_token;
94 char *kaddr;
95 unsigned long map_start;
96 unsigned long map_len;
97 struct page *first_page;
98 atomic_t refs;
99 int flags;
100 struct list_head leak_list;
101 struct rb_node rb_node;
102 struct mutex mutex;
103};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
116typedef struct extent_map *(get_extent_t)(struct inode *inode,
117 struct page *page,
118 size_t page_offset,
119 u64 start, u64 len,
120 int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
/* Take an additional reference on @eb by atomically incrementing eb->refs. */
static inline void extent_buffer_get(struct extent_buffer *eb)
{
	atomic_inc(&eb->refs);
}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
/*
 * Slab cache for struct extent_map objects: created by extent_map_init(),
 * destroyed by extent_map_exit(), used by alloc/free_extent_map().
 */
static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map beeing releasead
73 *
74 * Drops the reference out on @em by one and free the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was sucessfull.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
/*
 * Compute start + len for a byte range, clamping to the largest u64 value
 * when the addition wraps around.
 */
static u64 range_end(u64 start, u64 len)
{
	u64 end = start + len;

	return end < start ? (u64)-1 : end;
}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, len] range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map beeing removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
/*
 * Special values at the top of the u64 range that are compared against
 * extent_map->block_start.  Each expansion is fully parenthesized so the
 * macros behave as a single operand in any expression (e.g. under sizeof).
 */
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
#define EXTENT_MAP_DELALLOC ((u64)-1)

/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
/*
 * One cached mapping of a byte range [start, start + len) to a block range
 * [block_start, block_start + block_len).  Instances are reference counted
 * (see alloc_extent_map() / free_extent_map()) and are linked into an
 * extent_map_tree through rb_node.
 */
struct extent_map {
	/* linkage into extent_map_tree.map, keyed by start */
	struct rb_node rb_node;

	/* all of these are in bytes */
	u64 start;
	u64 len;
	/* NOTE(review): not referenced by extent_map.c; meaning set by callers */
	u64 orig_start;
	/* block offset, or one of the EXTENT_MAP_* special values */
	u64 block_start;
	u64 block_len;
	/* bitmask indexed by the EXTENT_FLAG_* bit numbers */
	unsigned long flags;
	/* maps are only merged when their bdev pointers are equal */
	struct block_device *bdev;
	/* reference count; the map is freed when it drops to zero */
	atomic_t refs;
	/* 1 while linked into a tree, 0 otherwise */
	int in_tree;
};
31
/*
 * A tree of extent_map structures.  add_extent_mapping(),
 * lookup_extent_mapping() and remove_extent_mapping() all assert that
 * 'lock' is held by the caller.
 */
struct extent_map_tree {
	/* rb-tree of struct extent_map, ordered by extent_map.start */
	struct rb_root map;
	/* protects 'map'; initialized by extent_map_tree_init() */
	spinlock_t lock;
};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..b11abfad81a5
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,821 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
/*
 * Number of checksums of @size bytes each that are allowed in one csum
 * item of root @r: the leaf data area minus two item headers, divided by
 * @size, minus one.  @size is parenthesized so that an expression argument
 * is divided as a whole.
 */
#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
				   sizeof(struct btrfs_item) * 2) / \
				  (size)) - 1))
30int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root,
32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset, u64 ram_bytes,
35 u8 compression, u8 encryption, u16 other_encoding)
36{
37 int ret = 0;
38 struct btrfs_file_extent_item *item;
39 struct btrfs_key file_key;
40 struct btrfs_path *path;
41 struct extent_buffer *leaf;
42
43 path = btrfs_alloc_path();
44 BUG_ON(!path);
45 file_key.objectid = objectid;
46 file_key.offset = pos;
47 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
48
49 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
50 sizeof(*item));
51 if (ret < 0)
52 goto out;
53 BUG_ON(ret);
54 leaf = path->nodes[0];
55 item = btrfs_item_ptr(leaf, path->slots[0],
56 struct btrfs_file_extent_item);
57 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
58 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
59 btrfs_set_file_extent_offset(leaf, item, offset);
60 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
61 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
62 btrfs_set_file_extent_generation(leaf, item, trans->transid);
63 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
64 btrfs_set_file_extent_compression(leaf, item, compression);
65 btrfs_set_file_extent_encryption(leaf, item, encryption);
66 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
67
68 btrfs_mark_buffer_dirty(leaf);
69out:
70 btrfs_free_path(path);
71 return ret;
72}
73
74struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
75 struct btrfs_root *root,
76 struct btrfs_path *path,
77 u64 bytenr, int cow)
78{
79 int ret;
80 struct btrfs_key file_key;
81 struct btrfs_key found_key;
82 struct btrfs_csum_item *item;
83 struct extent_buffer *leaf;
84 u64 csum_offset = 0;
85 u16 csum_size =
86 btrfs_super_csum_size(&root->fs_info->super_copy);
87 int csums_in_item;
88
89 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
90 file_key.offset = bytenr;
91 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
92 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
93 if (ret < 0)
94 goto fail;
95 leaf = path->nodes[0];
96 if (ret > 0) {
97 ret = 1;
98 if (path->slots[0] == 0)
99 goto fail;
100 path->slots[0]--;
101 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
102 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
103 goto fail;
104
105 csum_offset = (bytenr - found_key.offset) >>
106 root->fs_info->sb->s_blocksize_bits;
107 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
108 csums_in_item /= csum_size;
109
110 if (csum_offset >= csums_in_item) {
111 ret = -EFBIG;
112 goto fail;
113 }
114 }
115 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
116 item = (struct btrfs_csum_item *)((unsigned char *)item +
117 csum_offset * csum_size);
118 return item;
119fail:
120 if (ret > 0)
121 ret = -ENOENT;
122 return ERR_PTR(ret);
123}
124
125
126int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
127 struct btrfs_root *root,
128 struct btrfs_path *path, u64 objectid,
129 u64 offset, int mod)
130{
131 int ret;
132 struct btrfs_key file_key;
133 int ins_len = mod < 0 ? -1 : 0;
134 int cow = mod != 0;
135
136 file_key.objectid = objectid;
137 file_key.offset = offset;
138 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
139 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
140 return ret;
141}
142
143
144int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
145 struct bio *bio, u32 *dst)
146{
147 u32 sum;
148 struct bio_vec *bvec = bio->bi_io_vec;
149 int bio_index = 0;
150 u64 offset;
151 u64 item_start_offset = 0;
152 u64 item_last_offset = 0;
153 u64 disk_bytenr;
154 u32 diff;
155 u16 csum_size =
156 btrfs_super_csum_size(&root->fs_info->super_copy);
157 int ret;
158 struct btrfs_path *path;
159 struct btrfs_csum_item *item = NULL;
160 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
161
162 path = btrfs_alloc_path();
163 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
164 path->reada = 2;
165
166 WARN_ON(bio->bi_vcnt <= 0);
167
168 disk_bytenr = (u64)bio->bi_sector << 9;
169 while (bio_index < bio->bi_vcnt) {
170 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
171 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
172 if (ret == 0)
173 goto found;
174
175 if (!item || disk_bytenr < item_start_offset ||
176 disk_bytenr >= item_last_offset) {
177 struct btrfs_key found_key;
178 u32 item_size;
179
180 if (item)
181 btrfs_release_path(root, path);
182 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
183 path, disk_bytenr, 0);
184 if (IS_ERR(item)) {
185 ret = PTR_ERR(item);
186 if (ret == -ENOENT || ret == -EFBIG)
187 ret = 0;
188 sum = 0;
189 if (BTRFS_I(inode)->root->root_key.objectid ==
190 BTRFS_DATA_RELOC_TREE_OBJECTID) {
191 set_extent_bits(io_tree, offset,
192 offset + bvec->bv_len - 1,
193 EXTENT_NODATASUM, GFP_NOFS);
194 } else {
195 printk(KERN_INFO "btrfs no csum found "
196 "for inode %lu start %llu\n",
197 inode->i_ino,
198 (unsigned long long)offset);
199 }
200 item = NULL;
201 btrfs_release_path(root, path);
202 goto found;
203 }
204 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
205 path->slots[0]);
206
207 item_start_offset = found_key.offset;
208 item_size = btrfs_item_size_nr(path->nodes[0],
209 path->slots[0]);
210 item_last_offset = item_start_offset +
211 (item_size / csum_size) *
212 root->sectorsize;
213 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
214 struct btrfs_csum_item);
215 }
216 /*
217 * this byte range must be able to fit inside
218 * a single leaf so it will also fit inside a u32
219 */
220 diff = disk_bytenr - item_start_offset;
221 diff = diff / root->sectorsize;
222 diff = diff * csum_size;
223
224 read_extent_buffer(path->nodes[0], &sum,
225 ((unsigned long)item) + diff,
226 csum_size);
227found:
228 if (dst)
229 *dst++ = sum;
230 else
231 set_state_private(io_tree, offset, sum);
232 disk_bytenr += bvec->bv_len;
233 bio_index++;
234 bvec++;
235 }
236 btrfs_free_path(path);
237 return 0;
238}
239
240int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
241 struct list_head *list)
242{
243 struct btrfs_key key;
244 struct btrfs_path *path;
245 struct extent_buffer *leaf;
246 struct btrfs_ordered_sum *sums;
247 struct btrfs_sector_sum *sector_sum;
248 struct btrfs_csum_item *item;
249 unsigned long offset;
250 int ret;
251 size_t size;
252 u64 csum_end;
253 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
254
255 path = btrfs_alloc_path();
256 BUG_ON(!path);
257
258 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
259 key.offset = start;
260 key.type = BTRFS_EXTENT_CSUM_KEY;
261
262 ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
263 &key, path, 0, 0);
264 if (ret < 0)
265 goto fail;
266 if (ret > 0 && path->slots[0] > 0) {
267 leaf = path->nodes[0];
268 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
269 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
270 key.type == BTRFS_EXTENT_CSUM_KEY) {
271 offset = (start - key.offset) >>
272 root->fs_info->sb->s_blocksize_bits;
273 if (offset * csum_size <
274 btrfs_item_size_nr(leaf, path->slots[0] - 1))
275 path->slots[0]--;
276 }
277 }
278
279 while (start <= end) {
280 leaf = path->nodes[0];
281 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
282 ret = btrfs_next_leaf(root->fs_info->csum_root, path);
283 if (ret < 0)
284 goto fail;
285 if (ret > 0)
286 break;
287 leaf = path->nodes[0];
288 }
289
290 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
291 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
292 key.type != BTRFS_EXTENT_CSUM_KEY)
293 break;
294
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 if (key.offset > end)
297 break;
298
299 if (key.offset > start)
300 start = key.offset;
301
302 size = btrfs_item_size_nr(leaf, path->slots[0]);
303 csum_end = key.offset + (size / csum_size) * root->sectorsize;
304 if (csum_end <= start) {
305 path->slots[0]++;
306 continue;
307 }
308
309 size = min(csum_end, end + 1) - start;
310 sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
311 BUG_ON(!sums);
312
313 sector_sum = sums->sums;
314 sums->bytenr = start;
315 sums->len = size;
316
317 offset = (start - key.offset) >>
318 root->fs_info->sb->s_blocksize_bits;
319 offset *= csum_size;
320
321 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
322 struct btrfs_csum_item);
323 while (size > 0) {
324 read_extent_buffer(path->nodes[0], &sector_sum->sum,
325 ((unsigned long)item) + offset,
326 csum_size);
327 sector_sum->bytenr = start;
328
329 size -= root->sectorsize;
330 start += root->sectorsize;
331 offset += csum_size;
332 sector_sum++;
333 }
334 list_add_tail(&sums->list, list);
335
336 path->slots[0]++;
337 }
338 ret = 0;
339fail:
340 btrfs_free_path(path);
341 return ret;
342}
343
344int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
345 struct bio *bio, u64 file_start, int contig)
346{
347 struct btrfs_ordered_sum *sums;
348 struct btrfs_sector_sum *sector_sum;
349 struct btrfs_ordered_extent *ordered;
350 char *data;
351 struct bio_vec *bvec = bio->bi_io_vec;
352 int bio_index = 0;
353 unsigned long total_bytes = 0;
354 unsigned long this_sum_bytes = 0;
355 u64 offset;
356 u64 disk_bytenr;
357
358 WARN_ON(bio->bi_vcnt <= 0);
359 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
360 if (!sums)
361 return -ENOMEM;
362
363 sector_sum = sums->sums;
364 disk_bytenr = (u64)bio->bi_sector << 9;
365 sums->len = bio->bi_size;
366 INIT_LIST_HEAD(&sums->list);
367
368 if (contig)
369 offset = file_start;
370 else
371 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
372
373 ordered = btrfs_lookup_ordered_extent(inode, offset);
374 BUG_ON(!ordered);
375 sums->bytenr = ordered->start;
376
377 while (bio_index < bio->bi_vcnt) {
378 if (!contig)
379 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
380
381 if (!contig && (offset >= ordered->file_offset + ordered->len ||
382 offset < ordered->file_offset)) {
383 unsigned long bytes_left;
384 sums->len = this_sum_bytes;
385 this_sum_bytes = 0;
386 btrfs_add_ordered_sum(inode, ordered, sums);
387 btrfs_put_ordered_extent(ordered);
388
389 bytes_left = bio->bi_size - total_bytes;
390
391 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
392 GFP_NOFS);
393 BUG_ON(!sums);
394 sector_sum = sums->sums;
395 sums->len = bytes_left;
396 ordered = btrfs_lookup_ordered_extent(inode, offset);
397 BUG_ON(!ordered);
398 sums->bytenr = ordered->start;
399 }
400
401 data = kmap_atomic(bvec->bv_page, KM_USER0);
402 sector_sum->sum = ~(u32)0;
403 sector_sum->sum = btrfs_csum_data(root,
404 data + bvec->bv_offset,
405 sector_sum->sum,
406 bvec->bv_len);
407 kunmap_atomic(data, KM_USER0);
408 btrfs_csum_final(sector_sum->sum,
409 (char *)&sector_sum->sum);
410 sector_sum->bytenr = disk_bytenr;
411
412 sector_sum++;
413 bio_index++;
414 total_bytes += bvec->bv_len;
415 this_sum_bytes += bvec->bv_len;
416 disk_bytenr += bvec->bv_len;
417 offset += bvec->bv_len;
418 bvec++;
419 }
420 this_sum_bytes = 0;
421 btrfs_add_ordered_sum(inode, ordered, sums);
422 btrfs_put_ordered_extent(ordered);
423 return 0;
424}
425
426/*
427 * helper function for csum removal, this expects the
428 * key to describe the csum pointed to by the path, and it expects
429 * the csum to overlap the range [bytenr, len]
430 *
431 * The csum should not be entirely contained in the range and the
432 * range should not be entirely contained in the csum.
433 *
434 * This calls btrfs_truncate_item with the correct args based on the
435 * overlap, and fixes up the key as required.
436 */
437static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
438 struct btrfs_root *root,
439 struct btrfs_path *path,
440 struct btrfs_key *key,
441 u64 bytenr, u64 len)
442{
443 struct extent_buffer *leaf;
444 u16 csum_size =
445 btrfs_super_csum_size(&root->fs_info->super_copy);
446 u64 csum_end;
447 u64 end_byte = bytenr + len;
448 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
449 int ret;
450
451 leaf = path->nodes[0];
452 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
453 csum_end <<= root->fs_info->sb->s_blocksize_bits;
454 csum_end += key->offset;
455
456 if (key->offset < bytenr && csum_end <= end_byte) {
457 /*
458 * [ bytenr - len ]
459 * [ ]
460 * [csum ]
461 * A simple truncate off the end of the item
462 */
463 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
464 new_size *= csum_size;
465 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
466 BUG_ON(ret);
467 } else if (key->offset >= bytenr && csum_end > end_byte &&
468 end_byte > key->offset) {
469 /*
470 * [ bytenr - len ]
471 * [ ]
472 * [csum ]
473 * we need to truncate from the beginning of the csum
474 */
475 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
476 new_size *= csum_size;
477
478 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
479 BUG_ON(ret);
480
481 key->offset = end_byte;
482 ret = btrfs_set_item_key_safe(trans, root, path, key);
483 BUG_ON(ret);
484 } else {
485 BUG();
486 }
487 return 0;
488}
489
490/*
491 * deletes the csum items from the csum tree for a given
492 * range of bytes.
493 */
494int btrfs_del_csums(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root, u64 bytenr, u64 len)
496{
497 struct btrfs_path *path;
498 struct btrfs_key key;
499 u64 end_byte = bytenr + len;
500 u64 csum_end;
501 struct extent_buffer *leaf;
502 int ret;
503 u16 csum_size =
504 btrfs_super_csum_size(&root->fs_info->super_copy);
505 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
506
507 root = root->fs_info->csum_root;
508
509 path = btrfs_alloc_path();
510
511 while (1) {
512 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
513 key.offset = end_byte - 1;
514 key.type = BTRFS_EXTENT_CSUM_KEY;
515
516 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
517 if (ret > 0) {
518 if (path->slots[0] == 0)
519 goto out;
520 path->slots[0]--;
521 }
522 leaf = path->nodes[0];
523 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
524
525 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
526 key.type != BTRFS_EXTENT_CSUM_KEY) {
527 break;
528 }
529
530 if (key.offset >= end_byte)
531 break;
532
533 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
534 csum_end <<= blocksize_bits;
535 csum_end += key.offset;
536
537 /* this csum ends before we start, we're done */
538 if (csum_end <= bytenr)
539 break;
540
541 /* delete the entire item, it is inside our range */
542 if (key.offset >= bytenr && csum_end <= end_byte) {
543 ret = btrfs_del_item(trans, root, path);
544 BUG_ON(ret);
545 if (key.offset == bytenr)
546 break;
547 } else if (key.offset < bytenr && csum_end > end_byte) {
548 unsigned long offset;
549 unsigned long shift_len;
550 unsigned long item_offset;
551 /*
552 * [ bytenr - len ]
553 * [csum ]
554 *
555 * Our bytes are in the middle of the csum,
556 * we need to split this item and insert a new one.
557 *
558 * But we can't drop the path because the
559 * csum could change, get removed, extended etc.
560 *
561 * The trick here is the max size of a csum item leaves
562 * enough room in the tree block for a single
563 * item header. So, we split the item in place,
564 * adding a new header pointing to the existing
565 * bytes. Then we loop around again and we have
566 * a nicely formed csum item that we can neatly
567 * truncate.
568 */
569 offset = (bytenr - key.offset) >> blocksize_bits;
570 offset *= csum_size;
571
572 shift_len = (len >> blocksize_bits) * csum_size;
573
574 item_offset = btrfs_item_ptr_offset(leaf,
575 path->slots[0]);
576
577 memset_extent_buffer(leaf, 0, item_offset + offset,
578 shift_len);
579 key.offset = bytenr;
580
581 /*
582 * btrfs_split_item returns -EAGAIN when the
583 * item changed size or key
584 */
585 ret = btrfs_split_item(trans, root, path, &key, offset);
586 BUG_ON(ret && ret != -EAGAIN);
587
588 key.offset = end_byte - 1;
589 } else {
590 ret = truncate_one_csum(trans, root, path,
591 &key, bytenr, len);
592 BUG_ON(ret);
593 if (key.offset < bytenr)
594 break;
595 }
596 btrfs_release_path(root, path);
597 }
598out:
599 btrfs_free_path(path);
600 return 0;
601}
602
603int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
604 struct btrfs_root *root,
605 struct btrfs_ordered_sum *sums)
606{
607 u64 bytenr;
608 int ret;
609 struct btrfs_key file_key;
610 struct btrfs_key found_key;
611 u64 next_offset;
612 u64 total_bytes = 0;
613 int found_next;
614 struct btrfs_path *path;
615 struct btrfs_csum_item *item;
616 struct btrfs_csum_item *item_end;
617 struct extent_buffer *leaf = NULL;
618 u64 csum_offset;
619 struct btrfs_sector_sum *sector_sum;
620 u32 nritems;
621 u32 ins_size;
622 char *eb_map;
623 char *eb_token;
624 unsigned long map_len;
625 unsigned long map_start;
626 u16 csum_size =
627 btrfs_super_csum_size(&root->fs_info->super_copy);
628
629 path = btrfs_alloc_path();
630 BUG_ON(!path);
631 sector_sum = sums->sums;
632again:
633 next_offset = (u64)-1;
634 found_next = 0;
635 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
636 file_key.offset = sector_sum->bytenr;
637 bytenr = sector_sum->bytenr;
638 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
639
640 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
641 if (!IS_ERR(item)) {
642 leaf = path->nodes[0];
643 ret = 0;
644 goto found;
645 }
646 ret = PTR_ERR(item);
647 if (ret == -EFBIG) {
648 u32 item_size;
649 /* we found one, but it isn't big enough yet */
650 leaf = path->nodes[0];
651 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
652 if ((item_size / csum_size) >=
653 MAX_CSUM_ITEMS(root, csum_size)) {
654 /* already at max size, make a new one */
655 goto insert;
656 }
657 } else {
658 int slot = path->slots[0] + 1;
659 /* we didn't find a csum item, insert one */
660 nritems = btrfs_header_nritems(path->nodes[0]);
661 if (path->slots[0] >= nritems - 1) {
662 ret = btrfs_next_leaf(root, path);
663 if (ret == 1)
664 found_next = 1;
665 if (ret != 0)
666 goto insert;
667 slot = 0;
668 }
669 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
670 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
671 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
672 found_next = 1;
673 goto insert;
674 }
675 next_offset = found_key.offset;
676 found_next = 1;
677 goto insert;
678 }
679
680 /*
681 * at this point, we know the tree has an item, but it isn't big
682 * enough yet to put our csum in. Grow it
683 */
684 btrfs_release_path(root, path);
685 ret = btrfs_search_slot(trans, root, &file_key, path,
686 csum_size, 1);
687 if (ret < 0)
688 goto fail_unlock;
689
690 if (ret > 0) {
691 if (path->slots[0] == 0)
692 goto insert;
693 path->slots[0]--;
694 }
695
696 leaf = path->nodes[0];
697 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
698 csum_offset = (bytenr - found_key.offset) >>
699 root->fs_info->sb->s_blocksize_bits;
700
701 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
702 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
703 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
704 goto insert;
705 }
706
707 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
708 csum_size) {
709 u32 diff = (csum_offset + 1) * csum_size;
710
711 /*
712 * is the item big enough already? we dropped our lock
713 * before and need to recheck
714 */
715 if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
716 goto csum;
717
718 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
719 if (diff != csum_size)
720 goto insert;
721
722 ret = btrfs_extend_item(trans, root, path, diff);
723 BUG_ON(ret);
724 goto csum;
725 }
726
727insert:
728 btrfs_release_path(root, path);
729 csum_offset = 0;
730 if (found_next) {
731 u64 tmp = total_bytes + root->sectorsize;
732 u64 next_sector = sector_sum->bytenr;
733 struct btrfs_sector_sum *next = sector_sum + 1;
734
735 while (tmp < sums->len) {
736 if (next_sector + root->sectorsize != next->bytenr)
737 break;
738 tmp += root->sectorsize;
739 next_sector = next->bytenr;
740 next++;
741 }
742 tmp = min(tmp, next_offset - file_key.offset);
743 tmp >>= root->fs_info->sb->s_blocksize_bits;
744 tmp = max((u64)1, tmp);
745 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
746 ins_size = csum_size * tmp;
747 } else {
748 ins_size = csum_size;
749 }
750 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
751 ins_size);
752 if (ret < 0)
753 goto fail_unlock;
754 if (ret != 0) {
755 WARN_ON(1);
756 goto fail_unlock;
757 }
758csum:
759 leaf = path->nodes[0];
760 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
761 ret = 0;
762 item = (struct btrfs_csum_item *)((unsigned char *)item +
763 csum_offset * csum_size);
764found:
765 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
766 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
767 btrfs_item_size_nr(leaf, path->slots[0]));
768 eb_token = NULL;
769 cond_resched();
770next_sector:
771
772 if (!eb_token ||
773 (unsigned long)item + csum_size >= map_start + map_len) {
774 int err;
775
776 if (eb_token)
777 unmap_extent_buffer(leaf, eb_token, KM_USER1);
778 eb_token = NULL;
779 err = map_private_extent_buffer(leaf, (unsigned long)item,
780 csum_size,
781 &eb_token, &eb_map,
782 &map_start, &map_len, KM_USER1);
783 if (err)
784 eb_token = NULL;
785 }
786 if (eb_token) {
787 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
788 &sector_sum->sum, csum_size);
789 } else {
790 write_extent_buffer(leaf, &sector_sum->sum,
791 (unsigned long)item, csum_size);
792 }
793
794 total_bytes += root->sectorsize;
795 sector_sum++;
796 if (total_bytes < sums->len) {
797 item = (struct btrfs_csum_item *)((char *)item +
798 csum_size);
799 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
800 sector_sum->bytenr) {
801 bytenr = sector_sum->bytenr;
802 goto next_sector;
803 }
804 }
805 if (eb_token) {
806 unmap_extent_buffer(leaf, eb_token, KM_USER1);
807 eb_token = NULL;
808 }
809 btrfs_mark_buffer_dirty(path->nodes[0]);
810 cond_resched();
811 if (total_bytes < sums->len) {
812 btrfs_release_path(root, path);
813 goto again;
814 }
815out:
816 btrfs_free_path(path);
817 return ret;
818
819fail_unlock:
820 goto out;
821}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..0e3a13a45653
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1292 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
162
163/*
164 * this drops all the extents in the cache that intersect the range
165 * [start, end]. Existing extents are split as required.
166 */
167int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
168 int skip_pinned)
169{
170 struct extent_map *em;
171 struct extent_map *split = NULL;
172 struct extent_map *split2 = NULL;
173 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
174 u64 len = end - start + 1;
175 int ret;
176 int testend = 1;
177 unsigned long flags;
178 int compressed = 0;
179
180 WARN_ON(end < start);
181 if (end == (u64)-1) {
182 len = (u64)-1;
183 testend = 0;
184 }
185 while (1) {
186 if (!split)
187 split = alloc_extent_map(GFP_NOFS);
188 if (!split2)
189 split2 = alloc_extent_map(GFP_NOFS);
190
191 spin_lock(&em_tree->lock);
192 em = lookup_extent_mapping(em_tree, start, len);
193 if (!em) {
194 spin_unlock(&em_tree->lock);
195 break;
196 }
197 flags = em->flags;
198 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
199 spin_unlock(&em_tree->lock);
200 if (em->start <= start &&
201 (!testend || em->start + em->len >= start + len)) {
202 free_extent_map(em);
203 break;
204 }
205 if (start < em->start) {
206 len = em->start - start;
207 } else {
208 len = start + len - (em->start + em->len);
209 start = em->start + em->len;
210 }
211 free_extent_map(em);
212 continue;
213 }
214 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
215 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
216 remove_extent_mapping(em_tree, em);
217
218 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
219 em->start < start) {
220 split->start = em->start;
221 split->len = start - em->start;
222 split->orig_start = em->orig_start;
223 split->block_start = em->block_start;
224
225 if (compressed)
226 split->block_len = em->block_len;
227 else
228 split->block_len = split->len;
229
230 split->bdev = em->bdev;
231 split->flags = flags;
232 ret = add_extent_mapping(em_tree, split);
233 BUG_ON(ret);
234 free_extent_map(split);
235 split = split2;
236 split2 = NULL;
237 }
238 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
239 testend && em->start + em->len > start + len) {
240 u64 diff = start + len - em->start;
241
242 split->start = start + len;
243 split->len = em->start + em->len - (start + len);
244 split->bdev = em->bdev;
245 split->flags = flags;
246
247 if (compressed) {
248 split->block_len = em->block_len;
249 split->block_start = em->block_start;
250 split->orig_start = em->orig_start;
251 } else {
252 split->block_len = split->len;
253 split->block_start = em->block_start + diff;
254 split->orig_start = split->start;
255 }
256
257 ret = add_extent_mapping(em_tree, split);
258 BUG_ON(ret);
259 free_extent_map(split);
260 split = NULL;
261 }
262 spin_unlock(&em_tree->lock);
263
264 /* once for us */
265 free_extent_map(em);
266 /* once for the tree*/
267 free_extent_map(em);
268 }
269 if (split)
270 free_extent_map(split);
271 if (split2)
272 free_extent_map(split2);
273 return 0;
274}
275
/*
 * Consistency check hook for the file extent items of @inode.
 *
 * The check is currently disabled: the function does nothing and always
 * returns 0.  The former implementation sat behind "#if 0" after an
 * unconditional return, so it was never compiled or executed.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
355 * in the range start - end. hint_block is filled in with a block number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 if (!compression && !encryption) {
560 btrfs_set_file_extent_ram_bytes(leaf,
561 extent, new_num);
562 }
563 btrfs_set_file_extent_num_bytes(leaf,
564 extent, new_num);
565 btrfs_mark_buffer_dirty(leaf);
566 } else if (key.offset < inline_limit &&
567 (end > extent_end) &&
568 (inline_limit < extent_end)) {
569 u32 new_size;
570 new_size = btrfs_file_extent_calc_inline_size(
571 inline_limit - key.offset);
572 inode_sub_bytes(inode, extent_end -
573 inline_limit);
574 btrfs_set_file_extent_ram_bytes(leaf, extent,
575 new_size);
576 if (!compression && !encryption) {
577 btrfs_truncate_item(trans, root, path,
578 new_size, 1);
579 }
580 }
581 }
582 /* delete the entire extent */
583 if (!keep) {
584 if (found_inline)
585 inode_sub_bytes(inode, extent_end -
586 key.offset);
587 ret = btrfs_del_item(trans, root, path);
588 /* TODO update progress marker and return */
589 BUG_ON(ret);
590 extent = NULL;
591 btrfs_release_path(root, path);
592 /* the extent will be freed later */
593 }
594 if (bookend && found_inline && start <= key.offset) {
595 u32 new_size;
596 new_size = btrfs_file_extent_calc_inline_size(
597 extent_end - end);
598 inode_sub_bytes(inode, end - key.offset);
599 btrfs_set_file_extent_ram_bytes(leaf, extent,
600 new_size);
601 if (!compression && !encryption)
602 ret = btrfs_truncate_item(trans, root, path,
603 new_size, 0);
604 BUG_ON(ret);
605 }
606 /* create bookend, splitting the extent in two */
607 if (bookend && found_extent) {
608 struct btrfs_key ins;
609 ins.objectid = inode->i_ino;
610 ins.offset = end;
611 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
612
613 btrfs_release_path(root, path);
614 ret = btrfs_insert_empty_item(trans, root, path, &ins,
615 sizeof(*extent));
616 BUG_ON(ret);
617
618 leaf = path->nodes[0];
619 extent = btrfs_item_ptr(leaf, path->slots[0],
620 struct btrfs_file_extent_item);
621 write_extent_buffer(leaf, &old,
622 (unsigned long)extent, sizeof(old));
623
624 btrfs_set_file_extent_compression(leaf, extent,
625 compression);
626 btrfs_set_file_extent_encryption(leaf, extent,
627 encryption);
628 btrfs_set_file_extent_other_encoding(leaf, extent,
629 other_encoding);
630 btrfs_set_file_extent_offset(leaf, extent,
631 le64_to_cpu(old.offset) + end - key.offset);
632 WARN_ON(le64_to_cpu(old.num_bytes) <
633 (extent_end - end));
634 btrfs_set_file_extent_num_bytes(leaf, extent,
635 extent_end - end);
636
637 /*
638 * set the ram bytes to the size of the full extent
639 * before splitting. This is a worst case flag,
640 * but its the best we can do because we don't know
641 * how splitting affects compression
642 */
643 btrfs_set_file_extent_ram_bytes(leaf, extent,
644 ram_bytes);
645 btrfs_set_file_extent_type(leaf, extent, found_type);
646
647 btrfs_mark_buffer_dirty(path->nodes[0]);
648
649 if (disk_bytenr != 0) {
650 ret = btrfs_update_extent_ref(trans, root,
651 disk_bytenr, orig_parent,
652 leaf->start,
653 root->root_key.objectid,
654 trans->transid, ins.objectid);
655
656 BUG_ON(ret);
657 }
658 btrfs_release_path(root, path);
659 if (disk_bytenr != 0)
660 inode_add_bytes(inode, extent_end - end);
661 }
662
663 if (found_extent && !keep) {
664 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
665
666 if (old_disk_bytenr != 0) {
667 inode_sub_bytes(inode,
668 le64_to_cpu(old.num_bytes));
669 ret = btrfs_free_extent(trans, root,
670 old_disk_bytenr,
671 le64_to_cpu(old.disk_num_bytes),
672 leaf_start, root_owner,
673 root_gen, key.objectid, 0);
674 BUG_ON(ret);
675 *hint_byte = old_disk_bytenr;
676 }
677 }
678
679 if (search_start >= end) {
680 ret = 0;
681 goto out;
682 }
683 }
684out:
685 btrfs_free_path(path);
686 if (locked_end > end) {
687 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
688 GFP_NOFS);
689 }
690 btrfs_check_file(root, inode);
691 return ret;
692}
693
694static int extent_mergeable(struct extent_buffer *leaf, int slot,
695 u64 objectid, u64 bytenr, u64 *start, u64 *end)
696{
697 struct btrfs_file_extent_item *fi;
698 struct btrfs_key key;
699 u64 extent_end;
700
701 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
702 return 0;
703
704 btrfs_item_key_to_cpu(leaf, &key, slot);
705 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
706 return 0;
707
708 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
709 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
710 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
711 btrfs_file_extent_compression(leaf, fi) ||
712 btrfs_file_extent_encryption(leaf, fi) ||
713 btrfs_file_extent_other_encoding(leaf, fi))
714 return 0;
715
716 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
717 if ((*start && *start != key.offset) || (*end && *end != extent_end))
718 return 0;
719
720 *start = key.offset;
721 *end = extent_end;
722 return 1;
723}
724
725/*
726 * Mark extent in the range start - end as written.
727 *
728 * This changes extent type from 'pre-allocated' to 'regular'. If only
729 * part of extent is marked as written, the extent will be split into
730 * two or three.
731 */
732int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
733 struct btrfs_root *root,
734 struct inode *inode, u64 start, u64 end)
735{
736 struct extent_buffer *leaf;
737 struct btrfs_path *path;
738 struct btrfs_file_extent_item *fi;
739 struct btrfs_key key;
740 u64 bytenr;
741 u64 num_bytes;
742 u64 extent_end;
743 u64 extent_offset;
744 u64 other_start;
745 u64 other_end;
746 u64 split = start;
747 u64 locked_end = end;
748 u64 orig_parent;
749 int extent_type;
750 int split_end = 1;
751 int ret;
752
753 btrfs_drop_extent_cache(inode, start, end - 1, 0);
754
755 path = btrfs_alloc_path();
756 BUG_ON(!path);
757again:
758 key.objectid = inode->i_ino;
759 key.type = BTRFS_EXTENT_DATA_KEY;
760 if (split == start)
761 key.offset = split;
762 else
763 key.offset = split - 1;
764
765 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
766 if (ret > 0 && path->slots[0] > 0)
767 path->slots[0]--;
768
769 leaf = path->nodes[0];
770 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
771 BUG_ON(key.objectid != inode->i_ino ||
772 key.type != BTRFS_EXTENT_DATA_KEY);
773 fi = btrfs_item_ptr(leaf, path->slots[0],
774 struct btrfs_file_extent_item);
775 extent_type = btrfs_file_extent_type(leaf, fi);
776 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
777 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
778 BUG_ON(key.offset > start || extent_end < end);
779
780 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
781 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
782 extent_offset = btrfs_file_extent_offset(leaf, fi);
783
784 if (key.offset == start)
785 split = end;
786
787 if (key.offset == start && extent_end == end) {
788 int del_nr = 0;
789 int del_slot = 0;
790 u64 leaf_owner = btrfs_header_owner(leaf);
791 u64 leaf_gen = btrfs_header_generation(leaf);
792 other_start = end;
793 other_end = 0;
794 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
795 bytenr, &other_start, &other_end)) {
796 extent_end = other_end;
797 del_slot = path->slots[0] + 1;
798 del_nr++;
799 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
800 leaf->start, leaf_owner,
801 leaf_gen, inode->i_ino, 0);
802 BUG_ON(ret);
803 }
804 other_start = 0;
805 other_end = start;
806 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
807 bytenr, &other_start, &other_end)) {
808 key.offset = other_start;
809 del_slot = path->slots[0];
810 del_nr++;
811 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
812 leaf->start, leaf_owner,
813 leaf_gen, inode->i_ino, 0);
814 BUG_ON(ret);
815 }
816 split_end = 0;
817 if (del_nr == 0) {
818 btrfs_set_file_extent_type(leaf, fi,
819 BTRFS_FILE_EXTENT_REG);
820 goto done;
821 }
822
823 fi = btrfs_item_ptr(leaf, del_slot - 1,
824 struct btrfs_file_extent_item);
825 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
826 btrfs_set_file_extent_num_bytes(leaf, fi,
827 extent_end - key.offset);
828 btrfs_mark_buffer_dirty(leaf);
829
830 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
831 BUG_ON(ret);
832 goto done;
833 } else if (split == start) {
834 if (locked_end < extent_end) {
835 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 if (!ret) {
838 btrfs_release_path(root, path);
839 lock_extent(&BTRFS_I(inode)->io_tree,
840 locked_end, extent_end - 1, GFP_NOFS);
841 locked_end = extent_end;
842 goto again;
843 }
844 locked_end = extent_end;
845 }
846 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
847 extent_offset += split - key.offset;
848 } else {
849 BUG_ON(key.offset != start);
850 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
851 split - key.offset);
852 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
853 key.offset = split;
854 btrfs_set_item_key_safe(trans, root, path, &key);
855 extent_end = split;
856 }
857
858 if (extent_end == end) {
859 split_end = 0;
860 extent_type = BTRFS_FILE_EXTENT_REG;
861 }
862 if (extent_end == end && split == start) {
863 other_start = end;
864 other_end = 0;
865 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
866 bytenr, &other_start, &other_end)) {
867 path->slots[0]++;
868 fi = btrfs_item_ptr(leaf, path->slots[0],
869 struct btrfs_file_extent_item);
870 key.offset = split;
871 btrfs_set_item_key_safe(trans, root, path, &key);
872 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
873 btrfs_set_file_extent_num_bytes(leaf, fi,
874 other_end - split);
875 goto done;
876 }
877 }
878 if (extent_end == end && split == end) {
879 other_start = 0;
880 other_end = start;
881 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
882 bytenr, &other_start, &other_end)) {
883 path->slots[0]--;
884 fi = btrfs_item_ptr(leaf, path->slots[0],
885 struct btrfs_file_extent_item);
886 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
887 other_start);
888 goto done;
889 }
890 }
891
892 btrfs_mark_buffer_dirty(leaf);
893
894 orig_parent = leaf->start;
895 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
896 orig_parent, root->root_key.objectid,
897 trans->transid, inode->i_ino);
898 BUG_ON(ret);
899 btrfs_release_path(root, path);
900
901 key.offset = start;
902 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
903 BUG_ON(ret);
904
905 leaf = path->nodes[0];
906 fi = btrfs_item_ptr(leaf, path->slots[0],
907 struct btrfs_file_extent_item);
908 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
909 btrfs_set_file_extent_type(leaf, fi, extent_type);
910 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
911 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
912 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
913 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
914 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
915 btrfs_set_file_extent_compression(leaf, fi, 0);
916 btrfs_set_file_extent_encryption(leaf, fi, 0);
917 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
918
919 if (orig_parent != leaf->start) {
920 ret = btrfs_update_extent_ref(trans, root, bytenr,
921 orig_parent, leaf->start,
922 root->root_key.objectid,
923 trans->transid, inode->i_ino);
924 BUG_ON(ret);
925 }
926done:
927 btrfs_mark_buffer_dirty(leaf);
928 btrfs_release_path(root, path);
929 if (split_end && split == start) {
930 split = end;
931 goto again;
932 }
933 if (locked_end > end) {
934 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
935 GFP_NOFS);
936 }
937 btrfs_free_path(path);
938 return 0;
939}
940
941/*
942 * this gets pages into the page cache and locks them down, it also properly
943 * waits for data=ordered extents to finish before allowing the pages to be
944 * modified.
945 */
946static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
947 struct page **pages, size_t num_pages,
948 loff_t pos, unsigned long first_index,
949 unsigned long last_index, size_t write_bytes)
950{
951 int i;
952 unsigned long index = pos >> PAGE_CACHE_SHIFT;
953 struct inode *inode = fdentry(file)->d_inode;
954 int err = 0;
955 u64 start_pos;
956 u64 last_pos;
957
958 start_pos = pos & ~((u64)root->sectorsize - 1);
959 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
960
961 if (start_pos > inode->i_size) {
962 err = btrfs_cont_expand(inode, start_pos);
963 if (err)
964 return err;
965 }
966
967 memset(pages, 0, num_pages * sizeof(struct page *));
968again:
969 for (i = 0; i < num_pages; i++) {
970 pages[i] = grab_cache_page(inode->i_mapping, index + i);
971 if (!pages[i]) {
972 err = -ENOMEM;
973 BUG_ON(1);
974 }
975 wait_on_page_writeback(pages[i]);
976 }
977 if (start_pos < inode->i_size) {
978 struct btrfs_ordered_extent *ordered;
979 lock_extent(&BTRFS_I(inode)->io_tree,
980 start_pos, last_pos - 1, GFP_NOFS);
981 ordered = btrfs_lookup_first_ordered_extent(inode,
982 last_pos - 1);
983 if (ordered &&
984 ordered->file_offset + ordered->len > start_pos &&
985 ordered->file_offset < last_pos) {
986 btrfs_put_ordered_extent(ordered);
987 unlock_extent(&BTRFS_I(inode)->io_tree,
988 start_pos, last_pos - 1, GFP_NOFS);
989 for (i = 0; i < num_pages; i++) {
990 unlock_page(pages[i]);
991 page_cache_release(pages[i]);
992 }
993 btrfs_wait_ordered_range(inode, start_pos,
994 last_pos - start_pos);
995 goto again;
996 }
997 if (ordered)
998 btrfs_put_ordered_extent(ordered);
999
1000 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
1001 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
1002 GFP_NOFS);
1003 unlock_extent(&BTRFS_I(inode)->io_tree,
1004 start_pos, last_pos - 1, GFP_NOFS);
1005 }
1006 for (i = 0; i < num_pages; i++) {
1007 clear_page_dirty_for_io(pages[i]);
1008 set_page_extent_mapped(pages[i]);
1009 WARN_ON(!PageLocked(pages[i]));
1010 }
1011 return 0;
1012}
1013
1014static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1015 size_t count, loff_t *ppos)
1016{
1017 loff_t pos;
1018 loff_t start_pos;
1019 ssize_t num_written = 0;
1020 ssize_t err = 0;
1021 int ret = 0;
1022 struct inode *inode = fdentry(file)->d_inode;
1023 struct btrfs_root *root = BTRFS_I(inode)->root;
1024 struct page **pages = NULL;
1025 int nrptrs;
1026 struct page *pinned[2];
1027 unsigned long first_index;
1028 unsigned long last_index;
1029 int will_write;
1030
1031 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1032 (file->f_flags & O_DIRECT));
1033
1034 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1035 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1036 pinned[0] = NULL;
1037 pinned[1] = NULL;
1038
1039 pos = *ppos;
1040 start_pos = pos;
1041
1042 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1043 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1044 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1045 if (err)
1046 goto out_nolock;
1047 if (count == 0)
1048 goto out_nolock;
1049
1050 err = file_remove_suid(file);
1051 if (err)
1052 goto out_nolock;
1053 file_update_time(file);
1054
1055 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1056
1057 mutex_lock(&inode->i_mutex);
1058 BTRFS_I(inode)->sequence++;
1059 first_index = pos >> PAGE_CACHE_SHIFT;
1060 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1061
1062 /*
1063 * there are lots of better ways to do this, but this code
1064 * makes sure the first and last page in the file range are
1065 * up to date and ready for cow
1066 */
1067 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1068 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1069 if (!PageUptodate(pinned[0])) {
1070 ret = btrfs_readpage(NULL, pinned[0]);
1071 BUG_ON(ret);
1072 wait_on_page_locked(pinned[0]);
1073 } else {
1074 unlock_page(pinned[0]);
1075 }
1076 }
1077 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1078 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1079 if (!PageUptodate(pinned[1])) {
1080 ret = btrfs_readpage(NULL, pinned[1]);
1081 BUG_ON(ret);
1082 wait_on_page_locked(pinned[1]);
1083 } else {
1084 unlock_page(pinned[1]);
1085 }
1086 }
1087
1088 while (count > 0) {
1089 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1090 size_t write_bytes = min(count, nrptrs *
1091 (size_t)PAGE_CACHE_SIZE -
1092 offset);
1093 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1094 PAGE_CACHE_SHIFT;
1095
1096 WARN_ON(num_pages > nrptrs);
1097 memset(pages, 0, sizeof(struct page *) * nrptrs);
1098
1099 ret = btrfs_check_free_space(root, write_bytes, 0);
1100 if (ret)
1101 goto out;
1102
1103 ret = prepare_pages(root, file, pages, num_pages,
1104 pos, first_index, last_index,
1105 write_bytes);
1106 if (ret)
1107 goto out;
1108
1109 ret = btrfs_copy_from_user(pos, num_pages,
1110 write_bytes, pages, buf);
1111 if (ret) {
1112 btrfs_drop_pages(pages, num_pages);
1113 goto out;
1114 }
1115
1116 ret = dirty_and_release_pages(NULL, root, file, pages,
1117 num_pages, pos, write_bytes);
1118 btrfs_drop_pages(pages, num_pages);
1119 if (ret)
1120 goto out;
1121
1122 if (will_write) {
1123 btrfs_fdatawrite_range(inode->i_mapping, pos,
1124 pos + write_bytes - 1,
1125 WB_SYNC_NONE);
1126 } else {
1127 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1128 num_pages);
1129 if (num_pages <
1130 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1131 btrfs_btree_balance_dirty(root, 1);
1132 btrfs_throttle(root);
1133 }
1134
1135 buf += write_bytes;
1136 count -= write_bytes;
1137 pos += write_bytes;
1138 num_written += write_bytes;
1139
1140 cond_resched();
1141 }
1142out:
1143 mutex_unlock(&inode->i_mutex);
1144
1145out_nolock:
1146 kfree(pages);
1147 if (pinned[0])
1148 page_cache_release(pinned[0]);
1149 if (pinned[1])
1150 page_cache_release(pinned[1]);
1151 *ppos = pos;
1152
1153 if (num_written > 0 && will_write) {
1154 struct btrfs_trans_handle *trans;
1155
1156 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1157 if (err)
1158 num_written = err;
1159
1160 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1161 trans = btrfs_start_transaction(root, 1);
1162 ret = btrfs_log_dentry_safe(trans, root,
1163 file->f_dentry);
1164 if (ret == 0) {
1165 btrfs_sync_log(trans, root);
1166 btrfs_end_transaction(trans, root);
1167 } else {
1168 btrfs_commit_transaction(trans, root);
1169 }
1170 }
1171 if (file->f_flags & O_DIRECT) {
1172 invalidate_mapping_pages(inode->i_mapping,
1173 start_pos >> PAGE_CACHE_SHIFT,
1174 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1175 }
1176 }
1177 current->backing_dev_info = NULL;
1178 return num_written ? num_written : err;
1179}
1180
1181int btrfs_release_file(struct inode *inode, struct file *filp)
1182{
1183 if (filp->private_data)
1184 btrfs_ioctl_trans_end(filp);
1185 return 0;
1186}
1187
1188/*
1189 * fsync call for both files and directories. This logs the inode into
1190 * the tree log instead of forcing full commits whenever possible.
1191 *
1192 * It needs to call filemap_fdatawait so that all ordered extent updates are
1193 * in the metadata btree are up to date for copying to the log.
1194 *
1195 * It drops the inode mutex before doing the tree log commit. This is an
1196 * important optimization for directories because holding the mutex prevents
1197 * new operations on the dir while we write to disk.
1198 */
1199int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1200{
1201 struct inode *inode = dentry->d_inode;
1202 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 int ret = 0;
1204 struct btrfs_trans_handle *trans;
1205
1206 /*
1207 * check the transaction that last modified this inode
1208 * and see if its already been committed
1209 */
1210 if (!BTRFS_I(inode)->last_trans)
1211 goto out;
1212
1213 mutex_lock(&root->fs_info->trans_mutex);
1214 if (BTRFS_I(inode)->last_trans <=
1215 root->fs_info->last_trans_committed) {
1216 BTRFS_I(inode)->last_trans = 0;
1217 mutex_unlock(&root->fs_info->trans_mutex);
1218 goto out;
1219 }
1220 mutex_unlock(&root->fs_info->trans_mutex);
1221
1222 root->fs_info->tree_log_batch++;
1223 filemap_fdatawrite(inode->i_mapping);
1224 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1225 root->fs_info->tree_log_batch++;
1226
1227 /*
1228 * ok we haven't committed the transaction yet, lets do a commit
1229 */
1230 if (file->private_data)
1231 btrfs_ioctl_trans_end(file);
1232
1233 trans = btrfs_start_transaction(root, 1);
1234 if (!trans) {
1235 ret = -ENOMEM;
1236 goto out;
1237 }
1238
1239 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1240 if (ret < 0)
1241 goto out;
1242
1243 /* we've logged all the items and now have a consistent
1244 * version of the file in the log. It is possible that
1245 * someone will come in and modify the file, but that's
1246 * fine because the log is consistent on disk, and we
1247 * have references to all of the file's extents
1248 *
1249 * It is possible that someone will come in and log the
1250 * file again, but that will end up using the synchronization
1251 * inside btrfs_sync_log to keep things safe.
1252 */
1253 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1254
1255 if (ret > 0) {
1256 ret = btrfs_commit_transaction(trans, root);
1257 } else {
1258 btrfs_sync_log(trans, root);
1259 ret = btrfs_end_transaction(trans, root);
1260 }
1261 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1262out:
1263 return ret > 0 ? EIO : ret;
1264}
1265
1266static struct vm_operations_struct btrfs_file_vm_ops = {
1267 .fault = filemap_fault,
1268 .page_mkwrite = btrfs_page_mkwrite,
1269};
1270
1271static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1272{
1273 vma->vm_ops = &btrfs_file_vm_ops;
1274 file_accessed(filp);
1275 return 0;
1276}
1277
1278struct file_operations btrfs_file_operations = {
1279 .llseek = generic_file_llseek,
1280 .read = do_sync_read,
1281 .aio_read = generic_file_aio_read,
1282 .splice_read = generic_file_splice_read,
1283 .write = btrfs_file_write,
1284 .mmap = btrfs_file_mmap,
1285 .open = generic_file_open,
1286 .release = btrfs_release_file,
1287 .fsync = btrfs_sync_file,
1288 .unlocked_ioctl = btrfs_ioctl,
1289#ifdef CONFIG_COMPAT
1290 .compat_ioctl = btrfs_ioctl,
1291#endif
1292};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes size, as close to offset that we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 }
225
226 if (left_info) {
227 unlink_free_space(block_group, left_info);
228
229 if (unlikely((left_info->offset + left_info->bytes) !=
230 offset)) {
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 }
251
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info);
265 if (ret)
266 kfree(info);
267out:
268 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST)
271 BUG();
272 }
273
274 kfree(alloc_info);
275
276 return ret;
277}
278
279static int
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
281 u64 offset, u64 bytes)
282{
283 struct btrfs_free_space *info;
284 int ret = 0;
285
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1);
288
289 if (info && info->offset == offset) {
290 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu,"
292 "trying to use %llu\n",
293 (unsigned long long)info->offset,
294 (unsigned long long)info->bytes,
295 (unsigned long long)bytes);
296 WARN_ON(1);
297 ret = -EINVAL;
298 goto out;
299 }
300 unlink_free_space(block_group, info);
301
302 if (info->bytes == bytes) {
303 kfree(info);
304 goto out;
305 }
306
307 info->offset += bytes;
308 info->bytes -= bytes;
309
310 ret = link_free_space(block_group, info);
311 BUG_ON(ret);
312 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) {
314 u64 old_start = info->offset;
315 /*
316 * we're freeing space in the middle of the info,
317 * this can happen during tree log replay
318 *
319 * first unlink the old info and then
320 * insert it again after the hole we're creating
321 */
322 unlink_free_space(block_group, info);
323 if (offset + bytes < info->offset + info->bytes) {
324 u64 old_end = info->offset + info->bytes;
325
326 info->offset = offset + bytes;
327 info->bytes = old_end - info->offset;
328 ret = link_free_space(block_group, info);
329 BUG_ON(ret);
330 } else {
331 /* the hole we're creating ends at the end
332 * of the info struct, just free the info
333 */
334 kfree(info);
335 }
336
337 /* step two, insert a new info struct to cover anything
338 * before the hole
339 */
340 ret = __btrfs_add_free_space(block_group, old_start,
341 offset - old_start);
342 BUG_ON(ret);
343 } else {
344 WARN_ON(1);
345 }
346out:
347 return ret;
348}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
450#if 0
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{
456 struct btrfs_free_space *ret;
457
458 mutex_lock(&block_group->alloc_mutex);
459 ret = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0);
461 mutex_unlock(&block_group->alloc_mutex);
462
463 return ret;
464}
465
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
467 btrfs_block_group_cache
468 *block_group, u64 offset,
469 u64 bytes)
470{
471 struct btrfs_free_space *ret;
472
473 mutex_lock(&block_group->alloc_mutex);
474
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
476 mutex_unlock(&block_group->alloc_mutex);
477
478 return ret;
479}
480#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..1b35ea63b6ce
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5040 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
/*
 * Pairs an inode number with the btrfs root it belongs to.
 * NOTE(review): presumably the lookup argument for the iget helpers
 * further down in inode.c -- confirm against its users.
 */
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
/*
 * File-local operation tables.  They are only declared here, without
 * initializers, so that code below can refer to them.
 */
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
/*
 * kmem_cache pointers.  btrfs_inode_cachep is private to this file;
 * the other four have external linkage.
 */
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
/*
 * Translation table from the file-type bits of a mode to the matching
 * BTRFS_FT_* value.  Entries are indexed by S_IFxxx >> S_SHIFT; slots
 * without an explicit initializer are zero.
 */
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
121
122/*
123 * this does all the hard work for inserting an inline extent into
124 * the btree. The caller should have done a btrfs_drop_extents so that
125 * no overlapping inline items exist in the btree
126 */
127static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
128 struct btrfs_root *root, struct inode *inode,
129 u64 start, size_t size, size_t compressed_size,
130 struct page **compressed_pages)
131{
132 struct btrfs_key key;
133 struct btrfs_path *path;
134 struct extent_buffer *leaf;
135 struct page *page = NULL;
136 char *kaddr;
137 unsigned long ptr;
138 struct btrfs_file_extent_item *ei;
139 int err = 0;
140 int ret;
141 size_t cur_size = size;
142 size_t datasize;
143 unsigned long offset;
144 int use_compress = 0;
145
146 if (compressed_size && compressed_pages) {
147 use_compress = 1;
148 cur_size = compressed_size;
149 }
150
151 path = btrfs_alloc_path();
152 if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 inode_add_bytes(inode, size);
161 datasize = btrfs_file_extent_calc_inline_size(cur_size);
162
163 inode_add_bytes(inode, size);
164 ret = btrfs_insert_empty_item(trans, root, path, &key,
165 datasize);
166 BUG_ON(ret);
167 if (ret) {
168 err = ret;
169 goto fail;
170 }
171 leaf = path->nodes[0];
172 ei = btrfs_item_ptr(leaf, path->slots[0],
173 struct btrfs_file_extent_item);
174 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
175 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
176 btrfs_set_file_extent_encryption(leaf, ei, 0);
177 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
178 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
179 ptr = btrfs_file_extent_inline_start(ei);
180
181 if (use_compress) {
182 struct page *cpage;
183 int i = 0;
184 while (compressed_size > 0) {
185 cpage = compressed_pages[i];
186 cur_size = min_t(unsigned long, compressed_size,
187 PAGE_CACHE_SIZE);
188
189 kaddr = kmap(cpage);
190 write_extent_buffer(leaf, kaddr, ptr, cur_size);
191 kunmap(cpage);
192
193 i++;
194 ptr += cur_size;
195 compressed_size -= cur_size;
196 }
197 btrfs_set_file_extent_compression(leaf, ei,
198 BTRFS_COMPRESS_ZLIB);
199 } else {
200 page = find_get_page(inode->i_mapping,
201 start >> PAGE_CACHE_SHIFT);
202 btrfs_set_file_extent_compression(leaf, ei, 0);
203 kaddr = kmap_atomic(page, KM_USER0);
204 offset = start & (PAGE_CACHE_SIZE - 1);
205 write_extent_buffer(leaf, kaddr + offset, ptr, size);
206 kunmap_atomic(kaddr, KM_USER0);
207 page_cache_release(page);
208 }
209 btrfs_mark_buffer_dirty(leaf);
210 btrfs_free_path(path);
211
212 BTRFS_I(inode)->disk_i_size = inode->i_size;
213 btrfs_update_inode(trans, root, inode);
214 return 0;
215fail:
216 btrfs_free_path(path);
217 return err;
218}
219
220
221/*
222 * conditionally insert an inline extent into the file. This
223 * does the checks required to make sure the data is small enough
224 * to fit as an inline extent.
225 */
226static int cow_file_range_inline(struct btrfs_trans_handle *trans,
227 struct btrfs_root *root,
228 struct inode *inode, u64 start, u64 end,
229 size_t compressed_size,
230 struct page **compressed_pages)
231{
232 u64 isize = i_size_read(inode);
233 u64 actual_end = min(end + 1, isize);
234 u64 inline_len = actual_end - start;
235 u64 aligned_end = (end + root->sectorsize - 1) &
236 ~((u64)root->sectorsize - 1);
237 u64 hint_byte;
238 u64 data_len = inline_len;
239 int ret;
240
241 if (compressed_size)
242 data_len = compressed_size;
243
244 if (start > 0 ||
245 actual_end >= PAGE_CACHE_SIZE ||
246 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
247 (!compressed_size &&
248 (actual_end & (root->sectorsize - 1)) == 0) ||
249 end + 1 < isize ||
250 data_len > root->fs_info->max_inline) {
251 return 1;
252 }
253
254 ret = btrfs_drop_extents(trans, root, inode, start,
255 aligned_end, start, &hint_byte);
256 BUG_ON(ret);
257
258 if (isize > actual_end)
259 inline_len = min_t(u64, isize, actual_end);
260 ret = insert_inline_extent(trans, root, inode, start,
261 inline_len, compressed_size,
262 compressed_pages);
263 BUG_ON(ret);
264 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
265 return 0;
266}
267
/*
 * One unit of work queued by compress_file_range() through
 * add_async_extent() and consumed by submit_compressed_extents().
 * A NULL pages pointer marks a range that fell back to uncompressed IO.
 */
268struct async_extent {
269 u64 start;
270 u64 ram_size;
271 u64 compressed_size;
272 struct page **pages;
273 unsigned long nr_pages;
274 struct list_head list;
275};
276
/*
 * Per-request state for asynchronous COW: the inode and root being
 * written, the page the caller had locked, the byte range, and the
 * list of async_extent entries (linked through async_extent.list)
 * that add_async_extent() appends to.
 * NOTE(review): work is presumably the item handed to the btrfs
 * worker threads -- confirm where it is queued.
 */
277struct async_cow {
278 struct inode *inode;
279 struct btrfs_root *root;
280 struct page *locked_page;
281 u64 start;
282 u64 end;
283 struct list_head extents;
284 struct btrfs_work work;
285};
286
287static noinline int add_async_extent(struct async_cow *cow,
288 u64 start, u64 ram_size,
289 u64 compressed_size,
290 struct page **pages,
291 unsigned long nr_pages)
292{
293 struct async_extent *async_extent;
294
295 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
296 async_extent->start = start;
297 async_extent->ram_size = ram_size;
298 async_extent->compressed_size = compressed_size;
299 async_extent->pages = pages;
300 async_extent->nr_pages = nr_pages;
301 list_add_tail(&async_extent->list, &cow->extents);
302 return 0;
303}
304
305/*
306 * we create compressed extents in two phases. The first
307 * phase compresses a range of pages that have already been
308 * locked (both pages and state bits are locked).
309 *
310 * This is done inside an ordered work queue, and the compression
311 * is spread across many cpus. The actual IO submission is step
312 * two, and the ordered work queue takes care of making sure that
313 * happens in the same order things were put onto the queue by
314 * writepages and friends.
315 *
316 * If this code finds it can't get good compression, it puts an
317 * entry onto the work queue to write the uncompressed bytes. This
318 * makes sure that both compressed inodes and uncompressed inodes
319 * are written in the same order that pdflush sent them down.
320 */
321static noinline int compress_file_range(struct inode *inode,
322 struct page *locked_page,
323 u64 start, u64 end,
324 struct async_cow *async_cow,
325 int *num_added)
326{
327 struct btrfs_root *root = BTRFS_I(inode)->root;
328 struct btrfs_trans_handle *trans;
329 u64 num_bytes;
330 u64 orig_start;
331 u64 disk_num_bytes;
332 u64 blocksize = root->sectorsize;
333 u64 actual_end;
334 u64 isize = i_size_read(inode);
335 int ret = 0;
336 struct page **pages = NULL;
337 unsigned long nr_pages;
338 unsigned long nr_pages_ret = 0;
339 unsigned long total_compressed = 0;
340 unsigned long total_in = 0;
341 unsigned long max_compressed = 128 * 1024;
342 unsigned long max_uncompressed = 128 * 1024;
343 int i;
344 int will_compress;
345
346 orig_start = start;
347
348 actual_end = min_t(u64, isize, end + 1);
349again:
350 will_compress = 0;
351 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
352 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
353
354 total_compressed = actual_end - start;
355
356 /* we want to make sure that amount of ram required to uncompress
357 * an extent is reasonable, so we limit the total size in ram
358 * of a compressed extent to 128k. This is a crucial number
359 * because it also controls how easily we can spread reads across
360 * cpus for decompression.
361 *
362 * We also want to make sure the amount of IO required to do
363 * a random read is reasonably small, so we limit the size of
364 * a compressed extent to 128k.
365 */
366 total_compressed = min(total_compressed, max_uncompressed);
367 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
368 num_bytes = max(blocksize, num_bytes);
369 disk_num_bytes = num_bytes;
370 total_in = 0;
371 ret = 0;
372
373 /*
374 * we do compression for mount -o compress and when the
375 * inode has not been flagged as nocompress. This flag can
376 * change at any time if we discover bad compression ratios.
377 */
378 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
379 btrfs_test_opt(root, COMPRESS)) {
380 WARN_ON(pages);
381 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
382
383 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
384 total_compressed, pages,
385 nr_pages, &nr_pages_ret,
386 &total_in,
387 &total_compressed,
388 max_compressed);
389
390 if (!ret) {
391 unsigned long offset = total_compressed &
392 (PAGE_CACHE_SIZE - 1);
393 struct page *page = pages[nr_pages_ret - 1];
394 char *kaddr;
395
396 /* zero the tail end of the last page, we might be
397 * sending it down to disk
398 */
399 if (offset) {
400 kaddr = kmap_atomic(page, KM_USER0);
401 memset(kaddr + offset, 0,
402 PAGE_CACHE_SIZE - offset);
403 kunmap_atomic(kaddr, KM_USER0);
404 }
405 will_compress = 1;
406 }
407 }
408 if (start == 0) {
409 trans = btrfs_join_transaction(root, 1);
410 BUG_ON(!trans);
411 btrfs_set_trans_block_group(trans, inode);
412
413 /* lets try to make an inline extent */
414 if (ret || total_in < (actual_end - start)) {
415 /* we didn't compress the entire range, try
416 * to make an uncompressed inline extent.
417 */
418 ret = cow_file_range_inline(trans, root, inode,
419 start, end, 0, NULL);
420 } else {
421 /* try making a compressed inline extent */
422 ret = cow_file_range_inline(trans, root, inode,
423 start, end,
424 total_compressed, pages);
425 }
426 btrfs_end_transaction(trans, root);
427 if (ret == 0) {
428 /*
429 * inline extent creation worked, we don't need
430 * to create any more async work items. Unlock
431 * and free up our temp pages.
432 */
433 extent_clear_unlock_delalloc(inode,
434 &BTRFS_I(inode)->io_tree,
435 start, end, NULL, 1, 0,
436 0, 1, 1, 1);
437 ret = 0;
438 goto free_pages_out;
439 }
440 }
441
442 if (will_compress) {
443 /*
444 * we aren't doing an inline extent round the compressed size
445 * up to a block size boundary so the allocator does sane
446 * things
447 */
448 total_compressed = (total_compressed + blocksize - 1) &
449 ~(blocksize - 1);
450
451 /*
452 * one last check to make sure the compression is really a
453 * win, compare the page count read with the blocks on disk
454 */
455 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
456 ~(PAGE_CACHE_SIZE - 1);
457 if (total_compressed >= total_in) {
458 will_compress = 0;
459 } else {
460 disk_num_bytes = total_compressed;
461 num_bytes = total_in;
462 }
463 }
464 if (!will_compress && pages) {
465 /*
466 * the compression code ran but failed to make things smaller,
467 * free any pages it allocated and our page pointer array
468 */
469 for (i = 0; i < nr_pages_ret; i++) {
470 WARN_ON(pages[i]->mapping);
471 page_cache_release(pages[i]);
472 }
473 kfree(pages);
474 pages = NULL;
475 total_compressed = 0;
476 nr_pages_ret = 0;
477
478 /* flag the file so we don't compress in the future */
479 btrfs_set_flag(inode, NOCOMPRESS);
480 }
481 if (will_compress) {
482 *num_added += 1;
483
484 /* the async work queues will take care of doing actual
485 * allocation on disk for these compressed pages,
486 * and will submit them to the elevator.
487 */
488 add_async_extent(async_cow, start, num_bytes,
489 total_compressed, pages, nr_pages_ret);
490
491 if (start + num_bytes < end && start + num_bytes < actual_end) {
492 start += num_bytes;
493 pages = NULL;
494 cond_resched();
495 goto again;
496 }
497 } else {
498 /*
499 * No compression, but we still need to write the pages in
500 * the file we've been given so far. redirty the locked
501 * page if it corresponds to our extent and set things up
502 * for the async work queue to run cow_file_range to do
503 * the normal delalloc dance
504 */
505 if (page_offset(locked_page) >= start &&
506 page_offset(locked_page) <= end) {
507 __set_page_dirty_nobuffers(locked_page);
508 /* unlocked later on in the async handlers */
509 }
510 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511 *num_added += 1;
512 }
513
514out:
515 return 0;
516
517free_pages_out:
518 for (i = 0; i < nr_pages_ret; i++) {
519 WARN_ON(pages[i]->mapping);
520 page_cache_release(pages[i]);
521 }
522 kfree(pages);
523
524 goto out;
525}
526
527/*
528 * phase two of compressed writeback. This is the ordered portion
529 * of the code, which only gets called in the order the work was
530 * queued. We walk all the async extents created by compress_file_range
531 * and send them down to the disk.
532 */
533static noinline int submit_compressed_extents(struct inode *inode,
534 struct async_cow *async_cow)
535{
536 struct async_extent *async_extent;
537 u64 alloc_hint = 0;
538 struct btrfs_trans_handle *trans;
539 struct btrfs_key ins;
540 struct extent_map *em;
541 struct btrfs_root *root = BTRFS_I(inode)->root;
542 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
543 struct extent_io_tree *io_tree;
544 int ret;
545
546 if (list_empty(&async_cow->extents))
547 return 0;
548
549 trans = btrfs_join_transaction(root, 1);
550
551 while (!list_empty(&async_cow->extents)) {
552 async_extent = list_entry(async_cow->extents.next,
553 struct async_extent, list);
554 list_del(&async_extent->list);
555
556 io_tree = &BTRFS_I(inode)->io_tree;
557
558 /* did the compression code fall back to uncompressed IO? */
559 if (!async_extent->pages) {
560 int page_started = 0;
561 unsigned long nr_written = 0;
562
563 lock_extent(io_tree, async_extent->start,
564 async_extent->start +
565 async_extent->ram_size - 1, GFP_NOFS);
566
567 /* allocate blocks */
568 cow_file_range(inode, async_cow->locked_page,
569 async_extent->start,
570 async_extent->start +
571 async_extent->ram_size - 1,
572 &page_started, &nr_written, 0);
573
574 /*
575 * if page_started, cow_file_range inserted an
576 * inline extent and took care of all the unlocking
577 * and IO for us. Otherwise, we need to submit
578 * all those pages down to the drive.
579 */
580 if (!page_started)
581 extent_write_locked_range(io_tree,
582 inode, async_extent->start,
583 async_extent->start +
584 async_extent->ram_size - 1,
585 btrfs_get_extent,
586 WB_SYNC_ALL);
587 kfree(async_extent);
588 cond_resched();
589 continue;
590 }
591
592 lock_extent(io_tree, async_extent->start,
593 async_extent->start + async_extent->ram_size - 1,
594 GFP_NOFS);
595 /*
596 * here we're doing allocation and writeback of the
597 * compressed pages
598 */
599 btrfs_drop_extent_cache(inode, async_extent->start,
600 async_extent->start +
601 async_extent->ram_size - 1, 0);
602
603 ret = btrfs_reserve_extent(trans, root,
604 async_extent->compressed_size,
605 async_extent->compressed_size,
606 0, alloc_hint,
607 (u64)-1, &ins, 1);
608 BUG_ON(ret);
609 em = alloc_extent_map(GFP_NOFS);
610 em->start = async_extent->start;
611 em->len = async_extent->ram_size;
612 em->orig_start = em->start;
613
614 em->block_start = ins.objectid;
615 em->block_len = ins.offset;
616 em->bdev = root->fs_info->fs_devices->latest_bdev;
617 set_bit(EXTENT_FLAG_PINNED, &em->flags);
618 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
619
620 while (1) {
621 spin_lock(&em_tree->lock);
622 ret = add_extent_mapping(em_tree, em);
623 spin_unlock(&em_tree->lock);
624 if (ret != -EEXIST) {
625 free_extent_map(em);
626 break;
627 }
628 btrfs_drop_extent_cache(inode, async_extent->start,
629 async_extent->start +
630 async_extent->ram_size - 1, 0);
631 }
632
633 ret = btrfs_add_ordered_extent(inode, async_extent->start,
634 ins.objectid,
635 async_extent->ram_size,
636 ins.offset,
637 BTRFS_ORDERED_COMPRESSED);
638 BUG_ON(ret);
639
640 btrfs_end_transaction(trans, root);
641
642 /*
643 * clear dirty, set writeback and unlock the pages.
644 */
645 extent_clear_unlock_delalloc(inode,
646 &BTRFS_I(inode)->io_tree,
647 async_extent->start,
648 async_extent->start +
649 async_extent->ram_size - 1,
650 NULL, 1, 1, 0, 1, 1, 0);
651
652 ret = btrfs_submit_compressed_write(inode,
653 async_extent->start,
654 async_extent->ram_size,
655 ins.objectid,
656 ins.offset, async_extent->pages,
657 async_extent->nr_pages);
658
659 BUG_ON(ret);
660 trans = btrfs_join_transaction(root, 1);
661 alloc_hint = ins.objectid + ins.offset;
662 kfree(async_extent);
663 cond_resched();
664 }
665
666 btrfs_end_transaction(trans, root);
667 return 0;
668}
669
670/*
671 * when extent_io.c finds a delayed allocation range in the file,
672 * the call backs end up in this code. The basic idea is to
673 * allocate extents on disk for the range, and create ordered data structs
674 * in ram to track those extents.
675 *
676 * locked_page is the page that writepage had locked already. We use
677 * it to make sure we don't do extra locks or unlocks.
678 *
679 * *page_started is set to one if we unlock locked_page and do everything
680 * required to start IO on it. It may be clean and already done with
681 * IO when we return.
 *
 * [start, end] is the inclusive byte range to allocate. In this function
 * *page_started and *nr_written are only touched when the whole range is
 * stored as an inline extent. unlock is handed to
 * extent_clear_unlock_delalloc() for every chunk that gets allocated.
 *
 * Always returns 0: failures from the allocator and from the ordered
 * extent code trigger BUG_ON() instead of being returned.
682 */
683static noinline int cow_file_range(struct inode *inode,
684 struct page *locked_page,
685 u64 start, u64 end, int *page_started,
686 unsigned long *nr_written,
687 int unlock)
688{
689 struct btrfs_root *root = BTRFS_I(inode)->root;
690 struct btrfs_trans_handle *trans;
691 u64 alloc_hint = 0;
692 u64 num_bytes;
693 unsigned long ram_size;
694 u64 disk_num_bytes;
695 u64 cur_alloc_size;
696 u64 blocksize = root->sectorsize;
697 u64 actual_end;
698 u64 isize = i_size_read(inode);
699 struct btrfs_key ins;
700 struct extent_map *em;
701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
702 int ret = 0;
703
704 trans = btrfs_join_transaction(root, 1);
705 BUG_ON(!trans);
706 btrfs_set_trans_block_group(trans, inode);
707
 /* NOTE(review): actual_end is computed here but never read again below */
708 actual_end = min_t(u64, isize, end + 1);
709
 /*
  * length of [start, end] rounded up to a multiple of blocksize
  * (the mask assumes blocksize is a power of two), never less than
  * one block
  */
710 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
711 num_bytes = max(blocksize, num_bytes);
712 disk_num_bytes = num_bytes;
713 ret = 0;
714
715 if (start == 0) {
716 /* lets try to make an inline extent */
717 ret = cow_file_range_inline(trans, root, inode,
718 start, end, 0, NULL);
719 if (ret == 0) {
 /* inline extent created: release the whole range and report it done */
720 extent_clear_unlock_delalloc(inode,
721 &BTRFS_I(inode)->io_tree,
722 start, end, NULL, 1, 1,
723 1, 1, 1, 1);
724 *nr_written = *nr_written +
725 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
726 *page_started = 1;
727 ret = 0;
728 goto out;
729 }
730 }
731
732 BUG_ON(disk_num_bytes >
733 btrfs_super_total_bytes(&root->fs_info->super_copy));
734
735 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
736
 /* allocate the range in chunks of at most fs_info->max_extent bytes */
737 while (disk_num_bytes > 0) {
738 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
739 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
740 root->sectorsize, 0, alloc_hint,
741 (u64)-1, &ins, 1);
742 BUG_ON(ret);
743
 /* NOTE(review): the result of alloc_extent_map() is used without a NULL check */
744 em = alloc_extent_map(GFP_NOFS);
745 em->start = start;
746 em->orig_start = em->start;
747
 /* ins.objectid/ins.offset are the disk start and length that were reserved */
748 ram_size = ins.offset;
749 em->len = ins.offset;
750
751 em->block_start = ins.objectid;
752 em->block_len = ins.offset;
753 em->bdev = root->fs_info->fs_devices->latest_bdev;
754 set_bit(EXTENT_FLAG_PINNED, &em->flags);
755
 /*
  * insert the new mapping; on -EEXIST drop the cached mappings
  * for this range and try again
  */
756 while (1) {
757 spin_lock(&em_tree->lock);
758 ret = add_extent_mapping(em_tree, em);
759 spin_unlock(&em_tree->lock);
760 if (ret != -EEXIST) {
761 free_extent_map(em);
762 break;
763 }
764 btrfs_drop_extent_cache(inode, start,
765 start + ram_size - 1, 0);
766 }
767
768 cur_alloc_size = ins.offset;
769 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
770 ram_size, cur_alloc_size, 0);
771 BUG_ON(ret);
772
 /* in the data relocation tree the csums are cloned right away */
773 if (root->root_key.objectid ==
774 BTRFS_DATA_RELOC_TREE_OBJECTID) {
775 ret = btrfs_reloc_clone_csums(inode, start,
776 cur_alloc_size);
777 BUG_ON(ret);
778 }
779
 /* stop rather than let the unsigned subtraction below wrap */
780 if (disk_num_bytes < cur_alloc_size)
781 break;
782
783 /* we're not doing compressed IO, don't unlock the first
784 * page (which the caller expects to stay locked), don't
785 * clear any dirty bits and don't set any writeback bits
786 */
787 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
788 start, start + ram_size - 1,
789 locked_page, unlock, 1,
790 1, 0, 0, 0);
791 disk_num_bytes -= cur_alloc_size;
792 num_bytes -= cur_alloc_size;
 /* ask for the next chunk right behind the one just allocated */
793 alloc_hint = ins.objectid + ins.offset;
794 start += cur_alloc_size;
795 }
796out:
 /* the return value is forced to 0 on every path */
797 ret = 0;
798 btrfs_end_transaction(trans, root);
799
800 return ret;
801}
802
803/*
804 * work queue call back to started compression on a file and pages
805 */
806static noinline void async_cow_start(struct btrfs_work *work)
807{
808 struct async_cow *async_cow;
809 int num_added = 0;
810 async_cow = container_of(work, struct async_cow, work);
811
812 compress_file_range(async_cow->inode, async_cow->locked_page,
813 async_cow->start, async_cow->end, async_cow,
814 &num_added);
815 if (num_added == 0)
816 async_cow->inode = NULL;
817}
818
819/*
820 * work queue call back to submit previously compressed pages
821 */
822static noinline void async_cow_submit(struct btrfs_work *work)
823{
824 struct async_cow *async_cow;
825 struct btrfs_root *root;
826 unsigned long nr_pages;
827
828 async_cow = container_of(work, struct async_cow, work);
829
830 root = async_cow->root;
831 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
832 PAGE_CACHE_SHIFT;
833
834 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
835
836 if (atomic_read(&root->fs_info->async_delalloc_pages) <
837 5 * 1042 * 1024 &&
838 waitqueue_active(&root->fs_info->async_submit_wait))
839 wake_up(&root->fs_info->async_submit_wait);
840
841 if (async_cow->inode)
842 submit_compressed_extents(async_cow->inode, async_cow);
843}
844
845static noinline void async_cow_free(struct btrfs_work *work)
846{
847 struct async_cow *async_cow;
848 async_cow = container_of(work, struct async_cow, work);
849 kfree(async_cow);
850}
851
852static int cow_file_range_async(struct inode *inode, struct page *locked_page,
853 u64 start, u64 end, int *page_started,
854 unsigned long *nr_written)
855{
856 struct async_cow *async_cow;
857 struct btrfs_root *root = BTRFS_I(inode)->root;
858 unsigned long nr_pages;
859 u64 cur_end;
860 int limit = 10 * 1024 * 1042;
861
862 if (!btrfs_test_opt(root, COMPRESS)) {
863 return cow_file_range(inode, locked_page, start, end,
864 page_started, nr_written, 1);
865 }
866
867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
868 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
869 while (start < end) {
870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
871 async_cow->inode = inode;
872 async_cow->root = root;
873 async_cow->locked_page = locked_page;
874 async_cow->start = start;
875
876 if (btrfs_test_flag(inode, NOCOMPRESS))
877 cur_end = end;
878 else
879 cur_end = min(end, start + 512 * 1024 - 1);
880
881 async_cow->end = cur_end;
882 INIT_LIST_HEAD(&async_cow->extents);
883
884 async_cow->work.func = async_cow_start;
885 async_cow->work.ordered_func = async_cow_submit;
886 async_cow->work.ordered_free = async_cow_free;
887 async_cow->work.flags = 0;
888
889 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
890 PAGE_CACHE_SHIFT;
891 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
892
893 btrfs_queue_worker(&root->fs_info->delalloc_workers,
894 &async_cow->work);
895
896 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
897 wait_event(root->fs_info->async_submit_wait,
898 (atomic_read(&root->fs_info->async_delalloc_pages) <
899 limit));
900 }
901
902 while (atomic_read(&root->fs_info->async_submit_draining) &&
903 atomic_read(&root->fs_info->async_delalloc_pages)) {
904 wait_event(root->fs_info->async_submit_wait,
905 (atomic_read(&root->fs_info->async_delalloc_pages) ==
906 0));
907 }
908
909 *nr_written += nr_pages;
910 start = cur_end + 1;
911 }
912 *page_started = 1;
913 return 0;
914}
915
916static noinline int csum_exist_in_range(struct btrfs_root *root,
917 u64 bytenr, u64 num_bytes)
918{
919 int ret;
920 struct btrfs_ordered_sum *sums;
921 LIST_HEAD(list);
922
923 ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1,
924 &list);
925 if (ret == 0 && list_empty(&list))
926 return 0;
927
928 while (!list_empty(&list)) {
929 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
930 list_del(&sums->list);
931 kfree(sums);
932 }
933 return 1;
934}
935
936/*
937 * nocow writeback call back. This checks for snapshots or COW copies
938 * of the extents that exist in the file, and COWs the file as required.
939 *
940 * If no cow copies or snapshots exist, we write directly to the existing
941 * blocks on disk
 *
 * The file extent items covering the inclusive range [start, end] are
 * walked in order. A range that may be overwritten in place gets an
 * ordered extent of type NOCOW or PREALLOC; every other range is handed
 * to cow_file_range(), with neighbouring COW ranges merged into one call.
 *
 * force: when zero, regular (REG) extents are always COWed and only
 * PREALLOC extents can be written in place.
 *
 * Always returns 0; every failure in here is a BUG_ON().
942 */
943static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
944 u64 start, u64 end, int *page_started, int force,
945 unsigned long *nr_written)
946{
947 struct btrfs_root *root = BTRFS_I(inode)->root;
948 struct btrfs_trans_handle *trans;
949 struct extent_buffer *leaf;
950 struct btrfs_path *path;
951 struct btrfs_file_extent_item *fi;
952 struct btrfs_key found_key;
953 u64 cow_start;
954 u64 cur_offset;
955 u64 extent_end;
956 u64 disk_bytenr;
957 u64 num_bytes;
958 int extent_type;
959 int ret;
960 int type;
961 int nocow;
962 int check_prev = 1;
963
964 path = btrfs_alloc_path();
965 BUG_ON(!path);
966 trans = btrfs_join_transaction(root, 1);
967 BUG_ON(!trans);
968
 /* cow_start == (u64)-1 means "no COW range is pending" */
969 cow_start = (u64)-1;
970 cur_offset = start;
971 while (1) {
972 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
973 cur_offset, 0);
974 BUG_ON(ret < 0);
 /*
  * first pass only: if there was no exact match, step back one
  * slot when the previous item is an extent item of this inode
  */
975 if (ret > 0 && path->slots[0] > 0 && check_prev) {
976 leaf = path->nodes[0];
977 btrfs_item_key_to_cpu(leaf, &found_key,
978 path->slots[0] - 1);
979 if (found_key.objectid == inode->i_ino &&
980 found_key.type == BTRFS_EXTENT_DATA_KEY)
981 path->slots[0]--;
982 }
983 check_prev = 0;
984next_slot:
985 leaf = path->nodes[0];
986 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
987 ret = btrfs_next_leaf(root, path);
988 if (ret < 0)
989 BUG_ON(1);
 /* no more leaves */
990 if (ret > 0)
991 break;
992 leaf = path->nodes[0];
993 }
994
995 nocow = 0;
996 disk_bytenr = 0;
997 num_bytes = 0;
998 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
999
 /* walked past this inode's extent items, or past the end of the range */
1000 if (found_key.objectid > inode->i_ino ||
1001 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1002 found_key.offset > end)
1003 break;
1004
 /* nothing maps cur_offset: treat the gap up to this item as COW */
1005 if (found_key.offset > cur_offset) {
1006 extent_end = found_key.offset;
1007 goto out_check;
1008 }
1009
1010 fi = btrfs_item_ptr(leaf, path->slots[0],
1011 struct btrfs_file_extent_item);
1012 extent_type = btrfs_file_extent_type(leaf, fi);
1013
1014 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1015 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1016 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1017 extent_end = found_key.offset +
1018 btrfs_file_extent_num_bytes(leaf, fi);
 /* item ends before the range we were asked about: skip it */
1019 if (extent_end <= start) {
1020 path->slots[0]++;
1021 goto next_slot;
1022 }
 /* every "goto out_check" below leaves nocow == 0, i.e. forces COW */
 /* disk_bytenr == 0: presumably a hole, nothing on disk to overwrite */
1023 if (disk_bytenr == 0)
1024 goto out_check;
1025 if (btrfs_file_extent_compression(leaf, fi) ||
1026 btrfs_file_extent_encryption(leaf, fi) ||
1027 btrfs_file_extent_other_encoding(leaf, fi))
1028 goto out_check;
1029 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1030 goto out_check;
1031 if (btrfs_extent_readonly(root, disk_bytenr))
1032 goto out_check;
1033 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1034 disk_bytenr))
1035 goto out_check;
 /* translate cur_offset into a disk byte inside the extent */
1036 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1037 disk_bytenr += cur_offset - found_key.offset;
1038 num_bytes = min(end + 1, extent_end) - cur_offset;
1039 /*
1040 * force cow if csum exists in the range.
1041 * this ensure that csum for a given extent are
1042 * either valid or do not exist.
1043 */
1044 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1045 goto out_check;
1046 nocow = 1;
1047 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 /* inline extents are never written in place (nocow stays 0) */
1048 extent_end = found_key.offset +
1049 btrfs_file_extent_inline_len(leaf, fi);
1050 extent_end = ALIGN(extent_end, root->sectorsize);
1051 } else {
1052 BUG_ON(1);
1053 }
1054out_check:
1055 if (extent_end <= start) {
1056 path->slots[0]++;
1057 goto next_slot;
1058 }
 /*
  * COW needed: remember where the COW range began and keep
  * scanning so adjacent COW ranges are submitted together
  */
1059 if (!nocow) {
1060 if (cow_start == (u64)-1)
1061 cow_start = cur_offset;
1062 cur_offset = extent_end;
1063 if (cur_offset > end)
1064 break;
1065 path->slots[0]++;
1066 goto next_slot;
1067 }
1068
1069 btrfs_release_path(root, path);
 /* flush the pending COW range that ends right before this extent */
1070 if (cow_start != (u64)-1) {
1071 ret = cow_file_range(inode, locked_page, cow_start,
1072 found_key.offset - 1, page_started,
1073 nr_written, 1);
1074 BUG_ON(ret);
1075 cow_start = (u64)-1;
1076 }
1077
 /*
  * for a preallocated extent, install a pinned extent mapping
  * for the bytes about to be written
  * NOTE(review): alloc_extent_map() result is used without a NULL check
  */
1078 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1079 struct extent_map *em;
1080 struct extent_map_tree *em_tree;
1081 em_tree = &BTRFS_I(inode)->extent_tree;
1082 em = alloc_extent_map(GFP_NOFS);
1083 em->start = cur_offset;
1084 em->orig_start = em->start;
1085 em->len = num_bytes;
1086 em->block_len = num_bytes;
1087 em->block_start = disk_bytenr;
1088 em->bdev = root->fs_info->fs_devices->latest_bdev;
1089 set_bit(EXTENT_FLAG_PINNED, &em->flags);
 /* on -EEXIST drop the cached mappings and retry the insert */
1090 while (1) {
1091 spin_lock(&em_tree->lock);
1092 ret = add_extent_mapping(em_tree, em);
1093 spin_unlock(&em_tree->lock);
1094 if (ret != -EEXIST) {
1095 free_extent_map(em);
1096 break;
1097 }
1098 btrfs_drop_extent_cache(inode, em->start,
1099 em->start + em->len - 1, 0);
1100 }
1101 type = BTRFS_ORDERED_PREALLOC;
1102 } else {
1103 type = BTRFS_ORDERED_NOCOW;
1104 }
1105
1106 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1107 num_bytes, num_bytes, type);
1108 BUG_ON(ret);
1109
1110 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1111 cur_offset, cur_offset + num_bytes - 1,
1112 locked_page, 1, 1, 1, 0, 0, 0);
1113 cur_offset = extent_end;
1114 if (cur_offset > end)
1115 break;
1116 }
1117 btrfs_release_path(root, path);
1118
 /* whatever is left of the range, pending or unvisited, is COWed */
1119 if (cur_offset <= end && cow_start == (u64)-1)
1120 cow_start = cur_offset;
1121 if (cow_start != (u64)-1) {
1122 ret = cow_file_range(inode, locked_page, cow_start, end,
1123 page_started, nr_written, 1);
1124 BUG_ON(ret);
1125 }
1126
1127 ret = btrfs_end_transaction(trans, root);
1128 BUG_ON(ret);
1129 btrfs_free_path(path);
1130 return 0;
1131}
1132
1133/*
1134 * extent_io.c call back to do delayed allocation processing
1135 */
1136static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1137 u64 start, u64 end, int *page_started,
1138 unsigned long *nr_written)
1139{
1140 int ret;
1141
1142 if (btrfs_test_flag(inode, NODATACOW))
1143 ret = run_delalloc_nocow(inode, locked_page, start, end,
1144 page_started, 1, nr_written);
1145 else if (btrfs_test_flag(inode, PREALLOC))
1146 ret = run_delalloc_nocow(inode, locked_page, start, end,
1147 page_started, 0, nr_written);
1148 else
1149 ret = cow_file_range_async(inode, locked_page, start, end,
1150 page_started, nr_written);
1151
1152 return ret;
1153}
1154
1155/*
1156 * extent_io.c set_bit_hook, used to track delayed allocation
1157 * bytes in this file, and to maintain the list of inodes that
1158 * have pending delalloc work to be done.
1159 */
1160static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1161 unsigned long old, unsigned long bits)
1162{
1163 /*
1164 * set_bit and clear bit hooks normally require _irqsave/restore
1165 * but in this case, we are only testeing for the DELALLOC
1166 * bit, which is only set or cleared with irqs on
1167 */
1168 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1169 struct btrfs_root *root = BTRFS_I(inode)->root;
1170 spin_lock(&root->fs_info->delalloc_lock);
1171 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1172 root->fs_info->delalloc_bytes += end - start + 1;
1173 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1174 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1175 &root->fs_info->delalloc_inodes);
1176 }
1177 spin_unlock(&root->fs_info->delalloc_lock);
1178 }
1179 return 0;
1180}
1181
1182/*
1183 * extent_io.c clear_bit_hook, see set_bit_hook for why
1184 */
1185static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1186 unsigned long old, unsigned long bits)
1187{
1188 /*
1189 * set_bit and clear bit hooks normally require _irqsave/restore
1190 * but in this case, we are only testeing for the DELALLOC
1191 * bit, which is only set or cleared with irqs on
1192 */
1193 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1194 struct btrfs_root *root = BTRFS_I(inode)->root;
1195
1196 spin_lock(&root->fs_info->delalloc_lock);
1197 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1198 printk(KERN_INFO "btrfs warning: delalloc account "
1199 "%llu %llu\n",
1200 (unsigned long long)end - start + 1,
1201 (unsigned long long)
1202 root->fs_info->delalloc_bytes);
1203 root->fs_info->delalloc_bytes = 0;
1204 BTRFS_I(inode)->delalloc_bytes = 0;
1205 } else {
1206 root->fs_info->delalloc_bytes -= end - start + 1;
1207 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1208 }
1209 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1210 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1211 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1212 }
1213 spin_unlock(&root->fs_info->delalloc_lock);
1214 }
1215 return 0;
1216}
1217
1218/*
1219 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1220 * we don't create bios that span stripes or chunks
1221 */
1222int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1223 size_t size, struct bio *bio,
1224 unsigned long bio_flags)
1225{
1226 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1227 struct btrfs_mapping_tree *map_tree;
1228 u64 logical = (u64)bio->bi_sector << 9;
1229 u64 length = 0;
1230 u64 map_length;
1231 int ret;
1232
1233 if (bio_flags & EXTENT_BIO_COMPRESSED)
1234 return 0;
1235
1236 length = bio->bi_size;
1237 map_tree = &root->fs_info->mapping_tree;
1238 map_length = length;
1239 ret = btrfs_map_block(map_tree, READ, logical,
1240 &map_length, NULL, 0);
1241
1242 if (map_length < length + size)
1243 return 1;
1244 return 0;
1245}
1246
1247/*
1248 * in order to insert checksums into the metadata in large chunks,
1249 * we wait until bio submission time. All the pages in the bio are
1250 * checksummed and sums are attached onto the ordered extent record.
1251 *
1252 * At IO completion time the cums attached on the ordered extent record
1253 * are inserted into the btree
1254 */
1255static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1256 struct bio *bio, int mirror_num,
1257 unsigned long bio_flags)
1258{
1259 struct btrfs_root *root = BTRFS_I(inode)->root;
1260 int ret = 0;
1261
1262 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1263 BUG_ON(ret);
1264 return 0;
1265}
1266
1267/*
1268 * in order to insert checksums into the metadata in large chunks,
1269 * we wait until bio submission time. All the pages in the bio are
1270 * checksummed and sums are attached onto the ordered extent record.
1271 *
1272 * At IO completion time the cums attached on the ordered extent record
1273 * are inserted into the btree
1274 */
1275static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1276 int mirror_num, unsigned long bio_flags)
1277{
1278 struct btrfs_root *root = BTRFS_I(inode)->root;
1279 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1280}
1281
1282/*
1283 * extent_io.c submission hook. This does the right thing for csum calculation
1284 * on write, or reading the csums from the tree before a read
1285 */
1286static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1287 int mirror_num, unsigned long bio_flags)
1288{
1289 struct btrfs_root *root = BTRFS_I(inode)->root;
1290 int ret = 0;
1291 int skip_sum;
1292
1293 skip_sum = btrfs_test_flag(inode, NODATASUM);
1294
1295 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1296 BUG_ON(ret);
1297
1298 if (!(rw & (1 << BIO_RW))) {
1299 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1300 return btrfs_submit_compressed_read(inode, bio,
1301 mirror_num, bio_flags);
1302 } else if (!skip_sum)
1303 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1304 goto mapit;
1305 } else if (!skip_sum) {
1306 /* csum items have already been cloned */
1307 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1308 goto mapit;
1309 /* we're doing a write, do the async checksumming */
1310 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1311 inode, rw, bio, mirror_num,
1312 bio_flags, __btrfs_submit_bio_start,
1313 __btrfs_submit_bio_done);
1314 }
1315
1316mapit:
1317 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1318}
1319
1320/*
1321 * given a list of ordered sums record them in the inode. This happens
1322 * at IO completion time based on sums calculated at bio submission time.
1323 */
1324static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1325 struct inode *inode, u64 file_offset,
1326 struct list_head *list)
1327{
1328 struct list_head *cur;
1329 struct btrfs_ordered_sum *sum;
1330
1331 btrfs_set_trans_block_group(trans, inode);
1332 list_for_each(cur, list) {
1333 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1334 btrfs_csum_file_blocks(trans,
1335 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1336 }
1337 return 0;
1338}
1339
1340int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1341{
1342 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1343 WARN_ON(1);
1344 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1345 GFP_NOFS);
1346}
1347
1348/* see btrfs_writepage_start_hook for details on why this is required */
/*
 * One queued fixup request: btrfs_writepage_start_hook() allocates it,
 * takes a reference on the page and queues the work item on the fixup
 * workers; btrfs_writepage_fixup_worker() gets back to it via container_of().
 */
1349struct btrfs_writepage_fixup {
 /* page that needs fixing; a reference is held while the work is queued */
1350 struct page *page;
 /* work item handed to btrfs_queue_worker() */
1351 struct btrfs_work work;
1352};
1353
1354static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1355{
1356 struct btrfs_writepage_fixup *fixup;
1357 struct btrfs_ordered_extent *ordered;
1358 struct page *page;
1359 struct inode *inode;
1360 u64 page_start;
1361 u64 page_end;
1362
1363 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1364 page = fixup->page;
1365again:
1366 lock_page(page);
1367 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1368 ClearPageChecked(page);
1369 goto out_page;
1370 }
1371
1372 inode = page->mapping->host;
1373 page_start = page_offset(page);
1374 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1375
1376 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1377
1378 /* already ordered? We're done */
1379 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1380 EXTENT_ORDERED, 0)) {
1381 goto out;
1382 }
1383
1384 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1385 if (ordered) {
1386 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1387 page_end, GFP_NOFS);
1388 unlock_page(page);
1389 btrfs_start_ordered_extent(inode, ordered, 1);
1390 goto again;
1391 }
1392
1393 btrfs_set_extent_delalloc(inode, page_start, page_end);
1394 ClearPageChecked(page);
1395out:
1396 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1397out_page:
1398 unlock_page(page);
1399 page_cache_release(page);
1400}
1401
1402/*
1403 * There are a few paths in the higher layers of the kernel that directly
1404 * set the page dirty bit without asking the filesystem if it is a
1405 * good idea. This causes problems because we want to make sure COW
1406 * properly happens and the data=ordered rules are followed.
1407 *
1408 * In our case any range that doesn't have the ORDERED bit set
1409 * hasn't been properly setup for IO. We kick off an async process
1410 * to fix it up. The async helper will wait for ordered extents, set
1411 * the delalloc bit and make it safe to write the page.
1412 */
1413static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1414{
1415 struct inode *inode = page->mapping->host;
1416 struct btrfs_writepage_fixup *fixup;
1417 struct btrfs_root *root = BTRFS_I(inode)->root;
1418 int ret;
1419
1420 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1421 EXTENT_ORDERED, 0);
1422 if (ret)
1423 return 0;
1424
1425 if (PageChecked(page))
1426 return -EAGAIN;
1427
1428 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1429 if (!fixup)
1430 return -EAGAIN;
1431
1432 SetPageChecked(page);
1433 page_cache_get(page);
1434 fixup->work.func = btrfs_writepage_fixup_worker;
1435 fixup->page = page;
1436 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1437 return -EAGAIN;
1438}
1439
1440static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1441 struct inode *inode, u64 file_pos,
1442 u64 disk_bytenr, u64 disk_num_bytes,
1443 u64 num_bytes, u64 ram_bytes,
1444 u8 compression, u8 encryption,
1445 u16 other_encoding, int extent_type)
1446{
1447 struct btrfs_root *root = BTRFS_I(inode)->root;
1448 struct btrfs_file_extent_item *fi;
1449 struct btrfs_path *path;
1450 struct extent_buffer *leaf;
1451 struct btrfs_key ins;
1452 u64 hint;
1453 int ret;
1454
1455 path = btrfs_alloc_path();
1456 BUG_ON(!path);
1457
1458 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1459 file_pos + num_bytes, file_pos, &hint);
1460 BUG_ON(ret);
1461
1462 ins.objectid = inode->i_ino;
1463 ins.offset = file_pos;
1464 ins.type = BTRFS_EXTENT_DATA_KEY;
1465 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1466 BUG_ON(ret);
1467 leaf = path->nodes[0];
1468 fi = btrfs_item_ptr(leaf, path->slots[0],
1469 struct btrfs_file_extent_item);
1470 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1471 btrfs_set_file_extent_type(leaf, fi, extent_type);
1472 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1473 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1474 btrfs_set_file_extent_offset(leaf, fi, 0);
1475 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1476 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1477 btrfs_set_file_extent_compression(leaf, fi, compression);
1478 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1479 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1480 btrfs_mark_buffer_dirty(leaf);
1481
1482 inode_add_bytes(inode, num_bytes);
1483 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1484
1485 ins.objectid = disk_bytenr;
1486 ins.offset = disk_num_bytes;
1487 ins.type = BTRFS_EXTENT_ITEM_KEY;
1488 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1489 root->root_key.objectid,
1490 trans->transid, inode->i_ino, &ins);
1491 BUG_ON(ret);
1492
1493 btrfs_free_path(path);
1494 return 0;
1495}
1496
1497/* as ordered data IO finishes, this gets called so we can finish
1498 * an ordered extent if the range of bytes in the file it covers are
1499 * fully written.
 *
 * [start, end] is the inclusive byte range whose IO just completed.
 * Returns 0 in all cases; failures in the metadata updates are BUG_ON()s.
1500 */
1501static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1502{
1503 struct btrfs_root *root = BTRFS_I(inode)->root;
1504 struct btrfs_trans_handle *trans;
1505 struct btrfs_ordered_extent *ordered_extent;
1506 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1507 int compressed = 0;
1508 int ret;
1509
 /* a zero result means there is nothing to finish yet (presumably IO still pending) */
1510 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1511 if (!ret)
1512 return 0;
1513
 /* NOTE(review): unlike other callers in this file, trans is not checked here */
1514 trans = btrfs_join_transaction(root, 1);
1515
1516 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1517 BUG_ON(!ordered_extent);
 /* NOCOW writes skip the file extent update and go straight to the csums */
1518 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1519 goto nocow;
1520
1521 lock_extent(io_tree, ordered_extent->file_offset,
1522 ordered_extent->file_offset + ordered_extent->len - 1,
1523 GFP_NOFS);
1524
1525 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1526 compressed = 1;
1527 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 /* preallocated extent: the item already exists, just mark it written */
1528 BUG_ON(compressed);
1529 ret = btrfs_mark_extent_written(trans, root, inode,
1530 ordered_extent->file_offset,
1531 ordered_extent->file_offset +
1532 ordered_extent->len);
1533 BUG_ON(ret);
1534 } else {
 /* ordered_extent->len is passed as both num_bytes and ram_bytes */
1535 ret = insert_reserved_file_extent(trans, inode,
1536 ordered_extent->file_offset,
1537 ordered_extent->start,
1538 ordered_extent->disk_len,
1539 ordered_extent->len,
1540 ordered_extent->len,
1541 compressed, 0, 0,
1542 BTRFS_FILE_EXTENT_REG);
1543 BUG_ON(ret);
1544 }
1545 unlock_extent(io_tree, ordered_extent->file_offset,
1546 ordered_extent->file_offset + ordered_extent->len - 1,
1547 GFP_NOFS);
1548nocow:
1549 add_pending_csums(trans, inode, ordered_extent->file_offset,
1550 &ordered_extent->list);
1551
 /* NOTE(review): the result of btrfs_update_inode() is not checked */
1552 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1553 btrfs_ordered_update_i_size(inode, ordered_extent);
1554 btrfs_update_inode(trans, root, inode);
1555 btrfs_remove_ordered_extent(inode, ordered_extent);
1556 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1557
1558 /* once for us */
1559 btrfs_put_ordered_extent(ordered_extent);
1560 /* once for the tree */
1561 btrfs_put_ordered_extent(ordered_extent);
1562
1563 btrfs_end_transaction(trans, root);
1564 return 0;
1565}
1566
1567static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1568 struct extent_state *state, int uptodate)
1569{
1570 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1571}
1572
1573/*
1574 * When IO fails, either with EIO or csum verification fails, we
1575 * try other mirrors that might have a good copy of the data. This
1576 * io_failure_record is used to record state as we go through all the
1577 * mirrors. If another mirror has good data, the page is set up to date
1578 * and things continue. If a good mirror can't be found, the original
1579 * bio end_io callback is called to indicate things have failed.
 *
 * A pointer to the record is stored as the private value of the failed
 * range in the inode's io_failure_tree.
1580 */
1581struct io_failure_record {
 /* NOTE(review): not assigned by btrfs_io_failed_hook() in this file */
1582 struct page *page;
 /* file offset and byte length of the failed range */
1583 u64 start;
1584 u64 len;
 /* logical disk address the range maps to (extent start if compressed) */
1585 u64 logical;
 /* 0, or EXTENT_BIO_COMPRESSED when the extent is compressed */
1586 unsigned long bio_flags;
 /* incremented for every retry and passed as the mirror number */
1587 int last_mirror;
1588};
1589
1590static int btrfs_io_failed_hook(struct bio *failed_bio,
1591 struct page *page, u64 start, u64 end,
1592 struct extent_state *state)
1593{
1594 struct io_failure_record *failrec = NULL;
1595 u64 private;
1596 struct extent_map *em;
1597 struct inode *inode = page->mapping->host;
1598 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1599 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1600 struct bio *bio;
1601 int num_copies;
1602 int ret;
1603 int rw;
1604 u64 logical;
1605
1606 ret = get_state_private(failure_tree, start, &private);
1607 if (ret) {
1608 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1609 if (!failrec)
1610 return -ENOMEM;
1611 failrec->start = start;
1612 failrec->len = end - start + 1;
1613 failrec->last_mirror = 0;
1614 failrec->bio_flags = 0;
1615
1616 spin_lock(&em_tree->lock);
1617 em = lookup_extent_mapping(em_tree, start, failrec->len);
1618 if (em->start > start || em->start + em->len < start) {
1619 free_extent_map(em);
1620 em = NULL;
1621 }
1622 spin_unlock(&em_tree->lock);
1623
1624 if (!em || IS_ERR(em)) {
1625 kfree(failrec);
1626 return -EIO;
1627 }
1628 logical = start - em->start;
1629 logical = em->block_start + logical;
1630 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1631 logical = em->block_start;
1632 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1633 }
1634 failrec->logical = logical;
1635 free_extent_map(em);
1636 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1637 EXTENT_DIRTY, GFP_NOFS);
1638 set_state_private(failure_tree, start,
1639 (u64)(unsigned long)failrec);
1640 } else {
1641 failrec = (struct io_failure_record *)(unsigned long)private;
1642 }
1643 num_copies = btrfs_num_copies(
1644 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1645 failrec->logical, failrec->len);
1646 failrec->last_mirror++;
1647 if (!state) {
1648 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1649 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1650 failrec->start,
1651 EXTENT_LOCKED);
1652 if (state && state->start != failrec->start)
1653 state = NULL;
1654 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1655 }
1656 if (!state || failrec->last_mirror > num_copies) {
1657 set_state_private(failure_tree, failrec->start, 0);
1658 clear_extent_bits(failure_tree, failrec->start,
1659 failrec->start + failrec->len - 1,
1660 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1661 kfree(failrec);
1662 return -EIO;
1663 }
1664 bio = bio_alloc(GFP_NOFS, 1);
1665 bio->bi_private = state;
1666 bio->bi_end_io = failed_bio->bi_end_io;
1667 bio->bi_sector = failrec->logical >> 9;
1668 bio->bi_bdev = failed_bio->bi_bdev;
1669 bio->bi_size = 0;
1670
1671 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1672 if (failed_bio->bi_rw & (1 << BIO_RW))
1673 rw = WRITE;
1674 else
1675 rw = READ;
1676
1677 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1678 failrec->last_mirror,
1679 failrec->bio_flags);
1680 return 0;
1681}
1682
1683/*
1684 * each time an IO finishes, we do a fast check in the IO failure tree
1685 * to see if we need to process or clean up an io_failure_record
1686 */
1687static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1688{
1689 u64 private;
1690 u64 private_failure;
1691 struct io_failure_record *failure;
1692 int ret;
1693
1694 private = 0;
1695 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1696 (u64)-1, 1, EXTENT_DIRTY)) {
1697 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1698 start, &private_failure);
1699 if (ret == 0) {
1700 failure = (struct io_failure_record *)(unsigned long)
1701 private_failure;
1702 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1703 failure->start, 0);
1704 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1705 failure->start,
1706 failure->start + failure->len - 1,
1707 EXTENT_DIRTY | EXTENT_LOCKED,
1708 GFP_NOFS);
1709 kfree(failure);
1710 }
1711 }
1712 return 0;
1713}
1714
1715/*
1716 * when reads are done, we need to check csums to verify the data is correct
1717 * if there's a match, we allow the bio to finish. If not, we go through
1718 * the io_failure_record routines to find good copies
 *
 * [start, end] is the inclusive byte range of the page that was read.
 * Returns 0 when the data is accepted (csum matches, verification is
 * skipped, or the stored csum value is 0) and -EIO on a mismatch. On a
 * mismatch or a missing csum the range in the page is overwritten.
1719 */
1720static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1721 struct extent_state *state)
1722{
1723 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1724 struct inode *inode = page->mapping->host;
1725 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1726 char *kaddr;
1727 u64 private = ~(u32)0;
1728 int ret;
1729 struct btrfs_root *root = BTRFS_I(inode)->root;
1730 u32 csum = ~(u32)0;
1731 unsigned long flags;
1732
 /* a Checked page skips verification; the flag is consumed here */
1733 if (PageChecked(page)) {
1734 ClearPageChecked(page);
1735 goto good;
1736 }
1737 if (btrfs_test_flag(inode, NODATASUM))
1738 return 0;
1739
 /* data relocation tree: a range flagged NODATASUM is accepted once, unverified */
1740 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1741 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1742 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1743 GFP_NOFS);
1744 return 0;
1745 }
1746
 /* the expected csum is the private value of the extent state at start */
1747 if (state && state->start == start) {
1748 private = state->private;
1749 ret = 0;
1750 } else {
1751 ret = get_state_private(io_tree, start, &private);
1752 }
1753 local_irq_save(flags);
1754 kaddr = kmap_atomic(page, KM_IRQ0);
 /* no stored csum could be found for this range */
1755 if (ret)
1756 goto zeroit;
1757
1758 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1759 btrfs_csum_final(csum, (char *)&csum);
1760 if (csum != private)
1761 goto zeroit;
1762
1763 kunmap_atomic(kaddr, KM_IRQ0);
1764 local_irq_restore(flags);
1765good:
1766 /* if the io failure tree for this inode is non-empty,
1767 * check to see if we've recovered from a failed IO
1768 */
1769 btrfs_clean_io_failures(inode, start);
1770 return 0;
1771
1772zeroit:
1773 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1774 "private %llu\n", page->mapping->host->i_ino,
1775 (unsigned long long)start, csum,
1776 (unsigned long long)private);
 /* despite the label, the range is filled with bytes of value 1, not 0 */
1777 memset(kaddr + offset, 1, end - start + 1);
1778 flush_dcache_page(page);
1779 kunmap_atomic(kaddr, KM_IRQ0);
1780 local_irq_restore(flags);
 /* a stored csum value of 0 is not reported as an error */
1781 if (private == 0)
1782 return 0;
1783 return -EIO;
1784}
1785
1786/*
1787 * This creates an orphan entry for the given inode in case something goes
1788 * wrong in the middle of an unlink/truncate.
1789 */
1790int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1791{
1792 struct btrfs_root *root = BTRFS_I(inode)->root;
1793 int ret = 0;
1794
1795 spin_lock(&root->list_lock);
1796
1797 /* already on the orphan list, we're good */
1798 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1799 spin_unlock(&root->list_lock);
1800 return 0;
1801 }
1802
1803 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1804
1805 spin_unlock(&root->list_lock);
1806
1807 /*
1808 * insert an orphan item to track this unlinked/truncated file
1809 */
1810 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1811
1812 return ret;
1813}
1814
1815/*
1816 * We have done the truncate/delete so we can go ahead and remove the orphan
1817 * item for this particular inode.
1818 */
1819int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1820{
1821 struct btrfs_root *root = BTRFS_I(inode)->root;
1822 int ret = 0;
1823
1824 spin_lock(&root->list_lock);
1825
1826 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1827 spin_unlock(&root->list_lock);
1828 return 0;
1829 }
1830
1831 list_del_init(&BTRFS_I(inode)->i_orphan);
1832 if (!trans) {
1833 spin_unlock(&root->list_lock);
1834 return 0;
1835 }
1836
1837 spin_unlock(&root->list_lock);
1838
1839 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1840
1841 return ret;
1842}
1843
1844/*
1845 * this cleans up any orphans that may be left on the list from the last use
1846 * of this root.
1847 */
1848void btrfs_orphan_cleanup(struct btrfs_root *root)
1849{
1850 struct btrfs_path *path;
1851 struct extent_buffer *leaf;
1852 struct btrfs_item *item;
1853 struct btrfs_key key, found_key;
1854 struct btrfs_trans_handle *trans;
1855 struct inode *inode;
1856 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1857
1858 path = btrfs_alloc_path();
1859 if (!path)
1860 return;
1861 path->reada = -1;
1862
1863 key.objectid = BTRFS_ORPHAN_OBJECTID;
1864 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1865 key.offset = (u64)-1;
1866
1867
1868 while (1) {
1869 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1870 if (ret < 0) {
1871 printk(KERN_ERR "Error searching slot for orphan: %d"
1872 "\n", ret);
1873 break;
1874 }
1875
1876 /*
1877 * if ret == 0 means we found what we were searching for, which
1878 * is weird, but possible, so only screw with path if we didnt
1879 * find the key and see if we have stuff that matches
1880 */
1881 if (ret > 0) {
1882 if (path->slots[0] == 0)
1883 break;
1884 path->slots[0]--;
1885 }
1886
1887 /* pull out the item */
1888 leaf = path->nodes[0];
1889 item = btrfs_item_nr(leaf, path->slots[0]);
1890 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1891
1892 /* make sure the item matches what we want */
1893 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1894 break;
1895 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1896 break;
1897
1898 /* release the path since we're done with it */
1899 btrfs_release_path(root, path);
1900
1901 /*
1902 * this is where we are basically btrfs_lookup, without the
1903 * crossing root thing. we store the inode number in the
1904 * offset of the orphan item.
1905 */
1906 inode = btrfs_iget_locked(root->fs_info->sb,
1907 found_key.offset, root);
1908 if (!inode)
1909 break;
1910
1911 if (inode->i_state & I_NEW) {
1912 BTRFS_I(inode)->root = root;
1913
1914 /* have to set the location manually */
1915 BTRFS_I(inode)->location.objectid = inode->i_ino;
1916 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1917 BTRFS_I(inode)->location.offset = 0;
1918
1919 btrfs_read_locked_inode(inode);
1920 unlock_new_inode(inode);
1921 }
1922
1923 /*
1924 * add this inode to the orphan list so btrfs_orphan_del does
1925 * the proper thing when we hit it
1926 */
1927 spin_lock(&root->list_lock);
1928 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1929 spin_unlock(&root->list_lock);
1930
1931 /*
1932 * if this is a bad inode, means we actually succeeded in
1933 * removing the inode, but not the orphan record, which means
1934 * we need to manually delete the orphan since iput will just
1935 * do a destroy_inode
1936 */
1937 if (is_bad_inode(inode)) {
1938 trans = btrfs_start_transaction(root, 1);
1939 btrfs_orphan_del(trans, inode);
1940 btrfs_end_transaction(trans, root);
1941 iput(inode);
1942 continue;
1943 }
1944
1945 /* if we have links, this was a truncate, lets do that */
1946 if (inode->i_nlink) {
1947 nr_truncate++;
1948 btrfs_truncate(inode);
1949 } else {
1950 nr_unlink++;
1951 }
1952
1953 /* this will do delete_inode and everything for us */
1954 iput(inode);
1955 }
1956
1957 if (nr_unlink)
1958 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1959 if (nr_truncate)
1960 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1961
1962 btrfs_free_path(path);
1963}
1964
1965/*
1966 * read an inode from the btree into the in-memory inode
1967 */
1968void btrfs_read_locked_inode(struct inode *inode)
1969{
1970 struct btrfs_path *path;
1971 struct extent_buffer *leaf;
1972 struct btrfs_inode_item *inode_item;
1973 struct btrfs_timespec *tspec;
1974 struct btrfs_root *root = BTRFS_I(inode)->root;
1975 struct btrfs_key location;
1976 u64 alloc_group_block;
1977 u32 rdev;
1978 int ret;
1979
1980 path = btrfs_alloc_path();
1981 BUG_ON(!path);
1982 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1983
1984 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1985 if (ret)
1986 goto make_bad;
1987
1988 leaf = path->nodes[0];
1989 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1990 struct btrfs_inode_item);
1991
1992 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1993 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1994 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1995 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1996 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1997
1998 tspec = btrfs_inode_atime(inode_item);
1999 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2000 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2001
2002 tspec = btrfs_inode_mtime(inode_item);
2003 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2004 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2005
2006 tspec = btrfs_inode_ctime(inode_item);
2007 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2008 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2009
2010 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2011 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2012 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2013 inode->i_generation = BTRFS_I(inode)->generation;
2014 inode->i_rdev = 0;
2015 rdev = btrfs_inode_rdev(leaf, inode_item);
2016
2017 BTRFS_I(inode)->index_cnt = (u64)-1;
2018 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2019
2020 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2021 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2022 alloc_group_block, 0);
2023 btrfs_free_path(path);
2024 inode_item = NULL;
2025
2026 switch (inode->i_mode & S_IFMT) {
2027 case S_IFREG:
2028 inode->i_mapping->a_ops = &btrfs_aops;
2029 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2030 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2031 inode->i_fop = &btrfs_file_operations;
2032 inode->i_op = &btrfs_file_inode_operations;
2033 break;
2034 case S_IFDIR:
2035 inode->i_fop = &btrfs_dir_file_operations;
2036 if (root == root->fs_info->tree_root)
2037 inode->i_op = &btrfs_dir_ro_inode_operations;
2038 else
2039 inode->i_op = &btrfs_dir_inode_operations;
2040 break;
2041 case S_IFLNK:
2042 inode->i_op = &btrfs_symlink_inode_operations;
2043 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2044 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2045 break;
2046 default:
2047 init_special_inode(inode, inode->i_mode, rdev);
2048 break;
2049 }
2050 return;
2051
2052make_bad:
2053 btrfs_free_path(path);
2054 make_bad_inode(inode);
2055}
2056
2057/*
2058 * given a leaf and an inode, copy the inode fields into the leaf
2059 */
2060static void fill_inode_item(struct btrfs_trans_handle *trans,
2061 struct extent_buffer *leaf,
2062 struct btrfs_inode_item *item,
2063 struct inode *inode)
2064{
2065 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2066 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2067 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2068 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2069 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2072 inode->i_atime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2074 inode->i_atime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2077 inode->i_mtime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2079 inode->i_mtime.tv_nsec);
2080
2081 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2082 inode->i_ctime.tv_sec);
2083 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2084 inode->i_ctime.tv_nsec);
2085
2086 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2087 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2088 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2089 btrfs_set_inode_transid(leaf, item, trans->transid);
2090 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2091 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2092 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2093}
2094
2095/*
2096 * copy everything in the in-memory inode into the btree.
2097 */
2098noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root, struct inode *inode)
2100{
2101 struct btrfs_inode_item *inode_item;
2102 struct btrfs_path *path;
2103 struct extent_buffer *leaf;
2104 int ret;
2105
2106 path = btrfs_alloc_path();
2107 BUG_ON(!path);
2108 ret = btrfs_lookup_inode(trans, root, path,
2109 &BTRFS_I(inode)->location, 1);
2110 if (ret) {
2111 if (ret > 0)
2112 ret = -ENOENT;
2113 goto failed;
2114 }
2115
2116 leaf = path->nodes[0];
2117 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2118 struct btrfs_inode_item);
2119
2120 fill_inode_item(trans, leaf, inode_item, inode);
2121 btrfs_mark_buffer_dirty(leaf);
2122 btrfs_set_inode_last_trans(trans, inode);
2123 ret = 0;
2124failed:
2125 btrfs_free_path(path);
2126 return ret;
2127}
2128
2129
2130/*
2131 * unlink helper that gets used here in inode.c and in the tree logging
2132 * recovery code. It remove a link in a directory with a given name, and
2133 * also drops the back refs in the inode to the directory
2134 */
2135int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2136 struct btrfs_root *root,
2137 struct inode *dir, struct inode *inode,
2138 const char *name, int name_len)
2139{
2140 struct btrfs_path *path;
2141 int ret = 0;
2142 struct extent_buffer *leaf;
2143 struct btrfs_dir_item *di;
2144 struct btrfs_key key;
2145 u64 index;
2146
2147 path = btrfs_alloc_path();
2148 if (!path) {
2149 ret = -ENOMEM;
2150 goto err;
2151 }
2152
2153 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2154 name, name_len, -1);
2155 if (IS_ERR(di)) {
2156 ret = PTR_ERR(di);
2157 goto err;
2158 }
2159 if (!di) {
2160 ret = -ENOENT;
2161 goto err;
2162 }
2163 leaf = path->nodes[0];
2164 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2165 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2166 if (ret)
2167 goto err;
2168 btrfs_release_path(root, path);
2169
2170 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2171 inode->i_ino,
2172 dir->i_ino, &index);
2173 if (ret) {
2174 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2175 "inode %lu parent %lu\n", name_len, name,
2176 inode->i_ino, dir->i_ino);
2177 goto err;
2178 }
2179
2180 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2181 index, name, name_len, -1);
2182 if (IS_ERR(di)) {
2183 ret = PTR_ERR(di);
2184 goto err;
2185 }
2186 if (!di) {
2187 ret = -ENOENT;
2188 goto err;
2189 }
2190 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2191 btrfs_release_path(root, path);
2192
2193 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2194 inode, dir->i_ino);
2195 BUG_ON(ret != 0 && ret != -ENOENT);
2196 if (ret != -ENOENT)
2197 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2198
2199 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2200 dir, index);
2201 BUG_ON(ret);
2202err:
2203 btrfs_free_path(path);
2204 if (ret)
2205 goto out;
2206
2207 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2208 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2209 btrfs_update_inode(trans, root, dir);
2210 btrfs_drop_nlink(inode);
2211 ret = btrfs_update_inode(trans, root, inode);
2212 dir->i_sb->s_dirt = 1;
2213out:
2214 return ret;
2215}
2216
2217static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2218{
2219 struct btrfs_root *root;
2220 struct btrfs_trans_handle *trans;
2221 struct inode *inode = dentry->d_inode;
2222 int ret;
2223 unsigned long nr = 0;
2224
2225 root = BTRFS_I(dir)->root;
2226
2227 ret = btrfs_check_free_space(root, 1, 1);
2228 if (ret)
2229 goto fail;
2230
2231 trans = btrfs_start_transaction(root, 1);
2232
2233 btrfs_set_trans_block_group(trans, dir);
2234 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2235 dentry->d_name.name, dentry->d_name.len);
2236
2237 if (inode->i_nlink == 0)
2238 ret = btrfs_orphan_add(trans, inode);
2239
2240 nr = trans->blocks_used;
2241
2242 btrfs_end_transaction_throttle(trans, root);
2243fail:
2244 btrfs_btree_balance_dirty(root, nr);
2245 return ret;
2246}
2247
2248static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2249{
2250 struct inode *inode = dentry->d_inode;
2251 int err = 0;
2252 int ret;
2253 struct btrfs_root *root = BTRFS_I(dir)->root;
2254 struct btrfs_trans_handle *trans;
2255 unsigned long nr = 0;
2256
2257 /*
2258 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2259 * the root of a subvolume or snapshot
2260 */
2261 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2262 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2263 return -ENOTEMPTY;
2264 }
2265
2266 ret = btrfs_check_free_space(root, 1, 1);
2267 if (ret)
2268 goto fail;
2269
2270 trans = btrfs_start_transaction(root, 1);
2271 btrfs_set_trans_block_group(trans, dir);
2272
2273 err = btrfs_orphan_add(trans, inode);
2274 if (err)
2275 goto fail_trans;
2276
2277 /* now the directory is empty */
2278 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2279 dentry->d_name.name, dentry->d_name.len);
2280 if (!err)
2281 btrfs_i_size_write(inode, 0);
2282
2283fail_trans:
2284 nr = trans->blocks_used;
2285 ret = btrfs_end_transaction_throttle(trans, root);
2286fail:
2287 btrfs_btree_balance_dirty(root, nr);
2288
2289 if (ret && !err)
2290 err = ret;
2291 return err;
2292}
2293
#if 0
/*
 * NOTE: this function is compiled out by the surrounding #if 0.
 *
 * when truncating bytes in a file, it is possible to avoid reading
 * the leaves that contain only checksum items.  This can be the
 * majority of the IO required to delete a large file, but it must
 * be done carefully.
 *
 * The keys in the level just above the leaves are checked to make sure
 * the lowest key in a given leaf is a csum key, and starts at an offset
 * after the new size.
 *
 * Then the key for the next leaf is checked to make sure it also has
 * a checksum item for the same file.  If it does, we know our target leaf
 * contains only checksum items, and it can be safely freed without reading
 * it.
 *
 * This is just an optimization targeted at large files.  It may do
 * nothing.  It will return 0 unless things went badly.
 */
static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct inode *inode, u64 new_size)
{
	struct btrfs_key key;
	int ret;
	int nritems;
	struct btrfs_key found_key;
	struct btrfs_key other_key;
	struct btrfs_leaf_ref *ref;
	u64 leaf_gen;
	u64 leaf_start;

	/* stop the searches at level 1, the nodes directly above the leaves */
	path->lowest_level = 1;
	key.objectid = inode->i_ino;
	key.type = BTRFS_CSUM_ITEM_KEY;
	key.offset = new_size;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	/* no level 1 node in the path, nothing to examine */
	if (path->nodes[1] == NULL) {
		ret = 0;
		goto out;
	}
	ret = 0;
	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
	nritems = btrfs_header_nritems(path->nodes[1]);

	if (!nritems)
		goto out;

	if (path->slots[1] >= nritems)
		goto next_node;

	/* did we find a key greater than anything we want to delete? */
	if (found_key.objectid > inode->i_ino ||
	   (found_key.objectid == inode->i_ino && found_key.type > key.type))
		goto out;

	/* we check the next key in the node to make sure the leaf contains
	 * only checksum items.  This comparison doesn't work if our
	 * leaf is the last one in the node
	 */
	if (path->slots[1] + 1 >= nritems) {
next_node:
		/* search forward from the last key in the node, this
		 * will bring us into the next node in the tree
		 */
		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);

		/* unlikely, but we inc below, so check to be safe */
		if (found_key.offset == (u64)-1)
			goto out;

		/* search_forward needs a path with locks held, do the
		 * search again for the original key.  It is possible
		 * this will race with a balance and return a path that
		 * we could modify, but this drop is just an optimization
		 * and is allowed to miss some leaves.
		 */
		btrfs_release_path(root, path);
		found_key.offset++;

		/* setup a max key for search_forward */
		other_key.offset = (u64)-1;
		other_key.type = key.type;
		other_key.objectid = key.objectid;

		path->keep_locks = 1;
		ret = btrfs_search_forward(root, &found_key, &other_key,
					   path, 0, 0);
		path->keep_locks = 0;
		/* anything other than a csum key of this inode ends the walk */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			goto out;
		}

		/* restart the main search from the key found in the next node */
		key.offset = found_key.offset;
		btrfs_release_path(root, path);
		cond_resched();
		goto again;
	}

	/* we know there's one more slot after us in the tree,
	 * read that key so we can verify it is also a checksum item
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);

	/* this leaf starts with a key of a lower objectid, skip it */
	if (found_key.objectid < inode->i_ino)
		goto next_key;

	/* this leaf does not start with a csum key at or past new_size */
	if (found_key.type != key.type || found_key.offset < new_size)
		goto next_key;

	/*
	 * if the key for the next leaf isn't a csum key from this objectid,
	 * we can't be sure there aren't good items inside this leaf.
	 * Bail out
	 */
	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
		goto out;

	leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
	/*
	 * it is safe to delete this leaf, it contains only
	 * csum items from this inode at an offset >= new_size
	 */
	ret = btrfs_del_leaf(trans, root, path, leaf_start);
	BUG_ON(ret);

	/*
	 * for ref_cows roots, leaves whose pointer generation is older than
	 * this transaction get a leaf ref with zero items recorded
	 */
	if (root->ref_cows && leaf_gen < trans->transid) {
		ref = btrfs_alloc_leaf_ref(root, 0);
		if (ref) {
			ref->root_gen = root->root_key.offset;
			ref->bytenr = leaf_start;
			ref->owner = 0;
			ref->generation = leaf_gen;
			ref->nritems = 0;

			ret = btrfs_add_leaf_ref(root, ref, 0);
			WARN_ON(ret);
			btrfs_free_leaf_ref(root, ref);
		} else {
			WARN_ON(1);
		}
	}
next_key:
	btrfs_release_path(root, path);

	/* keep going while the next leaf starts with a later csum key of ours */
	if (other_key.objectid == inode->i_ino &&
	    other_key.type == key.type && other_key.offset > key.offset) {
		key.offset = other_key.offset;
		cond_resched();
		goto again;
	}
	ret = 0;
out:
	/* fixup any changes we've made to the path */
	path->lowest_level = 0;
	path->keep_locks = 0;
	btrfs_release_path(root, path);
	return ret;
}

#endif
2463
2464/*
2465 * this can truncate away extent items, csum items and directory items.
2466 * It starts at a high offset and removes keys until it can't find
2467 * any higher than new_size
2468 *
2469 * csum items that cross the new i_size are truncated to the new size
2470 * as well.
2471 *
2472 * min_type is the minimum key type to truncate down to. If set to 0, this
2473 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2474 */
2475noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_root *root,
2477 struct inode *inode,
2478 u64 new_size, u32 min_type)
2479{
2480 int ret;
2481 struct btrfs_path *path;
2482 struct btrfs_key key;
2483 struct btrfs_key found_key;
2484 u32 found_type;
2485 struct extent_buffer *leaf;
2486 struct btrfs_file_extent_item *fi;
2487 u64 extent_start = 0;
2488 u64 extent_num_bytes = 0;
2489 u64 item_end = 0;
2490 u64 root_gen = 0;
2491 u64 root_owner = 0;
2492 int found_extent;
2493 int del_item;
2494 int pending_del_nr = 0;
2495 int pending_del_slot = 0;
2496 int extent_type = -1;
2497 int encoding;
2498 u64 mask = root->sectorsize - 1;
2499
2500 if (root->ref_cows)
2501 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2502 path = btrfs_alloc_path();
2503 path->reada = -1;
2504 BUG_ON(!path);
2505
2506 /* FIXME, add redo link to tree so we don't leak on crash */
2507 key.objectid = inode->i_ino;
2508 key.offset = (u64)-1;
2509 key.type = (u8)-1;
2510
2511 btrfs_init_path(path);
2512
2513search_again:
2514 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2515 if (ret < 0)
2516 goto error;
2517
2518 if (ret > 0) {
2519 /* there are no items in the tree for us to truncate, we're
2520 * done
2521 */
2522 if (path->slots[0] == 0) {
2523 ret = 0;
2524 goto error;
2525 }
2526 path->slots[0]--;
2527 }
2528
2529 while (1) {
2530 fi = NULL;
2531 leaf = path->nodes[0];
2532 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2533 found_type = btrfs_key_type(&found_key);
2534 encoding = 0;
2535
2536 if (found_key.objectid != inode->i_ino)
2537 break;
2538
2539 if (found_type < min_type)
2540 break;
2541
2542 item_end = found_key.offset;
2543 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2544 fi = btrfs_item_ptr(leaf, path->slots[0],
2545 struct btrfs_file_extent_item);
2546 extent_type = btrfs_file_extent_type(leaf, fi);
2547 encoding = btrfs_file_extent_compression(leaf, fi);
2548 encoding |= btrfs_file_extent_encryption(leaf, fi);
2549 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2550
2551 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2552 item_end +=
2553 btrfs_file_extent_num_bytes(leaf, fi);
2554 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2555 item_end += btrfs_file_extent_inline_len(leaf,
2556 fi);
2557 }
2558 item_end--;
2559 }
2560 if (item_end < new_size) {
2561 if (found_type == BTRFS_DIR_ITEM_KEY)
2562 found_type = BTRFS_INODE_ITEM_KEY;
2563 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2564 found_type = BTRFS_EXTENT_DATA_KEY;
2565 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2566 found_type = BTRFS_XATTR_ITEM_KEY;
2567 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2568 found_type = BTRFS_INODE_REF_KEY;
2569 else if (found_type)
2570 found_type--;
2571 else
2572 break;
2573 btrfs_set_key_type(&key, found_type);
2574 goto next;
2575 }
2576 if (found_key.offset >= new_size)
2577 del_item = 1;
2578 else
2579 del_item = 0;
2580 found_extent = 0;
2581
2582 /* FIXME, shrink the extent if the ref count is only 1 */
2583 if (found_type != BTRFS_EXTENT_DATA_KEY)
2584 goto delete;
2585
2586 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2587 u64 num_dec;
2588 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2589 if (!del_item && !encoding) {
2590 u64 orig_num_bytes =
2591 btrfs_file_extent_num_bytes(leaf, fi);
2592 extent_num_bytes = new_size -
2593 found_key.offset + root->sectorsize - 1;
2594 extent_num_bytes = extent_num_bytes &
2595 ~((u64)root->sectorsize - 1);
2596 btrfs_set_file_extent_num_bytes(leaf, fi,
2597 extent_num_bytes);
2598 num_dec = (orig_num_bytes -
2599 extent_num_bytes);
2600 if (root->ref_cows && extent_start != 0)
2601 inode_sub_bytes(inode, num_dec);
2602 btrfs_mark_buffer_dirty(leaf);
2603 } else {
2604 extent_num_bytes =
2605 btrfs_file_extent_disk_num_bytes(leaf,
2606 fi);
2607 /* FIXME blocksize != 4096 */
2608 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2609 if (extent_start != 0) {
2610 found_extent = 1;
2611 if (root->ref_cows)
2612 inode_sub_bytes(inode, num_dec);
2613 }
2614 root_gen = btrfs_header_generation(leaf);
2615 root_owner = btrfs_header_owner(leaf);
2616 }
2617 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2618 /*
2619 * we can't truncate inline items that have had
2620 * special encodings
2621 */
2622 if (!del_item &&
2623 btrfs_file_extent_compression(leaf, fi) == 0 &&
2624 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2625 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2626 u32 size = new_size - found_key.offset;
2627
2628 if (root->ref_cows) {
2629 inode_sub_bytes(inode, item_end + 1 -
2630 new_size);
2631 }
2632 size =
2633 btrfs_file_extent_calc_inline_size(size);
2634 ret = btrfs_truncate_item(trans, root, path,
2635 size, 1);
2636 BUG_ON(ret);
2637 } else if (root->ref_cows) {
2638 inode_sub_bytes(inode, item_end + 1 -
2639 found_key.offset);
2640 }
2641 }
2642delete:
2643 if (del_item) {
2644 if (!pending_del_nr) {
2645 /* no pending yet, add ourselves */
2646 pending_del_slot = path->slots[0];
2647 pending_del_nr = 1;
2648 } else if (pending_del_nr &&
2649 path->slots[0] + 1 == pending_del_slot) {
2650 /* hop on the pending chunk */
2651 pending_del_nr++;
2652 pending_del_slot = path->slots[0];
2653 } else {
2654 BUG();
2655 }
2656 } else {
2657 break;
2658 }
2659 if (found_extent) {
2660 ret = btrfs_free_extent(trans, root, extent_start,
2661 extent_num_bytes,
2662 leaf->start, root_owner,
2663 root_gen, inode->i_ino, 0);
2664 BUG_ON(ret);
2665 }
2666next:
2667 if (path->slots[0] == 0) {
2668 if (pending_del_nr)
2669 goto del_pending;
2670 btrfs_release_path(root, path);
2671 goto search_again;
2672 }
2673
2674 path->slots[0]--;
2675 if (pending_del_nr &&
2676 path->slots[0] + 1 != pending_del_slot) {
2677 struct btrfs_key debug;
2678del_pending:
2679 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2680 pending_del_slot);
2681 ret = btrfs_del_items(trans, root, path,
2682 pending_del_slot,
2683 pending_del_nr);
2684 BUG_ON(ret);
2685 pending_del_nr = 0;
2686 btrfs_release_path(root, path);
2687 goto search_again;
2688 }
2689 }
2690 ret = 0;
2691error:
2692 if (pending_del_nr) {
2693 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2694 pending_del_nr);
2695 }
2696 btrfs_free_path(path);
2697 inode->i_sb->s_dirt = 1;
2698 return ret;
2699}
2700
2701/*
2702 * taken from block_truncate_page, but does cow as it zeros out
2703 * any bytes left in the last page in the file.
2704 */
2705static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2706{
2707 struct inode *inode = mapping->host;
2708 struct btrfs_root *root = BTRFS_I(inode)->root;
2709 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2710 struct btrfs_ordered_extent *ordered;
2711 char *kaddr;
2712 u32 blocksize = root->sectorsize;
2713 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2714 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2715 struct page *page;
2716 int ret = 0;
2717 u64 page_start;
2718 u64 page_end;
2719
2720 if ((offset & (blocksize - 1)) == 0)
2721 goto out;
2722
2723 ret = -ENOMEM;
2724again:
2725 page = grab_cache_page(mapping, index);
2726 if (!page)
2727 goto out;
2728
2729 page_start = page_offset(page);
2730 page_end = page_start + PAGE_CACHE_SIZE - 1;
2731
2732 if (!PageUptodate(page)) {
2733 ret = btrfs_readpage(NULL, page);
2734 lock_page(page);
2735 if (page->mapping != mapping) {
2736 unlock_page(page);
2737 page_cache_release(page);
2738 goto again;
2739 }
2740 if (!PageUptodate(page)) {
2741 ret = -EIO;
2742 goto out_unlock;
2743 }
2744 }
2745 wait_on_page_writeback(page);
2746
2747 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 set_page_extent_mapped(page);
2749
2750 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2751 if (ordered) {
2752 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2753 unlock_page(page);
2754 page_cache_release(page);
2755 btrfs_start_ordered_extent(inode, ordered, 1);
2756 btrfs_put_ordered_extent(ordered);
2757 goto again;
2758 }
2759
2760 btrfs_set_extent_delalloc(inode, page_start, page_end);
2761 ret = 0;
2762 if (offset != PAGE_CACHE_SIZE) {
2763 kaddr = kmap(page);
2764 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2765 flush_dcache_page(page);
2766 kunmap(page);
2767 }
2768 ClearPageChecked(page);
2769 set_page_dirty(page);
2770 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2771
2772out_unlock:
2773 unlock_page(page);
2774 page_cache_release(page);
2775out:
2776 return ret;
2777}
2778
2779int btrfs_cont_expand(struct inode *inode, loff_t size)
2780{
2781 struct btrfs_trans_handle *trans;
2782 struct btrfs_root *root = BTRFS_I(inode)->root;
2783 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2784 struct extent_map *em;
2785 u64 mask = root->sectorsize - 1;
2786 u64 hole_start = (inode->i_size + mask) & ~mask;
2787 u64 block_end = (size + mask) & ~mask;
2788 u64 last_byte;
2789 u64 cur_offset;
2790 u64 hole_size;
2791 int err;
2792
2793 if (size <= hole_start)
2794 return 0;
2795
2796 err = btrfs_check_free_space(root, 1, 0);
2797 if (err)
2798 return err;
2799
2800 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2801
2802 while (1) {
2803 struct btrfs_ordered_extent *ordered;
2804 btrfs_wait_ordered_range(inode, hole_start,
2805 block_end - hole_start);
2806 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2807 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2808 if (!ordered)
2809 break;
2810 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2811 btrfs_put_ordered_extent(ordered);
2812 }
2813
2814 trans = btrfs_start_transaction(root, 1);
2815 btrfs_set_trans_block_group(trans, inode);
2816
2817 cur_offset = hole_start;
2818 while (1) {
2819 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2820 block_end - cur_offset, 0);
2821 BUG_ON(IS_ERR(em) || !em);
2822 last_byte = min(extent_map_end(em), block_end);
2823 last_byte = (last_byte + mask) & ~mask;
2824 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2825 u64 hint_byte = 0;
2826 hole_size = last_byte - cur_offset;
2827 err = btrfs_drop_extents(trans, root, inode,
2828 cur_offset,
2829 cur_offset + hole_size,
2830 cur_offset, &hint_byte);
2831 if (err)
2832 break;
2833 err = btrfs_insert_file_extent(trans, root,
2834 inode->i_ino, cur_offset, 0,
2835 0, hole_size, 0, hole_size,
2836 0, 0, 0);
2837 btrfs_drop_extent_cache(inode, hole_start,
2838 last_byte - 1, 0);
2839 }
2840 free_extent_map(em);
2841 cur_offset = last_byte;
2842 if (err || cur_offset >= block_end)
2843 break;
2844 }
2845
2846 btrfs_end_transaction(trans, root);
2847 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2848 return err;
2849}
2850
2851static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2852{
2853 struct inode *inode = dentry->d_inode;
2854 int err;
2855
2856 err = inode_change_ok(inode, attr);
2857 if (err)
2858 return err;
2859
2860 if (S_ISREG(inode->i_mode) &&
2861 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2862 err = btrfs_cont_expand(inode, attr->ia_size);
2863 if (err)
2864 return err;
2865 }
2866
2867 err = inode_setattr(inode, attr);
2868
2869 if (!err && ((attr->ia_valid & ATTR_MODE)))
2870 err = btrfs_acl_chmod(inode);
2871 return err;
2872}
2873
2874void btrfs_delete_inode(struct inode *inode)
2875{
2876 struct btrfs_trans_handle *trans;
2877 struct btrfs_root *root = BTRFS_I(inode)->root;
2878 unsigned long nr;
2879 int ret;
2880
2881 truncate_inode_pages(&inode->i_data, 0);
2882 if (is_bad_inode(inode)) {
2883 btrfs_orphan_del(NULL, inode);
2884 goto no_delete;
2885 }
2886 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2887
2888 btrfs_i_size_write(inode, 0);
2889 trans = btrfs_start_transaction(root, 1);
2890
2891 btrfs_set_trans_block_group(trans, inode);
2892 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2893 if (ret) {
2894 btrfs_orphan_del(NULL, inode);
2895 goto no_delete_lock;
2896 }
2897
2898 btrfs_orphan_del(trans, inode);
2899
2900 nr = trans->blocks_used;
2901 clear_inode(inode);
2902
2903 btrfs_end_transaction(trans, root);
2904 btrfs_btree_balance_dirty(root, nr);
2905 return;
2906
2907no_delete_lock:
2908 nr = trans->blocks_used;
2909 btrfs_end_transaction(trans, root);
2910 btrfs_btree_balance_dirty(root, nr);
2911no_delete:
2912 clear_inode(inode);
2913}
2914
2915/*
2916 * this returns the key found in the dir entry in the location pointer.
2917 * If no dir entries were found, location->objectid is 0.
2918 */
2919static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2920 struct btrfs_key *location)
2921{
2922 const char *name = dentry->d_name.name;
2923 int namelen = dentry->d_name.len;
2924 struct btrfs_dir_item *di;
2925 struct btrfs_path *path;
2926 struct btrfs_root *root = BTRFS_I(dir)->root;
2927 int ret = 0;
2928
2929 path = btrfs_alloc_path();
2930 BUG_ON(!path);
2931
2932 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2933 namelen, 0);
2934 if (IS_ERR(di))
2935 ret = PTR_ERR(di);
2936
2937 if (!di || IS_ERR(di))
2938 goto out_err;
2939
2940 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2941out:
2942 btrfs_free_path(path);
2943 return ret;
2944out_err:
2945 location->objectid = 0;
2946 goto out;
2947}
2948
2949/*
2950 * when we hit a tree root in a directory, the btrfs part of the inode
2951 * needs to be changed to reflect the root directory of the tree root. This
2952 * is kind of like crossing a mount point.
2953 */
2954static int fixup_tree_root_location(struct btrfs_root *root,
2955 struct btrfs_key *location,
2956 struct btrfs_root **sub_root,
2957 struct dentry *dentry)
2958{
2959 struct btrfs_root_item *ri;
2960
2961 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2962 return 0;
2963 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2964 return 0;
2965
2966 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2967 dentry->d_name.name,
2968 dentry->d_name.len);
2969 if (IS_ERR(*sub_root))
2970 return PTR_ERR(*sub_root);
2971
2972 ri = &(*sub_root)->root_item;
2973 location->objectid = btrfs_root_dirid(ri);
2974 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2975 location->offset = 0;
2976
2977 return 0;
2978}
2979
2980static noinline void init_btrfs_i(struct inode *inode)
2981{
2982 struct btrfs_inode *bi = BTRFS_I(inode);
2983
2984 bi->i_acl = NULL;
2985 bi->i_default_acl = NULL;
2986
2987 bi->generation = 0;
2988 bi->sequence = 0;
2989 bi->last_trans = 0;
2990 bi->logged_trans = 0;
2991 bi->delalloc_bytes = 0;
2992 bi->disk_i_size = 0;
2993 bi->flags = 0;
2994 bi->index_cnt = (u64)-1;
2995 bi->log_dirty_trans = 0;
2996 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2997 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2998 inode->i_mapping, GFP_NOFS);
2999 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3000 inode->i_mapping, GFP_NOFS);
3001 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3002 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3003 mutex_init(&BTRFS_I(inode)->extent_mutex);
3004 mutex_init(&BTRFS_I(inode)->log_mutex);
3005}
3006
3007static int btrfs_init_locked_inode(struct inode *inode, void *p)
3008{
3009 struct btrfs_iget_args *args = p;
3010 inode->i_ino = args->ino;
3011 init_btrfs_i(inode);
3012 BTRFS_I(inode)->root = args->root;
3013 return 0;
3014}
3015
3016static int btrfs_find_actor(struct inode *inode, void *opaque)
3017{
3018 struct btrfs_iget_args *args = opaque;
3019 return args->ino == inode->i_ino &&
3020 args->root == BTRFS_I(inode)->root;
3021}
3022
3023struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3024 struct btrfs_root *root, int wait)
3025{
3026 struct inode *inode;
3027 struct btrfs_iget_args args;
3028 args.ino = objectid;
3029 args.root = root;
3030
3031 if (wait) {
3032 inode = ilookup5(s, objectid, btrfs_find_actor,
3033 (void *)&args);
3034 } else {
3035 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3036 (void *)&args);
3037 }
3038 return inode;
3039}
3040
3041struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3042 struct btrfs_root *root)
3043{
3044 struct inode *inode;
3045 struct btrfs_iget_args args;
3046 args.ino = objectid;
3047 args.root = root;
3048
3049 inode = iget5_locked(s, objectid, btrfs_find_actor,
3050 btrfs_init_locked_inode,
3051 (void *)&args);
3052 return inode;
3053}
3054
3055/* Get an inode object given its location and corresponding root.
3056 * Returns in *is_new if the inode was read from disk
3057 */
3058struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3059 struct btrfs_root *root, int *is_new)
3060{
3061 struct inode *inode;
3062
3063 inode = btrfs_iget_locked(s, location->objectid, root);
3064 if (!inode)
3065 return ERR_PTR(-EACCES);
3066
3067 if (inode->i_state & I_NEW) {
3068 BTRFS_I(inode)->root = root;
3069 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3070 btrfs_read_locked_inode(inode);
3071 unlock_new_inode(inode);
3072 if (is_new)
3073 *is_new = 1;
3074 } else {
3075 if (is_new)
3076 *is_new = 0;
3077 }
3078
3079 return inode;
3080}
3081
3082struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3083{
3084 struct inode *inode;
3085 struct btrfs_inode *bi = BTRFS_I(dir);
3086 struct btrfs_root *root = bi->root;
3087 struct btrfs_root *sub_root = root;
3088 struct btrfs_key location;
3089 int ret, new;
3090
3091 if (dentry->d_name.len > BTRFS_NAME_LEN)
3092 return ERR_PTR(-ENAMETOOLONG);
3093
3094 ret = btrfs_inode_by_name(dir, dentry, &location);
3095
3096 if (ret < 0)
3097 return ERR_PTR(ret);
3098
3099 inode = NULL;
3100 if (location.objectid) {
3101 ret = fixup_tree_root_location(root, &location, &sub_root,
3102 dentry);
3103 if (ret < 0)
3104 return ERR_PTR(ret);
3105 if (ret > 0)
3106 return ERR_PTR(-ENOENT);
3107 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3108 if (IS_ERR(inode))
3109 return ERR_CAST(inode);
3110 }
3111 return inode;
3112}
3113
3114static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3115 struct nameidata *nd)
3116{
3117 struct inode *inode;
3118
3119 if (dentry->d_name.len > BTRFS_NAME_LEN)
3120 return ERR_PTR(-ENAMETOOLONG);
3121
3122 inode = btrfs_lookup_dentry(dir, dentry);
3123 if (IS_ERR(inode))
3124 return ERR_CAST(inode);
3125
3126 return d_splice_alias(inode, dentry);
3127}
3128
3129static unsigned char btrfs_filetype_table[] = {
3130 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3131};
3132
3133static int btrfs_real_readdir(struct file *filp, void *dirent,
3134 filldir_t filldir)
3135{
3136 struct inode *inode = filp->f_dentry->d_inode;
3137 struct btrfs_root *root = BTRFS_I(inode)->root;
3138 struct btrfs_item *item;
3139 struct btrfs_dir_item *di;
3140 struct btrfs_key key;
3141 struct btrfs_key found_key;
3142 struct btrfs_path *path;
3143 int ret;
3144 u32 nritems;
3145 struct extent_buffer *leaf;
3146 int slot;
3147 int advance;
3148 unsigned char d_type;
3149 int over = 0;
3150 u32 di_cur;
3151 u32 di_total;
3152 u32 di_len;
3153 int key_type = BTRFS_DIR_INDEX_KEY;
3154 char tmp_name[32];
3155 char *name_ptr;
3156 int name_len;
3157
3158 /* FIXME, use a real flag for deciding about the key type */
3159 if (root->fs_info->tree_root == root)
3160 key_type = BTRFS_DIR_ITEM_KEY;
3161
3162 /* special case for "." */
3163 if (filp->f_pos == 0) {
3164 over = filldir(dirent, ".", 1,
3165 1, inode->i_ino,
3166 DT_DIR);
3167 if (over)
3168 return 0;
3169 filp->f_pos = 1;
3170 }
3171 /* special case for .., just use the back ref */
3172 if (filp->f_pos == 1) {
3173 u64 pino = parent_ino(filp->f_path.dentry);
3174 over = filldir(dirent, "..", 2,
3175 2, pino, DT_DIR);
3176 if (over)
3177 return 0;
3178 filp->f_pos = 2;
3179 }
3180 path = btrfs_alloc_path();
3181 path->reada = 2;
3182
3183 btrfs_set_key_type(&key, key_type);
3184 key.offset = filp->f_pos;
3185 key.objectid = inode->i_ino;
3186
3187 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3188 if (ret < 0)
3189 goto err;
3190 advance = 0;
3191
3192 while (1) {
3193 leaf = path->nodes[0];
3194 nritems = btrfs_header_nritems(leaf);
3195 slot = path->slots[0];
3196 if (advance || slot >= nritems) {
3197 if (slot >= nritems - 1) {
3198 ret = btrfs_next_leaf(root, path);
3199 if (ret)
3200 break;
3201 leaf = path->nodes[0];
3202 nritems = btrfs_header_nritems(leaf);
3203 slot = path->slots[0];
3204 } else {
3205 slot++;
3206 path->slots[0]++;
3207 }
3208 }
3209
3210 advance = 1;
3211 item = btrfs_item_nr(leaf, slot);
3212 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3213
3214 if (found_key.objectid != key.objectid)
3215 break;
3216 if (btrfs_key_type(&found_key) != key_type)
3217 break;
3218 if (found_key.offset < filp->f_pos)
3219 continue;
3220
3221 filp->f_pos = found_key.offset;
3222
3223 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3224 di_cur = 0;
3225 di_total = btrfs_item_size(leaf, item);
3226
3227 while (di_cur < di_total) {
3228 struct btrfs_key location;
3229
3230 name_len = btrfs_dir_name_len(leaf, di);
3231 if (name_len <= sizeof(tmp_name)) {
3232 name_ptr = tmp_name;
3233 } else {
3234 name_ptr = kmalloc(name_len, GFP_NOFS);
3235 if (!name_ptr) {
3236 ret = -ENOMEM;
3237 goto err;
3238 }
3239 }
3240 read_extent_buffer(leaf, name_ptr,
3241 (unsigned long)(di + 1), name_len);
3242
3243 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3244 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3245
3246 /* is this a reference to our own snapshot? If so
3247 * skip it
3248 */
3249 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3250 location.objectid == root->root_key.objectid) {
3251 over = 0;
3252 goto skip;
3253 }
3254 over = filldir(dirent, name_ptr, name_len,
3255 found_key.offset, location.objectid,
3256 d_type);
3257
3258skip:
3259 if (name_ptr != tmp_name)
3260 kfree(name_ptr);
3261
3262 if (over)
3263 goto nopos;
3264 di_len = btrfs_dir_name_len(leaf, di) +
3265 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3266 di_cur += di_len;
3267 di = (struct btrfs_dir_item *)((char *)di + di_len);
3268 }
3269 }
3270
3271 /* Reached end of directory/root. Bump pos past the last item. */
3272 if (key_type == BTRFS_DIR_INDEX_KEY)
3273 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3274 else
3275 filp->f_pos++;
3276nopos:
3277 ret = 0;
3278err:
3279 btrfs_free_path(path);
3280 return ret;
3281}
3282
3283int btrfs_write_inode(struct inode *inode, int wait)
3284{
3285 struct btrfs_root *root = BTRFS_I(inode)->root;
3286 struct btrfs_trans_handle *trans;
3287 int ret = 0;
3288
3289 if (root->fs_info->btree_inode == inode)
3290 return 0;
3291
3292 if (wait) {
3293 trans = btrfs_join_transaction(root, 1);
3294 btrfs_set_trans_block_group(trans, inode);
3295 ret = btrfs_commit_transaction(trans, root);
3296 }
3297 return ret;
3298}
3299
3300/*
3301 * This is somewhat expensive, updating the tree every time the
3302 * inode changes. But, it is most likely to find the inode in cache.
3303 * FIXME, needs more benchmarking...there are no reasons other than performance
3304 * to keep or drop this code.
3305 */
3306void btrfs_dirty_inode(struct inode *inode)
3307{
3308 struct btrfs_root *root = BTRFS_I(inode)->root;
3309 struct btrfs_trans_handle *trans;
3310
3311 trans = btrfs_join_transaction(root, 1);
3312 btrfs_set_trans_block_group(trans, inode);
3313 btrfs_update_inode(trans, root, inode);
3314 btrfs_end_transaction(trans, root);
3315}
3316
3317/*
3318 * find the highest existing sequence number in a directory
3319 * and then set the in-memory index_cnt variable to reflect
3320 * free sequence numbers
3321 */
3322static int btrfs_set_inode_index_count(struct inode *inode)
3323{
3324 struct btrfs_root *root = BTRFS_I(inode)->root;
3325 struct btrfs_key key, found_key;
3326 struct btrfs_path *path;
3327 struct extent_buffer *leaf;
3328 int ret;
3329
3330 key.objectid = inode->i_ino;
3331 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3332 key.offset = (u64)-1;
3333
3334 path = btrfs_alloc_path();
3335 if (!path)
3336 return -ENOMEM;
3337
3338 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3339 if (ret < 0)
3340 goto out;
3341 /* FIXME: we should be able to handle this */
3342 if (ret == 0)
3343 goto out;
3344 ret = 0;
3345
3346 /*
3347 * MAGIC NUMBER EXPLANATION:
3348 * since we search a directory based on f_pos we have to start at 2
3349 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3350 * else has to start at 2
3351 */
3352 if (path->slots[0] == 0) {
3353 BTRFS_I(inode)->index_cnt = 2;
3354 goto out;
3355 }
3356
3357 path->slots[0]--;
3358
3359 leaf = path->nodes[0];
3360 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3361
3362 if (found_key.objectid != inode->i_ino ||
3363 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3364 BTRFS_I(inode)->index_cnt = 2;
3365 goto out;
3366 }
3367
3368 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3369out:
3370 btrfs_free_path(path);
3371 return ret;
3372}
3373
3374/*
3375 * helper to find a free sequence number in a given directory. This current
3376 * code is very simple, later versions will do smarter things in the btree
3377 */
3378int btrfs_set_inode_index(struct inode *dir, u64 *index)
3379{
3380 int ret = 0;
3381
3382 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3383 ret = btrfs_set_inode_index_count(dir);
3384 if (ret)
3385 return ret;
3386 }
3387
3388 *index = BTRFS_I(dir)->index_cnt;
3389 BTRFS_I(dir)->index_cnt++;
3390
3391 return ret;
3392}
3393
3394static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3395 struct btrfs_root *root,
3396 struct inode *dir,
3397 const char *name, int name_len,
3398 u64 ref_objectid, u64 objectid,
3399 u64 alloc_hint, int mode, u64 *index)
3400{
3401 struct inode *inode;
3402 struct btrfs_inode_item *inode_item;
3403 struct btrfs_key *location;
3404 struct btrfs_path *path;
3405 struct btrfs_inode_ref *ref;
3406 struct btrfs_key key[2];
3407 u32 sizes[2];
3408 unsigned long ptr;
3409 int ret;
3410 int owner;
3411
3412 path = btrfs_alloc_path();
3413 BUG_ON(!path);
3414
3415 inode = new_inode(root->fs_info->sb);
3416 if (!inode)
3417 return ERR_PTR(-ENOMEM);
3418
3419 if (dir) {
3420 ret = btrfs_set_inode_index(dir, index);
3421 if (ret)
3422 return ERR_PTR(ret);
3423 }
3424 /*
3425 * index_cnt is ignored for everything but a dir,
3426 * btrfs_get_inode_index_count has an explanation for the magic
3427 * number
3428 */
3429 init_btrfs_i(inode);
3430 BTRFS_I(inode)->index_cnt = 2;
3431 BTRFS_I(inode)->root = root;
3432 BTRFS_I(inode)->generation = trans->transid;
3433
3434 if (mode & S_IFDIR)
3435 owner = 0;
3436 else
3437 owner = 1;
3438 BTRFS_I(inode)->block_group =
3439 btrfs_find_block_group(root, 0, alloc_hint, owner);
3440 if ((mode & S_IFREG)) {
3441 if (btrfs_test_opt(root, NODATASUM))
3442 btrfs_set_flag(inode, NODATASUM);
3443 if (btrfs_test_opt(root, NODATACOW))
3444 btrfs_set_flag(inode, NODATACOW);
3445 }
3446
3447 key[0].objectid = objectid;
3448 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3449 key[0].offset = 0;
3450
3451 key[1].objectid = objectid;
3452 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3453 key[1].offset = ref_objectid;
3454
3455 sizes[0] = sizeof(struct btrfs_inode_item);
3456 sizes[1] = name_len + sizeof(*ref);
3457
3458 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3459 if (ret != 0)
3460 goto fail;
3461
3462 if (objectid > root->highest_inode)
3463 root->highest_inode = objectid;
3464
3465 inode->i_uid = current_fsuid();
3466 inode->i_gid = current_fsgid();
3467 inode->i_mode = mode;
3468 inode->i_ino = objectid;
3469 inode_set_bytes(inode, 0);
3470 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3471 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3472 struct btrfs_inode_item);
3473 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3474
3475 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3476 struct btrfs_inode_ref);
3477 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3478 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3479 ptr = (unsigned long)(ref + 1);
3480 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3481
3482 btrfs_mark_buffer_dirty(path->nodes[0]);
3483 btrfs_free_path(path);
3484
3485 location = &BTRFS_I(inode)->location;
3486 location->objectid = objectid;
3487 location->offset = 0;
3488 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3489
3490 insert_inode_hash(inode);
3491 return inode;
3492fail:
3493 if (dir)
3494 BTRFS_I(dir)->index_cnt--;
3495 btrfs_free_path(path);
3496 return ERR_PTR(ret);
3497}
3498
3499static inline u8 btrfs_inode_type(struct inode *inode)
3500{
3501 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3502}
3503
3504/*
3505 * utility function to add 'inode' into 'parent_inode' with
3506 * a give name and a given sequence number.
3507 * if 'add_backref' is true, also insert a backref from the
3508 * inode to the parent directory.
3509 */
3510int btrfs_add_link(struct btrfs_trans_handle *trans,
3511 struct inode *parent_inode, struct inode *inode,
3512 const char *name, int name_len, int add_backref, u64 index)
3513{
3514 int ret;
3515 struct btrfs_key key;
3516 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3517
3518 key.objectid = inode->i_ino;
3519 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3520 key.offset = 0;
3521
3522 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3523 parent_inode->i_ino,
3524 &key, btrfs_inode_type(inode),
3525 index);
3526 if (ret == 0) {
3527 if (add_backref) {
3528 ret = btrfs_insert_inode_ref(trans, root,
3529 name, name_len,
3530 inode->i_ino,
3531 parent_inode->i_ino,
3532 index);
3533 }
3534 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3535 name_len * 2);
3536 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3537 ret = btrfs_update_inode(trans, root, parent_inode);
3538 }
3539 return ret;
3540}
3541
3542static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3543 struct dentry *dentry, struct inode *inode,
3544 int backref, u64 index)
3545{
3546 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3547 inode, dentry->d_name.name,
3548 dentry->d_name.len, backref, index);
3549 if (!err) {
3550 d_instantiate(dentry, inode);
3551 return 0;
3552 }
3553 if (err > 0)
3554 err = -EEXIST;
3555 return err;
3556}
3557
3558static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3559 int mode, dev_t rdev)
3560{
3561 struct btrfs_trans_handle *trans;
3562 struct btrfs_root *root = BTRFS_I(dir)->root;
3563 struct inode *inode = NULL;
3564 int err;
3565 int drop_inode = 0;
3566 u64 objectid;
3567 unsigned long nr = 0;
3568 u64 index = 0;
3569
3570 if (!new_valid_dev(rdev))
3571 return -EINVAL;
3572
3573 err = btrfs_check_free_space(root, 1, 0);
3574 if (err)
3575 goto fail;
3576
3577 trans = btrfs_start_transaction(root, 1);
3578 btrfs_set_trans_block_group(trans, dir);
3579
3580 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3581 if (err) {
3582 err = -ENOSPC;
3583 goto out_unlock;
3584 }
3585
3586 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3587 dentry->d_name.len,
3588 dentry->d_parent->d_inode->i_ino, objectid,
3589 BTRFS_I(dir)->block_group, mode, &index);
3590 err = PTR_ERR(inode);
3591 if (IS_ERR(inode))
3592 goto out_unlock;
3593
3594 err = btrfs_init_acl(inode, dir);
3595 if (err) {
3596 drop_inode = 1;
3597 goto out_unlock;
3598 }
3599
3600 btrfs_set_trans_block_group(trans, inode);
3601 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3602 if (err)
3603 drop_inode = 1;
3604 else {
3605 inode->i_op = &btrfs_special_inode_operations;
3606 init_special_inode(inode, inode->i_mode, rdev);
3607 btrfs_update_inode(trans, root, inode);
3608 }
3609 dir->i_sb->s_dirt = 1;
3610 btrfs_update_inode_block_group(trans, inode);
3611 btrfs_update_inode_block_group(trans, dir);
3612out_unlock:
3613 nr = trans->blocks_used;
3614 btrfs_end_transaction_throttle(trans, root);
3615fail:
3616 if (drop_inode) {
3617 inode_dec_link_count(inode);
3618 iput(inode);
3619 }
3620 btrfs_btree_balance_dirty(root, nr);
3621 return err;
3622}
3623
3624static int btrfs_create(struct inode *dir, struct dentry *dentry,
3625 int mode, struct nameidata *nd)
3626{
3627 struct btrfs_trans_handle *trans;
3628 struct btrfs_root *root = BTRFS_I(dir)->root;
3629 struct inode *inode = NULL;
3630 int err;
3631 int drop_inode = 0;
3632 unsigned long nr = 0;
3633 u64 objectid;
3634 u64 index = 0;
3635
3636 err = btrfs_check_free_space(root, 1, 0);
3637 if (err)
3638 goto fail;
3639 trans = btrfs_start_transaction(root, 1);
3640 btrfs_set_trans_block_group(trans, dir);
3641
3642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3643 if (err) {
3644 err = -ENOSPC;
3645 goto out_unlock;
3646 }
3647
3648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3649 dentry->d_name.len,
3650 dentry->d_parent->d_inode->i_ino,
3651 objectid, BTRFS_I(dir)->block_group, mode,
3652 &index);
3653 err = PTR_ERR(inode);
3654 if (IS_ERR(inode))
3655 goto out_unlock;
3656
3657 err = btrfs_init_acl(inode, dir);
3658 if (err) {
3659 drop_inode = 1;
3660 goto out_unlock;
3661 }
3662
3663 btrfs_set_trans_block_group(trans, inode);
3664 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3665 if (err)
3666 drop_inode = 1;
3667 else {
3668 inode->i_mapping->a_ops = &btrfs_aops;
3669 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3670 inode->i_fop = &btrfs_file_operations;
3671 inode->i_op = &btrfs_file_inode_operations;
3672 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3673 }
3674 dir->i_sb->s_dirt = 1;
3675 btrfs_update_inode_block_group(trans, inode);
3676 btrfs_update_inode_block_group(trans, dir);
3677out_unlock:
3678 nr = trans->blocks_used;
3679 btrfs_end_transaction_throttle(trans, root);
3680fail:
3681 if (drop_inode) {
3682 inode_dec_link_count(inode);
3683 iput(inode);
3684 }
3685 btrfs_btree_balance_dirty(root, nr);
3686 return err;
3687}
3688
3689static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3690 struct dentry *dentry)
3691{
3692 struct btrfs_trans_handle *trans;
3693 struct btrfs_root *root = BTRFS_I(dir)->root;
3694 struct inode *inode = old_dentry->d_inode;
3695 u64 index;
3696 unsigned long nr = 0;
3697 int err;
3698 int drop_inode = 0;
3699
3700 if (inode->i_nlink == 0)
3701 return -ENOENT;
3702
3703 btrfs_inc_nlink(inode);
3704 err = btrfs_check_free_space(root, 1, 0);
3705 if (err)
3706 goto fail;
3707 err = btrfs_set_inode_index(dir, &index);
3708 if (err)
3709 goto fail;
3710
3711 trans = btrfs_start_transaction(root, 1);
3712
3713 btrfs_set_trans_block_group(trans, dir);
3714 atomic_inc(&inode->i_count);
3715
3716 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3717
3718 if (err)
3719 drop_inode = 1;
3720
3721 dir->i_sb->s_dirt = 1;
3722 btrfs_update_inode_block_group(trans, dir);
3723 err = btrfs_update_inode(trans, root, inode);
3724
3725 if (err)
3726 drop_inode = 1;
3727
3728 nr = trans->blocks_used;
3729 btrfs_end_transaction_throttle(trans, root);
3730fail:
3731 if (drop_inode) {
3732 inode_dec_link_count(inode);
3733 iput(inode);
3734 }
3735 btrfs_btree_balance_dirty(root, nr);
3736 return err;
3737}
3738
3739static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3740{
3741 struct inode *inode = NULL;
3742 struct btrfs_trans_handle *trans;
3743 struct btrfs_root *root = BTRFS_I(dir)->root;
3744 int err = 0;
3745 int drop_on_err = 0;
3746 u64 objectid = 0;
3747 u64 index = 0;
3748 unsigned long nr = 1;
3749
3750 err = btrfs_check_free_space(root, 1, 0);
3751 if (err)
3752 goto out_unlock;
3753
3754 trans = btrfs_start_transaction(root, 1);
3755 btrfs_set_trans_block_group(trans, dir);
3756
3757 if (IS_ERR(trans)) {
3758 err = PTR_ERR(trans);
3759 goto out_unlock;
3760 }
3761
3762 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3763 if (err) {
3764 err = -ENOSPC;
3765 goto out_unlock;
3766 }
3767
3768 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3769 dentry->d_name.len,
3770 dentry->d_parent->d_inode->i_ino, objectid,
3771 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3772 &index);
3773 if (IS_ERR(inode)) {
3774 err = PTR_ERR(inode);
3775 goto out_fail;
3776 }
3777
3778 drop_on_err = 1;
3779
3780 err = btrfs_init_acl(inode, dir);
3781 if (err)
3782 goto out_fail;
3783
3784 inode->i_op = &btrfs_dir_inode_operations;
3785 inode->i_fop = &btrfs_dir_file_operations;
3786 btrfs_set_trans_block_group(trans, inode);
3787
3788 btrfs_i_size_write(inode, 0);
3789 err = btrfs_update_inode(trans, root, inode);
3790 if (err)
3791 goto out_fail;
3792
3793 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3794 inode, dentry->d_name.name,
3795 dentry->d_name.len, 0, index);
3796 if (err)
3797 goto out_fail;
3798
3799 d_instantiate(dentry, inode);
3800 drop_on_err = 0;
3801 dir->i_sb->s_dirt = 1;
3802 btrfs_update_inode_block_group(trans, inode);
3803 btrfs_update_inode_block_group(trans, dir);
3804
3805out_fail:
3806 nr = trans->blocks_used;
3807 btrfs_end_transaction_throttle(trans, root);
3808
3809out_unlock:
3810 if (drop_on_err)
3811 iput(inode);
3812 btrfs_btree_balance_dirty(root, nr);
3813 return err;
3814}
3815
3816/* helper for btfs_get_extent. Given an existing extent in the tree,
3817 * and an extent that you want to insert, deal with overlap and insert
3818 * the new extent into the tree.
3819 */
3820static int merge_extent_mapping(struct extent_map_tree *em_tree,
3821 struct extent_map *existing,
3822 struct extent_map *em,
3823 u64 map_start, u64 map_len)
3824{
3825 u64 start_diff;
3826
3827 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3828 start_diff = map_start - em->start;
3829 em->start = map_start;
3830 em->len = map_len;
3831 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3832 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3833 em->block_start += start_diff;
3834 em->block_len -= start_diff;
3835 }
3836 return add_extent_mapping(em_tree, em);
3837}
3838
3839static noinline int uncompress_inline(struct btrfs_path *path,
3840 struct inode *inode, struct page *page,
3841 size_t pg_offset, u64 extent_offset,
3842 struct btrfs_file_extent_item *item)
3843{
3844 int ret;
3845 struct extent_buffer *leaf = path->nodes[0];
3846 char *tmp;
3847 size_t max_size;
3848 unsigned long inline_size;
3849 unsigned long ptr;
3850
3851 WARN_ON(pg_offset != 0);
3852 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3853 inline_size = btrfs_file_extent_inline_item_len(leaf,
3854 btrfs_item_nr(leaf, path->slots[0]));
3855 tmp = kmalloc(inline_size, GFP_NOFS);
3856 ptr = btrfs_file_extent_inline_start(item);
3857
3858 read_extent_buffer(leaf, tmp, ptr, inline_size);
3859
3860 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3861 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3862 inline_size, max_size);
3863 if (ret) {
3864 char *kaddr = kmap_atomic(page, KM_USER0);
3865 unsigned long copy_size = min_t(u64,
3866 PAGE_CACHE_SIZE - pg_offset,
3867 max_size - extent_offset);
3868 memset(kaddr + pg_offset, 0, copy_size);
3869 kunmap_atomic(kaddr, KM_USER0);
3870 }
3871 kfree(tmp);
3872 return 0;
3873}
3874
3875/*
3876 * a bit scary, this does extent mapping from logical file offset to the disk.
3877 * the ugly parts come from merging extents from the disk with the in-ram
3878 * representation. This gets more complex because of the data=ordered code,
3879 * where the in-ram extents might be locked pending data=ordered completion.
3880 *
3881 * This also copies inline extents directly into the page.
3882 */
3883
3884struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3885 size_t pg_offset, u64 start, u64 len,
3886 int create)
3887{
3888 int ret;
3889 int err = 0;
3890 u64 bytenr;
3891 u64 extent_start = 0;
3892 u64 extent_end = 0;
3893 u64 objectid = inode->i_ino;
3894 u32 found_type;
3895 struct btrfs_path *path = NULL;
3896 struct btrfs_root *root = BTRFS_I(inode)->root;
3897 struct btrfs_file_extent_item *item;
3898 struct extent_buffer *leaf;
3899 struct btrfs_key found_key;
3900 struct extent_map *em = NULL;
3901 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3902 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3903 struct btrfs_trans_handle *trans = NULL;
3904 int compressed;
3905
3906again:
3907 spin_lock(&em_tree->lock);
3908 em = lookup_extent_mapping(em_tree, start, len);
3909 if (em)
3910 em->bdev = root->fs_info->fs_devices->latest_bdev;
3911 spin_unlock(&em_tree->lock);
3912
3913 if (em) {
3914 if (em->start > start || em->start + em->len <= start)
3915 free_extent_map(em);
3916 else if (em->block_start == EXTENT_MAP_INLINE && page)
3917 free_extent_map(em);
3918 else
3919 goto out;
3920 }
3921 em = alloc_extent_map(GFP_NOFS);
3922 if (!em) {
3923 err = -ENOMEM;
3924 goto out;
3925 }
3926 em->bdev = root->fs_info->fs_devices->latest_bdev;
3927 em->start = EXTENT_MAP_HOLE;
3928 em->orig_start = EXTENT_MAP_HOLE;
3929 em->len = (u64)-1;
3930 em->block_len = (u64)-1;
3931
3932 if (!path) {
3933 path = btrfs_alloc_path();
3934 BUG_ON(!path);
3935 }
3936
3937 ret = btrfs_lookup_file_extent(trans, root, path,
3938 objectid, start, trans != NULL);
3939 if (ret < 0) {
3940 err = ret;
3941 goto out;
3942 }
3943
3944 if (ret != 0) {
3945 if (path->slots[0] == 0)
3946 goto not_found;
3947 path->slots[0]--;
3948 }
3949
3950 leaf = path->nodes[0];
3951 item = btrfs_item_ptr(leaf, path->slots[0],
3952 struct btrfs_file_extent_item);
3953 /* are we inside the extent that was found? */
3954 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3955 found_type = btrfs_key_type(&found_key);
3956 if (found_key.objectid != objectid ||
3957 found_type != BTRFS_EXTENT_DATA_KEY) {
3958 goto not_found;
3959 }
3960
3961 found_type = btrfs_file_extent_type(leaf, item);
3962 extent_start = found_key.offset;
3963 compressed = btrfs_file_extent_compression(leaf, item);
3964 if (found_type == BTRFS_FILE_EXTENT_REG ||
3965 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3966 extent_end = extent_start +
3967 btrfs_file_extent_num_bytes(leaf, item);
3968 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3969 size_t size;
3970 size = btrfs_file_extent_inline_len(leaf, item);
3971 extent_end = (extent_start + size + root->sectorsize - 1) &
3972 ~((u64)root->sectorsize - 1);
3973 }
3974
3975 if (start >= extent_end) {
3976 path->slots[0]++;
3977 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3978 ret = btrfs_next_leaf(root, path);
3979 if (ret < 0) {
3980 err = ret;
3981 goto out;
3982 }
3983 if (ret > 0)
3984 goto not_found;
3985 leaf = path->nodes[0];
3986 }
3987 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3988 if (found_key.objectid != objectid ||
3989 found_key.type != BTRFS_EXTENT_DATA_KEY)
3990 goto not_found;
3991 if (start + len <= found_key.offset)
3992 goto not_found;
3993 em->start = start;
3994 em->len = found_key.offset - start;
3995 goto not_found_em;
3996 }
3997
3998 if (found_type == BTRFS_FILE_EXTENT_REG ||
3999 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4000 em->start = extent_start;
4001 em->len = extent_end - extent_start;
4002 em->orig_start = extent_start -
4003 btrfs_file_extent_offset(leaf, item);
4004 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4005 if (bytenr == 0) {
4006 em->block_start = EXTENT_MAP_HOLE;
4007 goto insert;
4008 }
4009 if (compressed) {
4010 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4011 em->block_start = bytenr;
4012 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4013 item);
4014 } else {
4015 bytenr += btrfs_file_extent_offset(leaf, item);
4016 em->block_start = bytenr;
4017 em->block_len = em->len;
4018 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4019 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4020 }
4021 goto insert;
4022 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4023 unsigned long ptr;
4024 char *map;
4025 size_t size;
4026 size_t extent_offset;
4027 size_t copy_size;
4028
4029 em->block_start = EXTENT_MAP_INLINE;
4030 if (!page || create) {
4031 em->start = extent_start;
4032 em->len = extent_end - extent_start;
4033 goto out;
4034 }
4035
4036 size = btrfs_file_extent_inline_len(leaf, item);
4037 extent_offset = page_offset(page) + pg_offset - extent_start;
4038 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4039 size - extent_offset);
4040 em->start = extent_start + extent_offset;
4041 em->len = (copy_size + root->sectorsize - 1) &
4042 ~((u64)root->sectorsize - 1);
4043 em->orig_start = EXTENT_MAP_INLINE;
4044 if (compressed)
4045 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4046 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4047 if (create == 0 && !PageUptodate(page)) {
4048 if (btrfs_file_extent_compression(leaf, item) ==
4049 BTRFS_COMPRESS_ZLIB) {
4050 ret = uncompress_inline(path, inode, page,
4051 pg_offset,
4052 extent_offset, item);
4053 BUG_ON(ret);
4054 } else {
4055 map = kmap(page);
4056 read_extent_buffer(leaf, map + pg_offset, ptr,
4057 copy_size);
4058 kunmap(page);
4059 }
4060 flush_dcache_page(page);
4061 } else if (create && PageUptodate(page)) {
4062 if (!trans) {
4063 kunmap(page);
4064 free_extent_map(em);
4065 em = NULL;
4066 btrfs_release_path(root, path);
4067 trans = btrfs_join_transaction(root, 1);
4068 goto again;
4069 }
4070 map = kmap(page);
4071 write_extent_buffer(leaf, map + pg_offset, ptr,
4072 copy_size);
4073 kunmap(page);
4074 btrfs_mark_buffer_dirty(leaf);
4075 }
4076 set_extent_uptodate(io_tree, em->start,
4077 extent_map_end(em) - 1, GFP_NOFS);
4078 goto insert;
4079 } else {
4080 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4081 WARN_ON(1);
4082 }
4083not_found:
4084 em->start = start;
4085 em->len = len;
4086not_found_em:
4087 em->block_start = EXTENT_MAP_HOLE;
4088 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4089insert:
4090 btrfs_release_path(root, path);
4091 if (em->start > start || extent_map_end(em) <= start) {
4092 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4093 "[%llu %llu]\n", (unsigned long long)em->start,
4094 (unsigned long long)em->len,
4095 (unsigned long long)start,
4096 (unsigned long long)len);
4097 err = -EIO;
4098 goto out;
4099 }
4100
4101 err = 0;
4102 spin_lock(&em_tree->lock);
4103 ret = add_extent_mapping(em_tree, em);
4104 /* it is possible that someone inserted the extent into the tree
4105 * while we had the lock dropped. It is also possible that
4106 * an overlapping map exists in the tree
4107 */
4108 if (ret == -EEXIST) {
4109 struct extent_map *existing;
4110
4111 ret = 0;
4112
4113 existing = lookup_extent_mapping(em_tree, start, len);
4114 if (existing && (existing->start > start ||
4115 existing->start + existing->len <= start)) {
4116 free_extent_map(existing);
4117 existing = NULL;
4118 }
4119 if (!existing) {
4120 existing = lookup_extent_mapping(em_tree, em->start,
4121 em->len);
4122 if (existing) {
4123 err = merge_extent_mapping(em_tree, existing,
4124 em, start,
4125 root->sectorsize);
4126 free_extent_map(existing);
4127 if (err) {
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 err = -EIO;
4133 free_extent_map(em);
4134 em = NULL;
4135 }
4136 } else {
4137 free_extent_map(em);
4138 em = existing;
4139 err = 0;
4140 }
4141 }
4142 spin_unlock(&em_tree->lock);
4143out:
4144 if (path)
4145 btrfs_free_path(path);
4146 if (trans) {
4147 ret = btrfs_end_transaction(trans, root);
4148 if (!err)
4149 err = ret;
4150 }
4151 if (err) {
4152 free_extent_map(em);
4153 WARN_ON(1);
4154 return ERR_PTR(err);
4155 }
4156 return em;
4157}
4158
4159static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4160 const struct iovec *iov, loff_t offset,
4161 unsigned long nr_segs)
4162{
4163 return -EINVAL;
4164}
4165
4166static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4167{
4168 return extent_bmap(mapping, iblock, btrfs_get_extent);
4169}
4170
4171int btrfs_readpage(struct file *file, struct page *page)
4172{
4173 struct extent_io_tree *tree;
4174 tree = &BTRFS_I(page->mapping->host)->io_tree;
4175 return extent_read_full_page(tree, page, btrfs_get_extent);
4176}
4177
4178static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4179{
4180 struct extent_io_tree *tree;
4181
4182
4183 if (current->flags & PF_MEMALLOC) {
4184 redirty_page_for_writepage(wbc, page);
4185 unlock_page(page);
4186 return 0;
4187 }
4188 tree = &BTRFS_I(page->mapping->host)->io_tree;
4189 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4190}
4191
4192int btrfs_writepages(struct address_space *mapping,
4193 struct writeback_control *wbc)
4194{
4195 struct extent_io_tree *tree;
4196
4197 tree = &BTRFS_I(mapping->host)->io_tree;
4198 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4199}
4200
4201static int
4202btrfs_readpages(struct file *file, struct address_space *mapping,
4203 struct list_head *pages, unsigned nr_pages)
4204{
4205 struct extent_io_tree *tree;
4206 tree = &BTRFS_I(mapping->host)->io_tree;
4207 return extent_readpages(tree, mapping, pages, nr_pages,
4208 btrfs_get_extent);
4209}
4210static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4211{
4212 struct extent_io_tree *tree;
4213 struct extent_map_tree *map;
4214 int ret;
4215
4216 tree = &BTRFS_I(page->mapping->host)->io_tree;
4217 map = &BTRFS_I(page->mapping->host)->extent_tree;
4218 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4219 if (ret == 1) {
4220 ClearPagePrivate(page);
4221 set_page_private(page, 0);
4222 page_cache_release(page);
4223 }
4224 return ret;
4225}
4226
4227static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4228{
4229 if (PageWriteback(page) || PageDirty(page))
4230 return 0;
4231 return __btrfs_releasepage(page, gfp_flags);
4232}
4233
4234static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4235{
4236 struct extent_io_tree *tree;
4237 struct btrfs_ordered_extent *ordered;
4238 u64 page_start = page_offset(page);
4239 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4240
4241 wait_on_page_writeback(page);
4242 tree = &BTRFS_I(page->mapping->host)->io_tree;
4243 if (offset) {
4244 btrfs_releasepage(page, GFP_NOFS);
4245 return;
4246 }
4247
4248 lock_extent(tree, page_start, page_end, GFP_NOFS);
4249 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4250 page_offset(page));
4251 if (ordered) {
4252 /*
4253 * IO on this page will never be started, so we need
4254 * to account for any ordered extents now
4255 */
4256 clear_extent_bit(tree, page_start, page_end,
4257 EXTENT_DIRTY | EXTENT_DELALLOC |
4258 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4259 btrfs_finish_ordered_io(page->mapping->host,
4260 page_start, page_end);
4261 btrfs_put_ordered_extent(ordered);
4262 lock_extent(tree, page_start, page_end, GFP_NOFS);
4263 }
4264 clear_extent_bit(tree, page_start, page_end,
4265 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4266 EXTENT_ORDERED,
4267 1, 1, GFP_NOFS);
4268 __btrfs_releasepage(page, GFP_NOFS);
4269
4270 ClearPageChecked(page);
4271 if (PagePrivate(page)) {
4272 ClearPagePrivate(page);
4273 set_page_private(page, 0);
4274 page_cache_release(page);
4275 }
4276}
4277
4278/*
4279 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4280 * called from a page fault handler when a page is first dirtied. Hence we must
4281 * be careful to check for EOF conditions here. We set the page up correctly
4282 * for a written page which means we get ENOSPC checking when writing into
4283 * holes and correct delalloc and unwritten extent mapping on filesystems that
4284 * support these features.
4285 *
4286 * We are not allowed to take the i_mutex here so we have to play games to
4287 * protect against truncate races as the page could now be beyond EOF. Because
4288 * vmtruncate() writes the inode size before removing pages, once we have the
4289 * page lock we can determine safely if the page is beyond EOF. If it is not
4290 * beyond EOF, then the page is guaranteed safe against truncation until we
4291 * unlock the page.
4292 */
4293int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4294{
4295 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4296 struct btrfs_root *root = BTRFS_I(inode)->root;
4297 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4298 struct btrfs_ordered_extent *ordered;
4299 char *kaddr;
4300 unsigned long zero_start;
4301 loff_t size;
4302 int ret;
4303 u64 page_start;
4304 u64 page_end;
4305
4306 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4307 if (ret)
4308 goto out;
4309
4310 ret = -EINVAL;
4311again:
4312 lock_page(page);
4313 size = i_size_read(inode);
4314 page_start = page_offset(page);
4315 page_end = page_start + PAGE_CACHE_SIZE - 1;
4316
4317 if ((page->mapping != inode->i_mapping) ||
4318 (page_start >= size)) {
4319 /* page got truncated out from underneath us */
4320 goto out_unlock;
4321 }
4322 wait_on_page_writeback(page);
4323
4324 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4325 set_page_extent_mapped(page);
4326
4327 /*
4328 * we can't set the delalloc bits if there are pending ordered
4329 * extents. Drop our locks and wait for them to finish
4330 */
4331 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4332 if (ordered) {
4333 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4334 unlock_page(page);
4335 btrfs_start_ordered_extent(inode, ordered, 1);
4336 btrfs_put_ordered_extent(ordered);
4337 goto again;
4338 }
4339
4340 btrfs_set_extent_delalloc(inode, page_start, page_end);
4341 ret = 0;
4342
4343 /* page is wholly or partially inside EOF */
4344 if (page_start + PAGE_CACHE_SIZE > size)
4345 zero_start = size & ~PAGE_CACHE_MASK;
4346 else
4347 zero_start = PAGE_CACHE_SIZE;
4348
4349 if (zero_start != PAGE_CACHE_SIZE) {
4350 kaddr = kmap(page);
4351 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4352 flush_dcache_page(page);
4353 kunmap(page);
4354 }
4355 ClearPageChecked(page);
4356 set_page_dirty(page);
4357 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4358
4359out_unlock:
4360 unlock_page(page);
4361out:
4362 return ret;
4363}
4364
4365static void btrfs_truncate(struct inode *inode)
4366{
4367 struct btrfs_root *root = BTRFS_I(inode)->root;
4368 int ret;
4369 struct btrfs_trans_handle *trans;
4370 unsigned long nr;
4371 u64 mask = root->sectorsize - 1;
4372
4373 if (!S_ISREG(inode->i_mode))
4374 return;
4375 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4376 return;
4377
4378 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4379 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4380
4381 trans = btrfs_start_transaction(root, 1);
4382 btrfs_set_trans_block_group(trans, inode);
4383 btrfs_i_size_write(inode, inode->i_size);
4384
4385 ret = btrfs_orphan_add(trans, inode);
4386 if (ret)
4387 goto out;
4388 /* FIXME, add redo link to tree so we don't leak on crash */
4389 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4390 BTRFS_EXTENT_DATA_KEY);
4391 btrfs_update_inode(trans, root, inode);
4392
4393 ret = btrfs_orphan_del(trans, inode);
4394 BUG_ON(ret);
4395
4396out:
4397 nr = trans->blocks_used;
4398 ret = btrfs_end_transaction_throttle(trans, root);
4399 BUG_ON(ret);
4400 btrfs_btree_balance_dirty(root, nr);
4401}
4402
4403/*
4404 * create a new subvolume directory/inode (helper for the ioctl).
4405 */
4406int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4407 struct btrfs_root *new_root, struct dentry *dentry,
4408 u64 new_dirid, u64 alloc_hint)
4409{
4410 struct inode *inode;
4411 int error;
4412 u64 index = 0;
4413
4414 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4415 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4416 if (IS_ERR(inode))
4417 return PTR_ERR(inode);
4418 inode->i_op = &btrfs_dir_inode_operations;
4419 inode->i_fop = &btrfs_dir_file_operations;
4420
4421 inode->i_nlink = 1;
4422 btrfs_i_size_write(inode, 0);
4423
4424 error = btrfs_update_inode(trans, new_root, inode);
4425 if (error)
4426 return error;
4427
4428 d_instantiate(dentry, inode);
4429 return 0;
4430}
4431
4432/* helper function for file defrag and space balancing. This
4433 * forces readahead on a given range of bytes in an inode
4434 */
4435unsigned long btrfs_force_ra(struct address_space *mapping,
4436 struct file_ra_state *ra, struct file *file,
4437 pgoff_t offset, pgoff_t last_index)
4438{
4439 pgoff_t req_size = last_index - offset + 1;
4440
4441 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4442 return offset + req_size;
4443}
4444
4445struct inode *btrfs_alloc_inode(struct super_block *sb)
4446{
4447 struct btrfs_inode *ei;
4448
4449 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4450 if (!ei)
4451 return NULL;
4452 ei->last_trans = 0;
4453 ei->logged_trans = 0;
4454 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4455 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4456 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4457 INIT_LIST_HEAD(&ei->i_orphan);
4458 return &ei->vfs_inode;
4459}
4460
4461void btrfs_destroy_inode(struct inode *inode)
4462{
4463 struct btrfs_ordered_extent *ordered;
4464 WARN_ON(!list_empty(&inode->i_dentry));
4465 WARN_ON(inode->i_data.nrpages);
4466
4467 if (BTRFS_I(inode)->i_acl &&
4468 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4469 posix_acl_release(BTRFS_I(inode)->i_acl);
4470 if (BTRFS_I(inode)->i_default_acl &&
4471 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4472 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4473
4474 spin_lock(&BTRFS_I(inode)->root->list_lock);
4475 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4476 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4477 " list\n", inode->i_ino);
4478 dump_stack();
4479 }
4480 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4481
4482 while (1) {
4483 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4484 if (!ordered)
4485 break;
4486 else {
4487 printk(KERN_ERR "btrfs found ordered "
4488 "extent %llu %llu on inode cleanup\n",
4489 (unsigned long long)ordered->file_offset,
4490 (unsigned long long)ordered->len);
4491 btrfs_remove_ordered_extent(inode, ordered);
4492 btrfs_put_ordered_extent(ordered);
4493 btrfs_put_ordered_extent(ordered);
4494 }
4495 }
4496 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4497 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4498}
4499
4500static void init_once(void *foo)
4501{
4502 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4503
4504 inode_init_once(&ei->vfs_inode);
4505}
4506
4507void btrfs_destroy_cachep(void)
4508{
4509 if (btrfs_inode_cachep)
4510 kmem_cache_destroy(btrfs_inode_cachep);
4511 if (btrfs_trans_handle_cachep)
4512 kmem_cache_destroy(btrfs_trans_handle_cachep);
4513 if (btrfs_transaction_cachep)
4514 kmem_cache_destroy(btrfs_transaction_cachep);
4515 if (btrfs_bit_radix_cachep)
4516 kmem_cache_destroy(btrfs_bit_radix_cachep);
4517 if (btrfs_path_cachep)
4518 kmem_cache_destroy(btrfs_path_cachep);
4519}
4520
4521struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4522 unsigned long extra_flags,
4523 void (*ctor)(void *))
4524{
4525 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4526 SLAB_MEM_SPREAD | extra_flags), ctor);
4527}
4528
4529int btrfs_init_cachep(void)
4530{
4531 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4532 sizeof(struct btrfs_inode),
4533 0, init_once);
4534 if (!btrfs_inode_cachep)
4535 goto fail;
4536 btrfs_trans_handle_cachep =
4537 btrfs_cache_create("btrfs_trans_handle_cache",
4538 sizeof(struct btrfs_trans_handle),
4539 0, NULL);
4540 if (!btrfs_trans_handle_cachep)
4541 goto fail;
4542 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4543 sizeof(struct btrfs_transaction),
4544 0, NULL);
4545 if (!btrfs_transaction_cachep)
4546 goto fail;
4547 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4548 sizeof(struct btrfs_path),
4549 0, NULL);
4550 if (!btrfs_path_cachep)
4551 goto fail;
4552 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4553 SLAB_DESTROY_BY_RCU, NULL);
4554 if (!btrfs_bit_radix_cachep)
4555 goto fail;
4556 return 0;
4557fail:
4558 btrfs_destroy_cachep();
4559 return -ENOMEM;
4560}
4561
4562static int btrfs_getattr(struct vfsmount *mnt,
4563 struct dentry *dentry, struct kstat *stat)
4564{
4565 struct inode *inode = dentry->d_inode;
4566 generic_fillattr(inode, stat);
4567 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4568 stat->blksize = PAGE_CACHE_SIZE;
4569 stat->blocks = (inode_get_bytes(inode) +
4570 BTRFS_I(inode)->delalloc_bytes) >> 9;
4571 return 0;
4572}
4573
4574static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4575 struct inode *new_dir, struct dentry *new_dentry)
4576{
4577 struct btrfs_trans_handle *trans;
4578 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4579 struct inode *new_inode = new_dentry->d_inode;
4580 struct inode *old_inode = old_dentry->d_inode;
4581 struct timespec ctime = CURRENT_TIME;
4582 u64 index = 0;
4583 int ret;
4584
4585 /* we're not allowed to rename between subvolumes */
4586 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4587 BTRFS_I(new_dir)->root->root_key.objectid)
4588 return -EXDEV;
4589
4590 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4591 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4592 return -ENOTEMPTY;
4593 }
4594
4595 /* to rename a snapshot or subvolume, we need to juggle the
4596 * backrefs. This isn't coded yet
4597 */
4598 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4599 return -EXDEV;
4600
4601 ret = btrfs_check_free_space(root, 1, 0);
4602 if (ret)
4603 goto out_unlock;
4604
4605 trans = btrfs_start_transaction(root, 1);
4606
4607 btrfs_set_trans_block_group(trans, new_dir);
4608
4609 btrfs_inc_nlink(old_dentry->d_inode);
4610 old_dir->i_ctime = old_dir->i_mtime = ctime;
4611 new_dir->i_ctime = new_dir->i_mtime = ctime;
4612 old_inode->i_ctime = ctime;
4613
4614 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4615 old_dentry->d_name.name,
4616 old_dentry->d_name.len);
4617 if (ret)
4618 goto out_fail;
4619
4620 if (new_inode) {
4621 new_inode->i_ctime = CURRENT_TIME;
4622 ret = btrfs_unlink_inode(trans, root, new_dir,
4623 new_dentry->d_inode,
4624 new_dentry->d_name.name,
4625 new_dentry->d_name.len);
4626 if (ret)
4627 goto out_fail;
4628 if (new_inode->i_nlink == 0) {
4629 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4630 if (ret)
4631 goto out_fail;
4632 }
4633
4634 }
4635 ret = btrfs_set_inode_index(new_dir, &index);
4636 if (ret)
4637 goto out_fail;
4638
4639 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4640 old_inode, new_dentry->d_name.name,
4641 new_dentry->d_name.len, 1, index);
4642 if (ret)
4643 goto out_fail;
4644
4645out_fail:
4646 btrfs_end_transaction_throttle(trans, root);
4647out_unlock:
4648 return ret;
4649}
4650
4651/*
4652 * some fairly slow code that needs optimization. This walks the list
4653 * of all the inodes with pending delalloc and forces them to disk.
4654 */
4655int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4656{
4657 struct list_head *head = &root->fs_info->delalloc_inodes;
4658 struct btrfs_inode *binode;
4659 struct inode *inode;
4660
4661 if (root->fs_info->sb->s_flags & MS_RDONLY)
4662 return -EROFS;
4663
4664 spin_lock(&root->fs_info->delalloc_lock);
4665 while (!list_empty(head)) {
4666 binode = list_entry(head->next, struct btrfs_inode,
4667 delalloc_inodes);
4668 inode = igrab(&binode->vfs_inode);
4669 if (!inode)
4670 list_del_init(&binode->delalloc_inodes);
4671 spin_unlock(&root->fs_info->delalloc_lock);
4672 if (inode) {
4673 filemap_flush(inode->i_mapping);
4674 iput(inode);
4675 }
4676 cond_resched();
4677 spin_lock(&root->fs_info->delalloc_lock);
4678 }
4679 spin_unlock(&root->fs_info->delalloc_lock);
4680
4681 /* the filemap_flush will queue IO into the worker threads, but
4682 * we have to make sure the IO is actually started and that
4683 * ordered extents get created before we return
4684 */
4685 atomic_inc(&root->fs_info->async_submit_draining);
4686 while (atomic_read(&root->fs_info->nr_async_submits) ||
4687 atomic_read(&root->fs_info->async_delalloc_pages)) {
4688 wait_event(root->fs_info->async_submit_wait,
4689 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4690 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4691 }
4692 atomic_dec(&root->fs_info->async_submit_draining);
4693 return 0;
4694}
4695
4696static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4697 const char *symname)
4698{
4699 struct btrfs_trans_handle *trans;
4700 struct btrfs_root *root = BTRFS_I(dir)->root;
4701 struct btrfs_path *path;
4702 struct btrfs_key key;
4703 struct inode *inode = NULL;
4704 int err;
4705 int drop_inode = 0;
4706 u64 objectid;
4707 u64 index = 0 ;
4708 int name_len;
4709 int datasize;
4710 unsigned long ptr;
4711 struct btrfs_file_extent_item *ei;
4712 struct extent_buffer *leaf;
4713 unsigned long nr = 0;
4714
4715 name_len = strlen(symname) + 1;
4716 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4717 return -ENAMETOOLONG;
4718
4719 err = btrfs_check_free_space(root, 1, 0);
4720 if (err)
4721 goto out_fail;
4722
4723 trans = btrfs_start_transaction(root, 1);
4724 btrfs_set_trans_block_group(trans, dir);
4725
4726 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4727 if (err) {
4728 err = -ENOSPC;
4729 goto out_unlock;
4730 }
4731
4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4733 dentry->d_name.len,
4734 dentry->d_parent->d_inode->i_ino, objectid,
4735 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4736 &index);
4737 err = PTR_ERR(inode);
4738 if (IS_ERR(inode))
4739 goto out_unlock;
4740
4741 err = btrfs_init_acl(inode, dir);
4742 if (err) {
4743 drop_inode = 1;
4744 goto out_unlock;
4745 }
4746
4747 btrfs_set_trans_block_group(trans, inode);
4748 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4749 if (err)
4750 drop_inode = 1;
4751 else {
4752 inode->i_mapping->a_ops = &btrfs_aops;
4753 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4754 inode->i_fop = &btrfs_file_operations;
4755 inode->i_op = &btrfs_file_inode_operations;
4756 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4757 }
4758 dir->i_sb->s_dirt = 1;
4759 btrfs_update_inode_block_group(trans, inode);
4760 btrfs_update_inode_block_group(trans, dir);
4761 if (drop_inode)
4762 goto out_unlock;
4763
4764 path = btrfs_alloc_path();
4765 BUG_ON(!path);
4766 key.objectid = inode->i_ino;
4767 key.offset = 0;
4768 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4769 datasize = btrfs_file_extent_calc_inline_size(name_len);
4770 err = btrfs_insert_empty_item(trans, root, path, &key,
4771 datasize);
4772 if (err) {
4773 drop_inode = 1;
4774 goto out_unlock;
4775 }
4776 leaf = path->nodes[0];
4777 ei = btrfs_item_ptr(leaf, path->slots[0],
4778 struct btrfs_file_extent_item);
4779 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4780 btrfs_set_file_extent_type(leaf, ei,
4781 BTRFS_FILE_EXTENT_INLINE);
4782 btrfs_set_file_extent_encryption(leaf, ei, 0);
4783 btrfs_set_file_extent_compression(leaf, ei, 0);
4784 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4785 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4786
4787 ptr = btrfs_file_extent_inline_start(ei);
4788 write_extent_buffer(leaf, symname, ptr, name_len);
4789 btrfs_mark_buffer_dirty(leaf);
4790 btrfs_free_path(path);
4791
4792 inode->i_op = &btrfs_symlink_inode_operations;
4793 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4794 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4795 inode_set_bytes(inode, name_len);
4796 btrfs_i_size_write(inode, name_len - 1);
4797 err = btrfs_update_inode(trans, root, inode);
4798 if (err)
4799 drop_inode = 1;
4800
4801out_unlock:
4802 nr = trans->blocks_used;
4803 btrfs_end_transaction_throttle(trans, root);
4804out_fail:
4805 if (drop_inode) {
4806 inode_dec_link_count(inode);
4807 iput(inode);
4808 }
4809 btrfs_btree_balance_dirty(root, nr);
4810 return err;
4811}
4812
4813static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4814 u64 alloc_hint, int mode)
4815{
4816 struct btrfs_trans_handle *trans;
4817 struct btrfs_root *root = BTRFS_I(inode)->root;
4818 struct btrfs_key ins;
4819 u64 alloc_size;
4820 u64 cur_offset = start;
4821 u64 num_bytes = end - start;
4822 int ret = 0;
4823
4824 trans = btrfs_join_transaction(root, 1);
4825 BUG_ON(!trans);
4826 btrfs_set_trans_block_group(trans, inode);
4827
4828 while (num_bytes > 0) {
4829 alloc_size = min(num_bytes, root->fs_info->max_extent);
4830 ret = btrfs_reserve_extent(trans, root, alloc_size,
4831 root->sectorsize, 0, alloc_hint,
4832 (u64)-1, &ins, 1);
4833 if (ret) {
4834 WARN_ON(1);
4835 goto out;
4836 }
4837 ret = insert_reserved_file_extent(trans, inode,
4838 cur_offset, ins.objectid,
4839 ins.offset, ins.offset,
4840 ins.offset, 0, 0, 0,
4841 BTRFS_FILE_EXTENT_PREALLOC);
4842 BUG_ON(ret);
4843 num_bytes -= ins.offset;
4844 cur_offset += ins.offset;
4845 alloc_hint = ins.objectid + ins.offset;
4846 }
4847out:
4848 if (cur_offset > start) {
4849 inode->i_ctime = CURRENT_TIME;
4850 btrfs_set_flag(inode, PREALLOC);
4851 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4852 cur_offset > i_size_read(inode))
4853 btrfs_i_size_write(inode, cur_offset);
4854 ret = btrfs_update_inode(trans, root, inode);
4855 BUG_ON(ret);
4856 }
4857
4858 btrfs_end_transaction(trans, root);
4859 return ret;
4860}
4861
4862static long btrfs_fallocate(struct inode *inode, int mode,
4863 loff_t offset, loff_t len)
4864{
4865 u64 cur_offset;
4866 u64 last_byte;
4867 u64 alloc_start;
4868 u64 alloc_end;
4869 u64 alloc_hint = 0;
4870 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4871 struct extent_map *em;
4872 int ret;
4873
4874 alloc_start = offset & ~mask;
4875 alloc_end = (offset + len + mask) & ~mask;
4876
4877 mutex_lock(&inode->i_mutex);
4878 if (alloc_start > inode->i_size) {
4879 ret = btrfs_cont_expand(inode, alloc_start);
4880 if (ret)
4881 goto out;
4882 }
4883
4884 while (1) {
4885 struct btrfs_ordered_extent *ordered;
4886 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4887 alloc_end - 1, GFP_NOFS);
4888 ordered = btrfs_lookup_first_ordered_extent(inode,
4889 alloc_end - 1);
4890 if (ordered &&
4891 ordered->file_offset + ordered->len > alloc_start &&
4892 ordered->file_offset < alloc_end) {
4893 btrfs_put_ordered_extent(ordered);
4894 unlock_extent(&BTRFS_I(inode)->io_tree,
4895 alloc_start, alloc_end - 1, GFP_NOFS);
4896 btrfs_wait_ordered_range(inode, alloc_start,
4897 alloc_end - alloc_start);
4898 } else {
4899 if (ordered)
4900 btrfs_put_ordered_extent(ordered);
4901 break;
4902 }
4903 }
4904
4905 cur_offset = alloc_start;
4906 while (1) {
4907 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4908 alloc_end - cur_offset, 0);
4909 BUG_ON(IS_ERR(em) || !em);
4910 last_byte = min(extent_map_end(em), alloc_end);
4911 last_byte = (last_byte + mask) & ~mask;
4912 if (em->block_start == EXTENT_MAP_HOLE) {
4913 ret = prealloc_file_range(inode, cur_offset,
4914 last_byte, alloc_hint, mode);
4915 if (ret < 0) {
4916 free_extent_map(em);
4917 break;
4918 }
4919 }
4920 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4921 alloc_hint = em->block_start;
4922 free_extent_map(em);
4923
4924 cur_offset = last_byte;
4925 if (cur_offset >= alloc_end) {
4926 ret = 0;
4927 break;
4928 }
4929 }
4930 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4931 GFP_NOFS);
4932out:
4933 mutex_unlock(&inode->i_mutex);
4934 return ret;
4935}
4936
/* set_page_dirty hook: btrfs pages are dirtied without buffer heads. */
static int btrfs_set_page_dirty(struct page *page)
{
	int dirtied = __set_page_dirty_nobuffers(page);

	return dirtied;
}
4941
4942static int btrfs_permission(struct inode *inode, int mask)
4943{
4944 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4945 return -EACCES;
4946 return generic_permission(inode, mask, btrfs_check_acl);
4947}
4948
4949static struct inode_operations btrfs_dir_inode_operations = {
4950 .getattr = btrfs_getattr,
4951 .lookup = btrfs_lookup,
4952 .create = btrfs_create,
4953 .unlink = btrfs_unlink,
4954 .link = btrfs_link,
4955 .mkdir = btrfs_mkdir,
4956 .rmdir = btrfs_rmdir,
4957 .rename = btrfs_rename,
4958 .symlink = btrfs_symlink,
4959 .setattr = btrfs_setattr,
4960 .mknod = btrfs_mknod,
4961 .setxattr = btrfs_setxattr,
4962 .getxattr = btrfs_getxattr,
4963 .listxattr = btrfs_listxattr,
4964 .removexattr = btrfs_removexattr,
4965 .permission = btrfs_permission,
4966};
4967static struct inode_operations btrfs_dir_ro_inode_operations = {
4968 .lookup = btrfs_lookup,
4969 .permission = btrfs_permission,
4970};
4971static struct file_operations btrfs_dir_file_operations = {
4972 .llseek = generic_file_llseek,
4973 .read = generic_read_dir,
4974 .readdir = btrfs_real_readdir,
4975 .unlocked_ioctl = btrfs_ioctl,
4976#ifdef CONFIG_COMPAT
4977 .compat_ioctl = btrfs_ioctl,
4978#endif
4979 .release = btrfs_release_file,
4980 .fsync = btrfs_sync_file,
4981};
4982
4983static struct extent_io_ops btrfs_extent_io_ops = {
4984 .fill_delalloc = run_delalloc_range,
4985 .submit_bio_hook = btrfs_submit_bio_hook,
4986 .merge_bio_hook = btrfs_merge_bio_hook,
4987 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4988 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4989 .writepage_start_hook = btrfs_writepage_start_hook,
4990 .readpage_io_failed_hook = btrfs_io_failed_hook,
4991 .set_bit_hook = btrfs_set_bit_hook,
4992 .clear_bit_hook = btrfs_clear_bit_hook,
4993};
4994
4995static struct address_space_operations btrfs_aops = {
4996 .readpage = btrfs_readpage,
4997 .writepage = btrfs_writepage,
4998 .writepages = btrfs_writepages,
4999 .readpages = btrfs_readpages,
5000 .sync_page = block_sync_page,
5001 .bmap = btrfs_bmap,
5002 .direct_IO = btrfs_direct_IO,
5003 .invalidatepage = btrfs_invalidatepage,
5004 .releasepage = btrfs_releasepage,
5005 .set_page_dirty = btrfs_set_page_dirty,
5006};
5007
5008static struct address_space_operations btrfs_symlink_aops = {
5009 .readpage = btrfs_readpage,
5010 .writepage = btrfs_writepage,
5011 .invalidatepage = btrfs_invalidatepage,
5012 .releasepage = btrfs_releasepage,
5013};
5014
5015static struct inode_operations btrfs_file_inode_operations = {
5016 .truncate = btrfs_truncate,
5017 .getattr = btrfs_getattr,
5018 .setattr = btrfs_setattr,
5019 .setxattr = btrfs_setxattr,
5020 .getxattr = btrfs_getxattr,
5021 .listxattr = btrfs_listxattr,
5022 .removexattr = btrfs_removexattr,
5023 .permission = btrfs_permission,
5024 .fallocate = btrfs_fallocate,
5025};
5026static struct inode_operations btrfs_special_inode_operations = {
5027 .getattr = btrfs_getattr,
5028 .setattr = btrfs_setattr,
5029 .permission = btrfs_permission,
5030 .setxattr = btrfs_setxattr,
5031 .getxattr = btrfs_getxattr,
5032 .listxattr = btrfs_listxattr,
5033 .removexattr = btrfs_removexattr,
5034};
5035static struct inode_operations btrfs_symlink_inode_operations = {
5036 .readlink = generic_readlink,
5037 .follow_link = page_follow_link_light,
5038 .put_link = page_put_link,
5039 .permission = btrfs_permission,
5040};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create in fs/namei.c() */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283 * this VFS fuzz.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311 * to see if is references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
/*
 * Clone file extents from the file behind @srcfd into @file.
 *
 * @file:    destination; must be opened for writing
 * @srcfd:   file descriptor of the source file
 * @off:     start offset in the source
 * @olen:    number of bytes to clone; 0 means "from @off to end of source"
 * @destoff: offset used when computing the keys of the new items in the
 *           destination
 *
 * Early failures: -EINVAL (destination not writable, source == destination,
 * range outside the source or not block aligned), -EBADF (bad @srcfd),
 * -EISDIR (either side is a directory), -EXDEV (different superblock or
 * different btrfs root), -ENOMEM.  Later failures return whatever the tree
 * operation returned, and the destination is then truncated to size 0.
 *
 * On success the destination's mtime/ctime are updated, its size is
 * extended to destoff + olen if that is larger, and its btrfs inode flags
 * are overwritten with the source's.
 */
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
/* scratch buffer used to carry one item's payload from source to destination */
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
/* take both i_mutexes in pointer-address order so two cloners cannot deadlock */
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
/* NOTE(review): off + len is computed in u64 from user-supplied values and
 * could wrap for very large inputs - confirm whether that needs a guard */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
/* NOTE(review): the ordered-extent lookup below is done on `inode` (the
 * destination) while the lock, the delalloc check and the wait all use
 * `src` - confirm which inode is intended */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
/* NOTE(review): the hole is punched at [off, off+len) of the destination,
 * but the new items below are keyed at key.offset + destoff - off - confirm
 * whether destoff was meant here */
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
/* walk the source inode's items in key order; each pass searches from the
 * last key seen, with its offset bumped by one at the bottom of the loop */
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
/* snapshot the whole item into buf before the path is released */
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
/* skip extents that lie outside the requested source range */
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
/* NOTE(review): key.offset appears twice in the sum below, which looks
 * unintended - confirm the end-of-range trimming arithmetic */
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
/* a real (non-hole) extent is now shared: account the bytes and take an
 * extra reference on the disk extent */
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
/* a compressed inline extent cannot be split (see the TODO at the top) */
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
/* on any failure past this point the destination is truncated to zero bytes */
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072
26
/*
 * Argument block for the btrfs ioctls that are declared below with
 * struct btrfs_ioctl_vol_args as their payload type.
 */
27struct btrfs_ioctl_vol_args {
/* file descriptor argument */
28 __s64 fd;
/* string argument: room for BTRFS_PATH_NAME_MAX bytes plus a terminating NUL */
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
/*
 * ioctl command numbers, all under BTRFS_IOCTL_MAGIC.  Commands built with
 * _IOW name the argument type passed from user space; commands built with
 * _IO carry no argument.  Number 5 is not assigned in this list.
 */
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
/* the argument of BTRFS_IOC_CLONE is declared as a plain int */
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
/*
 * Argument block for BTRFS_IOC_CLONE_RANGE.
 */
55struct btrfs_ioctl_clone_range_args {
/* file descriptor of the source file */
56 __s64 src_fd;
/* start offset and length of the range in the source file */
57 __u64 src_offset, src_length;
/* offset in the destination file */
58 __u64 dest_offset;
59};
60
/* clone a byte range; takes a struct btrfs_ioctl_clone_range_args */
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
/* same argument block as BTRFS_IOC_SNAP_CREATE */
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
/* blocking and non-blocking acquisition of the extent buffer mutex */
int btrfs_tree_lock(struct extent_buffer *eb);
int btrfs_try_tree_lock(struct extent_buffer *eb);

/* release, and query of the current lock state */
int btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_tree_locked(struct extent_buffer *eb);

/* does anybody wait for the locks held at 'level' or 'level + 1'? */
int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but, anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_isize if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one.
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
/*
 * bit numbers used in the flags field of struct btrfs_ordered_extent
 */

/*
 * all of the blocks are written.  This bit makes sure the metadata is
 * inserted into the tree only once per extent.
 */
#define BTRFS_ORDERED_IO_DONE 0

/*
 * the extent was removed from the rbtree; set just before any waiters
 * are woken.  It means the IO is done and any metadata is inserted into
 * the tree.
 */
#define BTRFS_ORDERED_COMPLETE 1

/* we want to write in place */
#define BTRFS_ORDERED_NOCOW 2

/* a compressed extent is being written */
#define BTRFS_ORDERED_COMPRESSED 3

/* the write goes to a prealloced extent */
#define BTRFS_ORDERED_PREALLOC 4
74
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int tyep);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 };
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
					    int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
#if 0 /* this will get used when snapshot deletion is implemented */
/*
 * delete the root ref item keyed (root_id, type, ref_id) from 'tree_root'.
 * Returns -ENOMEM if no path could be allocated; a failed search or delete
 * trips BUG_ON.
 */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
		       struct btrfs_root *tree_root,
		       u64 root_id, u8 type, u64 ref_id)
{
	struct btrfs_key key;
	int ret;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = root_id;
	key.type = type;
	key.offset = ref_id;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	BUG_ON(ret);

	ret = btrfs_del_item(trans, tree_root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return ret;
}
#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain that the functions should be static.
42 */
43
/*
 * BTRFS_SETGET_FUNCS(name, type, member, bits)
 *
 * Emits prototypes and definitions for two functions:
 *
 *   u<bits> btrfs_<name>(eb, s)          - read s->member
 *   void btrfs_set_<name>(eb, s, val)    - write s->member
 *
 * 's' is not dereferenced directly: it is cast to unsigned long and
 * used as an offset into the extent buffer 'eb'.  The member is
 * converted with le<bits>_to_cpu() on read and cpu_to_le<bits>() on
 * write.
 *
 * Both functions first try the mapping already recorded in the extent
 * buffer (eb->map_token, eb->map_start, eb->map_len, eb->kaddr).  If the
 * member does not lie completely inside it they call
 * map_extent_buffer(), and if that returns non-zero they fall back to
 * read_eb_member() / write_eb_member().  A mapping obtained here is only
 * released again when eb->map_token was NULL before the call.
 */
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
47u##bits btrfs_##name(struct extent_buffer *eb, \
48 type *s) \
49{ \
50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \
53 /* ugly, but we want the fast path here */ \
54 if (eb->map_token && offset >= eb->map_start && \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \
56 eb->map_len) { \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
58 return le##bits##_to_cpu(p->member); \
59 } \
/* slow path: the recorded mapping does not cover the member */ \
60 { \
61 int err; \
62 char *map_token; \
63 char *kaddr; \
/* sampled before map_extent_buffer() is called */ \
64 int unmap_on_exit = (eb->map_token == NULL); \
65 unsigned long map_start; \
66 unsigned long map_len; \
67 u##bits res; \
68 err = map_extent_buffer(eb, offset, \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
/* non-zero return: copy the member out instead */ \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \
84void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \
86{ \
87 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \
90 /* ugly, but we want the fast path here */ \
91 if (eb->map_token && offset >= eb->map_start && \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \
93 eb->map_len) { \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
95 p->member = cpu_to_le##bits(val); \
96 return; \
97 } \
/* slow path: same structure as in the getter */ \
98 { \
99 int err; \
100 char *map_token; \
101 char *kaddr; \
102 int unmap_on_exit = (eb->map_token == NULL); \
103 unsigned long map_start; \
104 unsigned long map_len; \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
/* non-zero return: store the converted value via write_eb_member() */ \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120}
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..b4c101d9322c
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,720 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "xattr.h"
49#include "volumes.h"
50#include "version.h"
51#include "export.h"
52#include "compression.h"
53
/*
 * Magic number for btrfs: stored in sb->s_magic by btrfs_fill_super()
 * and reported as f_type by btrfs_statfs().
 */
54#define BTRFS_SUPER_MAGIC 0x9123683E
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
/*
 * Tokens for the mount options in the tokens[] table.  The values are
 * implicit, so the order of the enumerators defines them.
 */
enum {
	Opt_degraded,
	Opt_subvol,
	Opt_device,
	Opt_nodatasum,
	Opt_nodatacow,
	Opt_max_extent,
	Opt_max_inline,
	Opt_alloc_start,
	Opt_nobarrier,
	Opt_ssd,
	Opt_thread_pool,
	Opt_noacl,
	Opt_compress,
	Opt_err,
};
72
/*
 * Mount option patterns passed to match_token() by
 * btrfs_parse_options() and btrfs_parse_early_options().  Each entry
 * pairs an Opt_* token with its pattern; patterns ending in "=%s" or
 * "=%d" take an argument.  The table is terminated by the
 * { Opt_err, NULL } entry.
 */
73static match_table_t tokens = {
74 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"},
76 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"},
85 {Opt_ssd, "ssd"},
86 {Opt_noacl, "noacl"},
87 {Opt_err, NULL},
88};
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoul(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
105 case 'm':
106 mult *= 1024;
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatacsum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options will be parsed on much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
318 printk("btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
323 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
324 tree_root->fs_info->fs_root);
325 bi = BTRFS_I(inode);
326 bi->location.objectid = inode->i_ino;
327 bi->location.offset = 0;
328 bi->root = tree_root->fs_info->fs_root;
329
330 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
331
332 if (!inode) {
333 err = -ENOMEM;
334 goto fail_close;
335 }
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
/*
 * write_super hook (installed in btrfs_super_ops): only clears
 * sb->s_dirt, no I/O is issued from here.
 */
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = 0;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
600 &btrfs_fs_type, &fs_devices);
601 break;
602 }
603out:
604 kfree(vol);
605 return ret;
606}
607
608static void btrfs_write_super_lockfs(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_lock(&root->fs_info->transaction_kthread_mutex);
612 mutex_lock(&root->fs_info->cleaner_mutex);
613}
614
615static void btrfs_unlockfs(struct super_block *sb)
616{
617 struct btrfs_root *root = btrfs_sb(sb);
618 mutex_unlock(&root->fs_info->cleaner_mutex);
619 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
620}
621
622static struct super_operations btrfs_super_ops = {
623 .delete_inode = btrfs_delete_inode,
624 .put_super = btrfs_put_super,
625 .write_super = btrfs_write_super,
626 .sync_fs = btrfs_sync_fs,
627 .show_options = generic_show_options,
628 .write_inode = btrfs_write_inode,
629 .dirty_inode = btrfs_dirty_inode,
630 .alloc_inode = btrfs_alloc_inode,
631 .destroy_inode = btrfs_destroy_inode,
632 .statfs = btrfs_statfs,
633 .remount_fs = btrfs_remount,
634 .write_super_lockfs = btrfs_write_super_lockfs,
635 .unlockfs = btrfs_unlockfs,
636};
637
638static const struct file_operations btrfs_ctl_fops = {
639 .unlocked_ioctl = btrfs_control_ioctl,
640 .compat_ioctl = btrfs_control_ioctl,
641 .owner = THIS_MODULE,
642};
643
644static struct miscdevice btrfs_misc = {
645 .minor = MISC_DYNAMIC_MINOR,
646 .name = "btrfs-control",
647 .fops = &btrfs_ctl_fops
648};
649
/*
 * Register the btrfs-control misc device.  Returns the result of
 * misc_register().
 */
650static int btrfs_interface_init(void)
651{
652 return misc_register(&btrfs_misc);
653}
654
655static void btrfs_interface_exit(void)
656{
657 if (misc_deregister(&btrfs_misc) < 0)
658 printk(KERN_INFO "misc_deregister failed for control device");
659}
660
661static int __init init_btrfs_fs(void)
662{
663 int err;
664
665 err = btrfs_init_sysfs();
666 if (err)
667 return err;
668
669 err = btrfs_init_cachep();
670 if (err)
671 goto free_sysfs;
672
673 err = extent_io_init();
674 if (err)
675 goto free_cachep;
676
677 err = extent_map_init();
678 if (err)
679 goto free_extent_io;
680
681 err = btrfs_interface_init();
682 if (err)
683 goto free_extent_map;
684
685 err = register_filesystem(&btrfs_fs_type);
686 if (err)
687 goto unregister_ioctl;
688
689 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
690 return 0;
691
692unregister_ioctl:
693 btrfs_interface_exit();
694free_extent_map:
695 extent_map_exit();
696free_extent_io:
697 extent_io_exit();
698free_cachep:
699 btrfs_destroy_cachep();
700free_sysfs:
701 btrfs_exit_sysfs();
702 return err;
703}
704
/*
 * Module exit: tears down what init_btrfs_fs() set up (caches,
 * extent_map, extent_io, control device, filesystem registration,
 * sysfs) and additionally calls btrfs_cleanup_fs_uuids() and
 * btrfs_zlib_exit().
 */
705static void __exit exit_btrfs_fs(void)
706{
707 btrfs_destroy_cachep();
708 extent_map_exit();
709 extent_io_exit();
710 btrfs_interface_exit();
711 unregister_filesystem(&btrfs_fs_type);
712 btrfs_exit_sysfs();
713 btrfs_cleanup_fs_uuids();
714 btrfs_zlib_exit();
715}
716
/* module entry and exit points, and the module license */
717module_init(init_btrfs_fs)
718module_exit(exit_btrfs_fs)
719
720MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
/*
 * A sysfs attribute of a btrfs_root: the generic attribute plus
 * show/store callbacks that take the root directly.  The generic
 * entry points btrfs_root_attr_show()/btrfs_root_attr_store() get from
 * the embedded 'attr' back to this structure with container_of().
 */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
/*
 * ROOT_ATTR(name, mode, show, store) defines a static
 * struct btrfs_root_attr named btrfs_root_attr_<name>, initialised
 * through __ATTR().
 */
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
/* neither attribute has a store callback; block_limit is still mode 0644 */
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
/*
 * NULL-terminated list of the root attributes; used as default_attrs
 * of btrfs_root_ktype.
 */
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
/*
 * A sysfs attribute of a btrfs_fs_info: the generic attribute plus
 * show/store callbacks that take the fs_info directly.  The generic
 * entry points btrfs_super_attr_show()/btrfs_super_attr_store() get
 * from the embedded 'attr' back to this structure with container_of().
 */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
/*
 * SUPER_ATTR(name, mode, show, store) defines a static
 * struct btrfs_super_attr named btrfs_super_attr_<name>, initialised
 * through __ATTR().
 */
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
/* all three super attributes are mode 0444 and have no store callback */
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
/*
 * NULL-terminated list of the super attributes; used as default_attrs
 * of btrfs_super_ktype.
 */
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
/*
 * Created by btrfs_init_sysfs() under fs_kobj, unregistered by
 * btrfs_exit_sysfs(); btrfs_sysfs_add_super() makes it the kset of each
 * super kobject.
 */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
/*
 * Drop the reference on the root's kobject and wait for the
 * kobj_unregister completion, which btrfs_root_release() signals.
 */
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
/*
 * Drop the reference on the super kobject and wait for the
 * kobj_unregister completion, which btrfs_super_release() signals.
 */
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
/* Unregister the kset created by btrfs_init_sysfs(). */
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..56ab1f5ea11b
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
/*
 * radix tree tag set on fs_roots_radix entries by
 * btrfs_record_root_in_trans() and looked up / cleared by add_dirty_roots()
 */
#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
83
/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 *
 * Only acts on roots with ref_cows set whose last_trans is older than the
 * running transaction.  For those it tags the root in fs_roots_radix,
 * snapshots the current struct btrfs_root into a freshly allocated
 * btrfs_dirty_root and hangs it off root->dirty_root.
 *
 * Dereferences fs_info->running_transaction unconditionally, so a
 * transaction must be running.  Always returns 0; allocation failures BUG.
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
	struct btrfs_dirty_root *dirty;
	u64 running_trans_id = root->fs_info->running_transaction->transid;
	if (root->ref_cows && root->last_trans < running_trans_id) {
		WARN_ON(root == root->fs_info->extent_root);
		if (root->root_item.refs != 0) {
			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);

			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
			BUG_ON(!dirty);
			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
			BUG_ON(!dirty->root);
			dirty->latest_root = root;
			INIT_LIST_HEAD(&dirty->list);

			root->commit_root = btrfs_root_node(root);

			/*
			 * dirty->root starts as a byte copy of the live root;
			 * the locks and list head copied along with it are
			 * re-initialised right below.
			 */
			memcpy(dirty->root, root, sizeof(*root));
			spin_lock_init(&dirty->root->node_lock);
			spin_lock_init(&dirty->root->list_lock);
			mutex_init(&dirty->root->objectid_mutex);
			mutex_init(&dirty->root->log_mutex);
			INIT_LIST_HEAD(&dirty->root->dead_list);
			/* the copy keeps pointing at the pre-transaction node */
			dirty->root->node = root->commit_root;
			dirty->root->commit_root = NULL;

			spin_lock(&root->list_lock);
			list_add(&dirty->root->dead_list, &root->dead_list);
			spin_unlock(&root->list_lock);

			root->dirty_root = dirty;
		} else {
			/* a ref_cows root with zero refs is not expected here */
			WARN_ON(1);
		}
		root->last_trans = running_trans_id;
	}
	return 0;
}
131
/*
 * Wait for a commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 *
 * fs_info->trans_mutex is released around schedule() and re-acquired
 * afterwards, so callers must hold it on entry.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		/* hold a reference so cur_trans stays valid while we sleep */
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
/*
 * Start a transaction handle with wait mode 1: a blocked transaction is
 * waited for unless fs_info->open_ioctl_trans is set.
 */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
/*
 * Start a transaction handle with wait mode 0: join the running
 * transaction without ever waiting for it to become unblocked.
 */
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}
200
/*
 * Start a transaction handle with wait mode 2: a blocked transaction is
 * waited for even when fs_info->open_ioctl_trans is set.
 */
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}
206
/*
 * Wait for a transaction commit to be fully complete, i.e. until
 * commit->commit_done is set.
 *
 * Takes fs_info->trans_mutex itself (dropping it around schedule()), so the
 * caller must not hold it.  The function does not take a reference on
 * @commit; the callers in this file bump use_count before calling.
 * Always returns 0.
 */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		/* re-check after queueing ourselves to avoid a lost wakeup */
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}
226
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 *
 * Does nothing unless info->throttles is non-zero.  Otherwise it sleeps on
 * info->transaction_throttle until throttles drops to zero or throttle_gen
 * changes, and repeats that wait more times the larger
 * total_ref_cache_size is: up to 2 rounds above 1MB, 10 above 5MB and
 * 20 above 10MB.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		/* generation sampled before sleeping; a change ends the wait */
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
/*
 * Release a transaction handle: drop this writer from the running
 * transaction, wake anyone sleeping on writer_wait, drop the handle's
 * reference on the transaction and free the handle.
 *
 * @throttle: when non-zero, call throttle_on_drops() after the handle has
 *            been released.
 *
 * Warns if the handle does not belong to the running transaction.
 * Always returns 0.
 */
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	/* a committer may be waiting for the writer count to drop */
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		throttle_on_drops(root);

	return 0;
}
302
/* End a transaction handle without throttling against snapshot drops. */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}
308
/* End a transaction handle, then throttle against snapshot drops. */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}
314
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 *
 * Works in two passes over the EXTENT_DIRTY ranges of @dirty_pages:
 *   1. start writeback on every cached page of the btree inode in range;
 *   2. clear the dirty range, rewrite any page that is dirty again and wait
 *      for writeback on each page.
 *
 * Returns 0, or an error reported by write_one_page().
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	/* pass 1: kick off writes; start keeps advancing past each extent */
	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			/* advance to the first byte of the next page */
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			/* page no longer belongs to a mapping: skip it */
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					/* already being written, not redirtied */
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			/*
			 * NOTE(review): no unlock_page() follows, so
			 * write_one_page() presumably unlocks the page.
			 */
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	/* pass 2: always search from 0; cleared ranges drop out of the tree */
	while (1) {
		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			/* redirtied since pass 1: write it once more */
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	/* picks up the result of the most recent write_one_page() call */
	if (err)
		werr = err;
	return werr;
}
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 *
 * Always returns 0; a failing btrfs_update_root() is a BUG.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	btrfs_extent_post_op(trans, root);
	btrfs_write_dirty_block_groups(trans, root);
	btrfs_extent_post_op(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		/* the root item already points at the current node: done */
		if (old_root_bytenr == root->node->start)
			break;
		btrfs_set_root_bytenr(&root->root_item,
				       root->node->start);
		btrfs_set_root_level(&root->root_item,
				     btrfs_header_level(root->node));
		btrfs_set_root_generation(&root->root_item, trans->transid);

		btrfs_extent_post_op(trans, root);

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);
		btrfs_write_dirty_block_groups(trans, root);
		btrfs_extent_post_op(trans, root);
	}
	return 0;
}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 *
 * @radix: radix tree of roots; entries tagged BTRFS_ROOT_TRANS_TAG are
 *         processed and the tag is cleared
 * @list:  receives the btrfs_dirty_root entries whose old root must be
 *         dropped
 *
 * Returns the value of the last btrfs_insert_root()/btrfs_update_root()
 * call that was made (0 on success).
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
				    struct radix_tree_root *radix,
				    struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_root *gang[8];
	struct btrfs_root *root;
	int i;
	int ret;
	int err = 0;
	u32 refs;

	while (1) {
		/* tags are cleared below, so always search again from 0 */
		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(radix,
				     (unsigned long)root->root_key.objectid,
				     BTRFS_ROOT_TRANS_TAG);

			BUG_ON(!root->ref_tree);
			dirty = root->dirty_root;

			btrfs_free_log(trans, root);
			btrfs_free_reloc_root(trans, root);

			/* root node unchanged in this transaction */
			if (root->commit_root == root->node) {
				WARN_ON(root->node->start !=
					btrfs_root_bytenr(&root->root_item));

				free_extent_buffer(root->commit_root);
				root->commit_root = NULL;
				root->dirty_root = NULL;

				spin_lock(&root->list_lock);
				list_del_init(&dirty->root->dead_list);
				spin_unlock(&root->list_lock);

				kfree(dirty->root);
				kfree(dirty);

				/* make sure to update the root on disk
				 * so we get any updates to the block used
				 * counts
				 */
				/*
				 * NOTE(review): err is not checked here and
				 * may be overwritten by a later iteration.
				 */
				err = btrfs_update_root(trans,
						root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
				continue;
			}

			/*
			 * The root changed: insert a new root item keyed by
			 * the current generation ...
			 */
			memset(&root->root_item.drop_progress, 0,
			       sizeof(struct btrfs_disk_key));
			root->root_item.drop_level = 0;
			root->commit_root = NULL;
			root->dirty_root = NULL;
			root->root_key.offset = root->fs_info->generation;
			btrfs_set_root_bytenr(&root->root_item,
					      root->node->start);
			btrfs_set_root_level(&root->root_item,
					     btrfs_header_level(root->node));
			btrfs_set_root_generation(&root->root_item,
						  root->root_key.offset);

			err = btrfs_insert_root(trans, root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			/*
			 * NOTE(review): this only leaves the for loop; the
			 * outer while (1) goes on with another gang lookup.
			 */
			if (err)
				break;

			/* ... and drop one reference on the old root item */
			refs = btrfs_root_refs(&dirty->root->root_item);
			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
			err = btrfs_update_root(trans, root->fs_info->tree_root,
						&dirty->root->root_key,
						&dirty->root->root_item);

			BUG_ON(err);
			if (refs == 1) {
				/* last reference gone: queue the old root */
				list_add(&dirty->list, list);
			} else {
				WARN_ON(1);
				free_extent_buffer(dirty->root->node);
				kfree(dirty->root);
				kfree(dirty);
			}
		}
	}
	return err;
}
603
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 *
 * Returns immediately if a defrag is already flagged as running on @root.
 * Each btrfs_defrag_leaves() call runs in its own transaction handle; the
 * loop repeats while that call returns -EAGAIN and the filesystem is not
 * closing.  Always returns 0; the btrfs_defrag_leaves() result is not
 * passed on to the caller.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		/* read blocks_used before ending the handle frees it */
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		/* a handle is open on loop exit; it is ended below */
		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}
636
/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 *
 * Entries are taken from the tail of @list.  For each one the snapshot is
 * dropped (restarting the transaction whenever btrfs_drop_snapshot()
 * returns -EAGAIN), the byte count of the latest root is adjusted, the old
 * root item is deleted and the entry is freed.  fs_info->throttles is
 * raised while a drop is in progress.
 *
 * Returns 0; any failure along the way is a BUG.
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
				     struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 num_bytes;
	u64 bytes_used;
	u64 max_useless;
	int ret = 0;
	int err;

	while (!list_empty(list)) {
		struct btrfs_root *root;

		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
		list_del_init(&dirty->list);

		/* bytes used before the drop; the delta is computed below */
		num_bytes = btrfs_root_used(&dirty->root->root_item);
		root = dirty->latest_root;
		atomic_inc(&root->fs_info->throttles);

		while (1) {
			trans = btrfs_start_transaction(tree_root, 1);
			mutex_lock(&root->fs_info->drop_mutex);
			ret = btrfs_drop_snapshot(trans, dirty->root);
			/* leaves with drop_mutex held and trans still open */
			if (ret != -EAGAIN)
				break;
			mutex_unlock(&root->fs_info->drop_mutex);

			err = btrfs_update_root(trans,
					tree_root,
					&dirty->root->root_key,
					&dirty->root->root_item);
			/*
			 * NOTE(review): this value of ret is overwritten by
			 * the btrfs_end_transaction() result just below.
			 */
			if (err)
				ret = err;
			nr = trans->blocks_used;
			ret = btrfs_end_transaction(trans, tree_root);
			BUG_ON(ret);

			btrfs_btree_balance_dirty(tree_root, nr);
			cond_resched();
		}
		BUG_ON(ret);
		atomic_dec(&root->fs_info->throttles);
		wake_up(&root->fs_info->transaction_throttle);

		/* num_bytes becomes the number of bytes the drop released */
		num_bytes -= btrfs_root_used(&dirty->root->root_item);
		bytes_used = btrfs_root_used(&root->root_item);
		if (num_bytes) {
			btrfs_record_root_in_trans(root);
			btrfs_set_root_used(&root->root_item,
					    bytes_used - num_bytes);
		}

		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
		if (ret) {
			BUG();
			break;
		}
		mutex_unlock(&root->fs_info->drop_mutex);

		spin_lock(&root->list_lock);
		list_del_init(&dirty->root->dead_list);
		/*
		 * max_useless is one less than the key offset of the last
		 * entry still on dead_list, or of the live root if none.
		 */
		if (!list_empty(&root->dead_list)) {
			struct btrfs_root *oldest;
			oldest = list_entry(root->dead_list.prev,
					    struct btrfs_root, dead_list);
			max_useless = oldest->root_key.offset - 1;
		} else {
			max_useless = root->root_key.offset - 1;
		}
		spin_unlock(&root->list_lock);

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
		BUG_ON(ret);

		free_extent_buffer(dirty->root->node);
		kfree(dirty->root);
		kfree(dirty);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	return ret;
}
731
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 *
 * Allocates a free objectid in the tree of tree roots, copies the root node
 * of pending->root under that objectid and inserts a root item for it.  On
 * success pending->root_key holds the key of the new root with its offset
 * set to (u64)-1.
 *
 * Returns 0 on success, -ENOMEM, or the error from
 * btrfs_find_free_objectid() / btrfs_insert_root().
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	btrfs_record_root_in_trans(root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	/* the new root item starts out as a copy of the source's item */
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	/*
	 * NOTE(review): the return values of btrfs_cow_block() and
	 * btrfs_copy_root() are not checked.
	 */
	btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_bytenr(new_root_item, tmp->start);
	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
	btrfs_set_root_generation(new_root_item, trans->transid);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	/* reached on success too; kfree(NULL) is a no-op */
	kfree(new_root_item);
	return ret;
}
789
/*
 * Make a snapshot created by create_pending_snapshot() visible in the
 * namespace: insert the directory item into the parent directory, update
 * the parent inode size, add the root back and forward references and
 * instantiate the dentry.
 *
 * Runs in its own transaction handle started on the parent directory's
 * root.  Returns the result of the directory item insert if that fails,
 * otherwise the result of adding the forward reference.
 */
static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct inode *inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_start_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	/*
	 * NOTE(review): the btrfs_set_inode_index() result is overwritten
	 * by the next assignment without being checked.
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
			    pending->name, namelen,
			    parent_inode->i_ino,
			    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	/* the directory size grows by twice the name length */
	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	/* add the backref first */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 BTRFS_ROOT_BACKREF_KEY,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

	/* now add the forward ref */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 parent_root->root_key.objectid,
				 BTRFS_ROOT_REF_KEY,
				 pending->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
	d_instantiate(pending->dentry, inode);
fail:
	/*
	 * NOTE(review): the handle was started on parent_root but is ended
	 * with fs_info->fs_root - confirm this is intended.
	 */
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
976 * in the tree of tree roots has to point to the most up to date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
998 /* add_dirty_roots gets rid of all the tree log roots, it is now
999 * safe to free the root of tree log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots we have scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 u64 block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..3a72a1b6c247
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2996 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
28/* magic values for the inode_only field in btrfs_log_inode:
29 *
30 * LOG_INODE_ALL means to log everything
31 * LOG_INODE_EXISTS means to log just enough to recreate the inode
32 * during log replay
33 */
34#define LOG_INODE_ALL 0
35#define LOG_INODE_EXISTS 1
36
37/*
38 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes
41 * we find in the log are created in the subvolume.
42 *
43 * The last stage is to deal with directories and links and extents
44 * and all the other fun semantics
45 */
46#define LOG_WALK_PIN_ONLY 0
47#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2
49
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode,
52 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid);
56
57/*
58 * tree logging is a special write ahead log used to make sure that
59 * fsyncs and O_SYNCs can happen without doing full tree commits.
60 *
61 * Full tree commits are expensive because they require commonly
62 * modified blocks to be recowed, creating many dirty pages in the
63 * extent tree and a 4x-6x higher write load than ext3.
64 *
65 * Instead of doing a tree commit on every fsync, we use the
66 * key ranges and transaction ids to find items for a given file or directory
67 * that have changed in this transaction. Those items are copied into
68 * a special tree (one per subvolume root), that tree is written to disk
69 * and then the fsync is considered complete.
70 *
71 * After a crash, items are copied out of the log-tree back into the
72 * subvolume tree. Any file data extents found are recorded in the extent
73 * allocation tree, and the log-tree freed.
74 *
75 * The log tree is read three times: once to pin down all the extents it is
76 * using in ram, once to create all the inodes logged in the tree,
77 * and once to do all the other items.
78 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161fail:
162 return ret;
163}
164
165/*
166 * start a sub transaction and setup the log tree
167 * this increments the log tree writer count to make the people
168 * syncing the tree wait for us to finish
169 */
170static int start_log_trans(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root)
172{
173 int ret;
174 mutex_lock(&root->fs_info->tree_log_mutex);
175 if (!root->fs_info->log_root_tree) {
176 ret = btrfs_init_log_root_tree(trans, root->fs_info);
177 BUG_ON(ret);
178 }
179 if (!root->log_root) {
180 ret = btrfs_add_log_tree(trans, root);
181 BUG_ON(ret);
182 }
183 atomic_inc(&root->fs_info->tree_log_writers);
184 root->fs_info->tree_log_batch++;
185 mutex_unlock(&root->fs_info->tree_log_mutex);
186 return 0;
187}
188
189/*
190 * returns 0 if there was a log transaction running and we were able
191 * to join, or returns -ENOENT if there were not transactions
192 * in progress
193 */
194static int join_running_log_trans(struct btrfs_root *root)
195{
196 int ret = -ENOENT;
197
198 smp_mb();
199 if (!root->log_root)
200 return -ENOENT;
201
202 mutex_lock(&root->fs_info->tree_log_mutex);
203 if (root->log_root) {
204 ret = 0;
205 atomic_inc(&root->fs_info->tree_log_writers);
206 root->fs_info->tree_log_batch++;
207 }
208 mutex_unlock(&root->fs_info->tree_log_mutex);
209 return ret;
210}
211
212/*
213 * indicate we're done making changes to the log tree
214 * and wake up anyone waiting to do a sync
215 */
216static int end_log_trans(struct btrfs_root *root)
217{
218 atomic_dec(&root->fs_info->tree_log_writers);
219 smp_mb();
220 if (waitqueue_active(&root->fs_info->tree_log_wait))
221 wake_up(&root->fs_info->tree_log_wait);
222 return 0;
223}
224
225
226/*
227 * the walk control struct is used to pass state down the chain when
228 * processing the log tree. The stage field tells us which part
229 * of the log tree processing we are currently doing. The others
230 * are state fields used for that specific part
231 */
232struct walk_control {
233 /* should we free the extent on disk when done? This is used
234 * at transaction commit time while freeing a log tree
235 */
236 int free;
237
238 /* should we write out the extent buffer? This is used
239 * while flushing the log tree to disk during a sync
240 */
241 int write;
242
243 /* should we wait for the extent buffer io to finish? Also used
244 * while flushing the log tree to disk for a sync
245 */
246 int wait;
247
248 /* pin only walk, we record which extents on disk belong to the
249 * log trees
250 */
251 int pin;
252
253 /* what stage of the replay code we're currently in */
254 int stage;
255
256 /* the root we are currently replaying */
257 struct btrfs_root *replay_dest;
258
259 /* the trans handle for the current replay */
260 struct btrfs_trans_handle *trans;
261
262 /* the function that gets used to process blocks we find in the
263 * tree. Note the extent_buffer might not be up to date when it is
264 * passed in, and it must be checked or read if you need the data
265 * inside it
266 */
267 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
268 struct walk_control *wc, u64 gen);
269};
270
271/*
272 * process_func used to pin down extents, write them or wait on them
273 */
274static int process_one_buffer(struct btrfs_root *log,
275 struct extent_buffer *eb,
276 struct walk_control *wc, u64 gen)
277{
278 if (wc->pin) {
279 mutex_lock(&log->fs_info->pinned_mutex);
280 btrfs_update_pinned_extents(log->fs_info->extent_root,
281 eb->start, eb->len, 1);
282 mutex_unlock(&log->fs_info->pinned_mutex);
283 }
284
285 if (btrfs_buffer_uptodate(eb, gen)) {
286 if (wc->write)
287 btrfs_write_tree_block(eb);
288 if (wc->wait)
289 btrfs_wait_tree_block_writeback(eb);
290 }
291 return 0;
292}
293
294/*
295 * Item overwrite used by replay and tree logging. eb, slot and key all refer
296 * to the src data we are copying out.
297 *
298 * root is the tree we are copying into, and path is a scratch
299 * path for use in this function (it should be released on entry and
300 * will be released on exit).
301 *
302 * If the key is already in the destination tree the existing item is
303 * overwritten. If the existing item isn't big enough, it is extended.
304 * If it is too large, it is truncated.
305 *
306 * If the key isn't in the destination yet, a new item is inserted.
307 */
308static noinline int overwrite_item(struct btrfs_trans_handle *trans,
309 struct btrfs_root *root,
310 struct btrfs_path *path,
311 struct extent_buffer *eb, int slot,
312 struct btrfs_key *key)
313{
314 int ret;
315 u32 item_size;
316 u64 saved_i_size = 0;
317 int save_old_i_size = 0;
318 unsigned long src_ptr;
319 unsigned long dst_ptr;
320 int overwrite_root = 0;
321
322 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
323 overwrite_root = 1;
324
325 item_size = btrfs_item_size_nr(eb, slot);
326 src_ptr = btrfs_item_ptr_offset(eb, slot);
327
328 /* look for the key in the destination tree */
329 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
330 if (ret == 0) {
331 char *src_copy;
332 char *dst_copy;
333 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
334 path->slots[0]);
335 if (dst_size != item_size)
336 goto insert;
337
338 if (item_size == 0) {
339 btrfs_release_path(root, path);
340 return 0;
341 }
342 dst_copy = kmalloc(item_size, GFP_NOFS);
343 src_copy = kmalloc(item_size, GFP_NOFS);
344
345 read_extent_buffer(eb, src_copy, src_ptr, item_size);
346
347 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
348 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
349 item_size);
350 ret = memcmp(dst_copy, src_copy, item_size);
351
352 kfree(dst_copy);
353 kfree(src_copy);
354 /*
355 * they have the same contents, just return, this saves
356 * us from cowing blocks in the destination tree and doing
357 * extra writes that may not have been done by a previous
358 * sync
359 */
360 if (ret == 0) {
361 btrfs_release_path(root, path);
362 return 0;
363 }
364
365 }
366insert:
367 btrfs_release_path(root, path);
368 /* try to insert the key into the destination tree */
369 ret = btrfs_insert_empty_item(trans, root, path,
370 key, item_size);
371
372 /* make sure any existing item is the correct size */
373 if (ret == -EEXIST) {
374 u32 found_size;
375 found_size = btrfs_item_size_nr(path->nodes[0],
376 path->slots[0]);
377 if (found_size > item_size) {
378 btrfs_truncate_item(trans, root, path, item_size, 1);
379 } else if (found_size < item_size) {
380 ret = btrfs_extend_item(trans, root, path,
381 item_size - found_size);
382 BUG_ON(ret);
383 }
384 } else if (ret) {
385 BUG();
386 }
387 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
388 path->slots[0]);
389
390 /* don't overwrite an existing inode if the generation number
391 * was logged as zero. This is done when the tree logging code
392 * is just logging an inode to make sure it exists after recovery.
393 *
394 * Also, don't overwrite i_size on directories during replay.
395 * log replay inserts and removes directory items based on the
396 * state of the tree found in the subvolume, and i_size is modified
397 * as it goes
398 */
399 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
400 struct btrfs_inode_item *src_item;
401 struct btrfs_inode_item *dst_item;
402
403 src_item = (struct btrfs_inode_item *)src_ptr;
404 dst_item = (struct btrfs_inode_item *)dst_ptr;
405
406 if (btrfs_inode_generation(eb, src_item) == 0)
407 goto no_copy;
408
409 if (overwrite_root &&
410 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
411 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
412 save_old_i_size = 1;
413 saved_i_size = btrfs_inode_size(path->nodes[0],
414 dst_item);
415 }
416 }
417
418 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
419 src_ptr, item_size);
420
421 if (save_old_i_size) {
422 struct btrfs_inode_item *dst_item;
423 dst_item = (struct btrfs_inode_item *)dst_ptr;
424 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
425 }
426
427 /* make sure the generation is filled in */
428 if (key->type == BTRFS_INODE_ITEM_KEY) {
429 struct btrfs_inode_item *dst_item;
430 dst_item = (struct btrfs_inode_item *)dst_ptr;
431 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
432 btrfs_set_inode_generation(path->nodes[0], dst_item,
433 trans->transid);
434 }
435 }
436
437 if (overwrite_root &&
438 key->type == BTRFS_EXTENT_DATA_KEY) {
439 int extent_type;
440 struct btrfs_file_extent_item *fi;
441
442 fi = (struct btrfs_file_extent_item *)dst_ptr;
443 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
444 if (extent_type == BTRFS_FILE_EXTENT_REG ||
445 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
446 struct btrfs_key ins;
447 ins.objectid = btrfs_file_extent_disk_bytenr(
448 path->nodes[0], fi);
449 ins.offset = btrfs_file_extent_disk_num_bytes(
450 path->nodes[0], fi);
451 ins.type = BTRFS_EXTENT_ITEM_KEY;
452
453 /*
454 * is this extent already allocated in the extent
455 * allocation tree? If so, just add a reference
456 */
457 ret = btrfs_lookup_extent(root, ins.objectid,
458 ins.offset);
459 if (ret == 0) {
460 ret = btrfs_inc_extent_ref(trans, root,
461 ins.objectid, ins.offset,
462 path->nodes[0]->start,
463 root->root_key.objectid,
464 trans->transid, key->objectid);
465 } else {
466 /*
467 * insert the extent pointer in the extent
468 * allocation tree
469 */
470 ret = btrfs_alloc_logged_extent(trans, root,
471 path->nodes[0]->start,
472 root->root_key.objectid,
473 trans->transid, key->objectid,
474 &ins);
475 BUG_ON(ret);
476 }
477 }
478 }
479no_copy:
480 btrfs_mark_buffer_dirty(path->nodes[0]);
481 btrfs_release_path(root, path);
482 return 0;
483}
484
485/*
486 * simple helper to read an inode off the disk from a given root
487 * This can only be called for subvolume roots and not for the log
488 */
489static noinline struct inode *read_one_inode(struct btrfs_root *root,
490 u64 objectid)
491{
492 struct inode *inode;
493 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
494 if (inode->i_state & I_NEW) {
495 BTRFS_I(inode)->root = root;
496 BTRFS_I(inode)->location.objectid = objectid;
497 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
498 BTRFS_I(inode)->location.offset = 0;
499 btrfs_read_locked_inode(inode);
500 unlock_new_inode(inode);
501
502 }
503 if (is_bad_inode(inode)) {
504 iput(inode);
505 inode = NULL;
506 }
507 return inode;
508}
509
510/* replays a single extent in 'eb' at 'slot' with 'key' into the
511 * subvolume 'root'. path is released on entry and should be released
512 * on exit.
513 *
514 * extents in the log tree have not been allocated out of the extent
515 * tree yet. So, this completes the allocation, taking a reference
516 * as required if the extent already exists or creating a new extent
517 * if it isn't in the extent allocation tree yet.
518 *
519 * The extent is inserted into the file, dropping any existing extents
520 * from the file that overlap the new one.
521 */
522static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
523 struct btrfs_root *root,
524 struct btrfs_path *path,
525 struct extent_buffer *eb, int slot,
526 struct btrfs_key *key)
527{
528 int found_type;
529 u64 mask = root->sectorsize - 1;
530 u64 extent_end;
531 u64 alloc_hint;
532 u64 start = key->offset;
533 struct btrfs_file_extent_item *item;
534 struct inode *inode = NULL;
535 unsigned long size;
536 int ret = 0;
537
538 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
539 found_type = btrfs_file_extent_type(eb, item);
540
541 if (found_type == BTRFS_FILE_EXTENT_REG ||
542 found_type == BTRFS_FILE_EXTENT_PREALLOC)
543 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
544 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
545 size = btrfs_file_extent_inline_len(eb, item);
546 extent_end = (start + size + mask) & ~mask;
547 } else {
548 ret = 0;
549 goto out;
550 }
551
552 inode = read_one_inode(root, key->objectid);
553 if (!inode) {
554 ret = -EIO;
555 goto out;
556 }
557
558 /*
559 * first check to see if we already have this extent in the
560 * file. This must be done before the btrfs_drop_extents run
561 * so we don't try to drop this extent.
562 */
563 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
564 start, 0);
565
566 if (ret == 0 &&
567 (found_type == BTRFS_FILE_EXTENT_REG ||
568 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
569 struct btrfs_file_extent_item cmp1;
570 struct btrfs_file_extent_item cmp2;
571 struct btrfs_file_extent_item *existing;
572 struct extent_buffer *leaf;
573
574 leaf = path->nodes[0];
575 existing = btrfs_item_ptr(leaf, path->slots[0],
576 struct btrfs_file_extent_item);
577
578 read_extent_buffer(eb, &cmp1, (unsigned long)item,
579 sizeof(cmp1));
580 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
581 sizeof(cmp2));
582
583 /*
584 * we already have a pointer to this exact extent,
585 * we don't have to do anything
586 */
587 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
588 btrfs_release_path(root, path);
589 goto out;
590 }
591 }
592 btrfs_release_path(root, path);
593
594 /* drop any overlapping extents */
595 ret = btrfs_drop_extents(trans, root, inode,
596 start, extent_end, start, &alloc_hint);
597 BUG_ON(ret);
598
599 /* insert the extent */
600 ret = overwrite_item(trans, root, path, eb, slot, key);
601 BUG_ON(ret);
602
603 /* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
604 inode_add_bytes(inode, extent_end - start);
605 btrfs_update_inode(trans, root, inode);
606out:
607 if (inode)
608 iput(inode);
609 return ret;
610}
611
612/*
613 * when cleaning up conflicts between the directory names in the
614 * subvolume, directory names in the log and directory names in the
615 * inode back references, we may have to unlink inodes from directories.
616 *
617 * This is a helper function to do the unlink of a specific directory
618 * item
619 */
620static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
621 struct btrfs_root *root,
622 struct btrfs_path *path,
623 struct inode *dir,
624 struct btrfs_dir_item *di)
625{
626 struct inode *inode;
627 char *name;
628 int name_len;
629 struct extent_buffer *leaf;
630 struct btrfs_key location;
631 int ret;
632
633 leaf = path->nodes[0];
634
635 btrfs_dir_item_key_to_cpu(leaf, di, &location);
636 name_len = btrfs_dir_name_len(leaf, di);
637 name = kmalloc(name_len, GFP_NOFS);
638 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
639 btrfs_release_path(root, path);
640
641 inode = read_one_inode(root, location.objectid);
642 BUG_ON(!inode);
643
644 ret = link_to_fixup_dir(trans, root, path, location.objectid);
645 BUG_ON(ret);
646 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
647 BUG_ON(ret);
648 kfree(name);
649
650 iput(inode);
651 return ret;
652}
653
654/*
655 * helper function to see if a given name and sequence number found
656 * in an inode back reference are already in a directory and correctly
657 * point to this inode
658 */
659static noinline int inode_in_dir(struct btrfs_root *root,
660 struct btrfs_path *path,
661 u64 dirid, u64 objectid, u64 index,
662 const char *name, int name_len)
663{
664 struct btrfs_dir_item *di;
665 struct btrfs_key location;
666 int match = 0;
667
668 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
669 index, name, name_len, 0);
670 if (di && !IS_ERR(di)) {
671 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
672 if (location.objectid != objectid)
673 goto out;
674 } else
675 goto out;
676 btrfs_release_path(root, path);
677
678 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
679 if (di && !IS_ERR(di)) {
680 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
681 if (location.objectid != objectid)
682 goto out;
683 } else
684 goto out;
685 match = 1;
686out:
687 btrfs_release_path(root, path);
688 return match;
689}
690
691/*
692 * helper function to check a log tree for a named back reference in
693 * an inode. This is used to decide if a back reference that is
694 * found in the subvolume conflicts with what we find in the log.
695 *
696 * inode backreferences may have multiple refs in a single item,
697 * during replay we process one reference at a time, and we don't
698 * want to delete valid links to a file from the subvolume if that
699 * link is also in the log.
700 */
701static noinline int backref_in_log(struct btrfs_root *log,
702 struct btrfs_key *key,
703 char *name, int namelen)
704{
705 struct btrfs_path *path;
706 struct btrfs_inode_ref *ref;
707 unsigned long ptr;
708 unsigned long ptr_end;
709 unsigned long name_ptr;
710 int found_name_len;
711 int item_size;
712 int ret;
713 int match = 0;
714
715 path = btrfs_alloc_path();
716 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
717 if (ret != 0)
718 goto out;
719
720 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
721 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
722 ptr_end = ptr + item_size;
723 while (ptr < ptr_end) {
724 ref = (struct btrfs_inode_ref *)ptr;
725 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
726 if (found_name_len == namelen) {
727 name_ptr = (unsigned long)(ref + 1);
728 ret = memcmp_extent_buffer(path->nodes[0], name,
729 name_ptr, namelen);
730 if (ret == 0) {
731 match = 1;
732 goto out;
733 }
734 }
735 ptr = (unsigned long)(ref + 1) + found_name_len;
736 }
737out:
738 btrfs_free_path(path);
739 return match;
740}
741
742
743/*
744 * replay one inode back reference item found in the log tree.
745 * eb, slot and key refer to the buffer and key found in the log tree.
746 * root is the destination we are replaying into, and path is for temp
747 * use by this function. (it should be released on return).
748 */
749static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
750 struct btrfs_root *root,
751 struct btrfs_root *log,
752 struct btrfs_path *path,
753 struct extent_buffer *eb, int slot,
754 struct btrfs_key *key)
755{
756 struct inode *dir;
757 int ret;
758 struct btrfs_key location;
759 struct btrfs_inode_ref *ref;
760 struct btrfs_dir_item *di;
761 struct inode *inode;
762 char *name;
763 int namelen;
764 unsigned long ref_ptr;
765 unsigned long ref_end;
766
767 location.objectid = key->objectid;
768 location.type = BTRFS_INODE_ITEM_KEY;
769 location.offset = 0;
770
771 /*
772 * it is possible that we didn't log all the parent directories
773 * for a given inode. If we don't find the dir, just don't
774 * copy the back ref in. The link count fixup code will take
775 * care of the rest
776 */
777 dir = read_one_inode(root, key->offset);
778 if (!dir)
779 return -ENOENT;
780
781 inode = read_one_inode(root, key->objectid);
782 BUG_ON(!dir);
783
784 ref_ptr = btrfs_item_ptr_offset(eb, slot);
785 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
786
787again:
788 ref = (struct btrfs_inode_ref *)ref_ptr;
789
790 namelen = btrfs_inode_ref_name_len(eb, ref);
791 name = kmalloc(namelen, GFP_NOFS);
792 BUG_ON(!name);
793
794 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
795
796 /* if we already have a perfect match, we're done */
797 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
798 btrfs_inode_ref_index(eb, ref),
799 name, namelen)) {
800 goto out;
801 }
802
803 /*
804 * look for a conflicting back reference in the metadata.
805 * if we find one we have to unlink that name of the file
806 * before we add our new link. Later on, we overwrite any
807 * existing back reference, and we don't want to create
808 * dangling pointers in the directory.
809 */
810conflict_again:
811 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
812 if (ret == 0) {
813 char *victim_name;
814 int victim_name_len;
815 struct btrfs_inode_ref *victim_ref;
816 unsigned long ptr;
817 unsigned long ptr_end;
818 struct extent_buffer *leaf = path->nodes[0];
819
820 /* are we trying to overwrite a back ref for the root directory
821 * if so, just jump out, we're done
822 */
823 if (key->objectid == key->offset)
824 goto out_nowrite;
825
826 /* check all the names in this back reference to see
827 * if they are in the log. if so, we allow them to stay
828 * otherwise they must be unlinked as a conflict
829 */
830 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
831 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
832 while (ptr < ptr_end) {
833 victim_ref = (struct btrfs_inode_ref *)ptr;
834 victim_name_len = btrfs_inode_ref_name_len(leaf,
835 victim_ref);
836 victim_name = kmalloc(victim_name_len, GFP_NOFS);
837 BUG_ON(!victim_name);
838
839 read_extent_buffer(leaf, victim_name,
840 (unsigned long)(victim_ref + 1),
841 victim_name_len);
842
843 if (!backref_in_log(log, key, victim_name,
844 victim_name_len)) {
845 btrfs_inc_nlink(inode);
846 btrfs_release_path(root, path);
847 ret = btrfs_unlink_inode(trans, root, dir,
848 inode, victim_name,
849 victim_name_len);
850 kfree(victim_name);
851 btrfs_release_path(root, path);
852 goto conflict_again;
853 }
854 kfree(victim_name);
855 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
856 }
857 BUG_ON(ret);
858 }
859 btrfs_release_path(root, path);
860
861 /* look for a conflicting sequence number */
862 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
863 btrfs_inode_ref_index(eb, ref),
864 name, namelen, 0);
865 if (di && !IS_ERR(di)) {
866 ret = drop_one_dir_item(trans, root, path, dir, di);
867 BUG_ON(ret);
868 }
869 btrfs_release_path(root, path);
870
871
872 /* look for a conflicting name */
873 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
874 name, namelen, 0);
875 if (di && !IS_ERR(di)) {
876 ret = drop_one_dir_item(trans, root, path, dir, di);
877 BUG_ON(ret);
878 }
879 btrfs_release_path(root, path);
880
881 /* insert our name */
882 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
883 btrfs_inode_ref_index(eb, ref));
884 BUG_ON(ret);
885
886 btrfs_update_inode(trans, root, inode);
887
888out:
889 ref_ptr = (unsigned long)(ref + 1) + namelen;
890 kfree(name);
891 if (ref_ptr < ref_end)
892 goto again;
893
894 /* finally write the back reference in the inode */
895 ret = overwrite_item(trans, root, path, eb, slot, key);
896 BUG_ON(ret);
897
898out_nowrite:
899 btrfs_release_path(root, path);
900 iput(dir);
901 iput(inode);
902 return 0;
903}
904
905/*
906 * replay one csum item from the log tree into the subvolume 'root'
907 * eb, slot and key all refer to the log tree
908 * path is for temp use by this function and should be released on return
909 *
910 * This copies the checksums out of the log tree and inserts them into
911 * the subvolume. Any existing checksums for this range in the file
912 * are overwritten, and new items are added where required.
913 *
914 * We keep this simple by reusing the btrfs_ordered_sum code from
915 * the data=ordered mode. This basically means making a copy
916 * of all the checksums in ram, which we have to do anyway for kmap
917 * rules.
918 *
919 * The copy is then sent down to btrfs_csum_file_blocks, which
920 * does all the hard work of finding existing items in the file
921 * or adding new ones.
922 */
923static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
924 struct btrfs_root *root,
925 struct btrfs_path *path,
926 struct extent_buffer *eb, int slot,
927 struct btrfs_key *key)
928{
929 int ret;
930 u32 item_size = btrfs_item_size_nr(eb, slot);
931 u64 cur_offset;
932 u16 csum_size =
933 btrfs_super_csum_size(&root->fs_info->super_copy);
934 unsigned long file_bytes;
935 struct btrfs_ordered_sum *sums;
936 struct btrfs_sector_sum *sector_sum;
937 unsigned long ptr;
938
939 file_bytes = (item_size / csum_size) * root->sectorsize;
940 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
941 if (!sums)
942 return -ENOMEM;
943
944 INIT_LIST_HEAD(&sums->list);
945 sums->len = file_bytes;
946 sums->bytenr = key->offset;
947
948 /*
949 * copy all the sums into the ordered sum struct
950 */
951 sector_sum = sums->sums;
952 cur_offset = key->offset;
953 ptr = btrfs_item_ptr_offset(eb, slot);
954 while (item_size > 0) {
955 sector_sum->bytenr = cur_offset;
956 read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
957 sector_sum++;
958 item_size -= csum_size;
959 ptr += csum_size;
960 cur_offset += root->sectorsize;
961 }
962
963 /* let btrfs_csum_file_blocks add them into the file */
964 ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
965 BUG_ON(ret);
966 kfree(sums);
967 return 0;
968}
969/*
970 * There are a few corners where the link count of the file can't
971 * be properly maintained during replay. So, instead of adding
972 * lots of complexity to the log code, we just scan the backrefs
973 * for any file that has been through replay.
974 *
975 * The scan will update the link count on the inode to reflect the
976 * number of back refs found. If it goes down to zero, the iput
977 * will free the inode.
978 */
979static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
980 struct btrfs_root *root,
981 struct inode *inode)
982{
983 struct btrfs_path *path;
984 int ret;
985 struct btrfs_key key;
986 u64 nlink = 0;
987 unsigned long ptr;
988 unsigned long ptr_end;
989 int name_len;
990
991 key.objectid = inode->i_ino;
992 key.type = BTRFS_INODE_REF_KEY;
993 key.offset = (u64)-1;
994
995 path = btrfs_alloc_path();
996
997 while (1) {
998 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
999 if (ret < 0)
1000 break;
1001 if (ret > 0) {
1002 if (path->slots[0] == 0)
1003 break;
1004 path->slots[0]--;
1005 }
1006 btrfs_item_key_to_cpu(path->nodes[0], &key,
1007 path->slots[0]);
1008 if (key.objectid != inode->i_ino ||
1009 key.type != BTRFS_INODE_REF_KEY)
1010 break;
1011 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1012 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1013 path->slots[0]);
1014 while (ptr < ptr_end) {
1015 struct btrfs_inode_ref *ref;
1016
1017 ref = (struct btrfs_inode_ref *)ptr;
1018 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1019 ref);
1020 ptr = (unsigned long)(ref + 1) + name_len;
1021 nlink++;
1022 }
1023
1024 if (key.offset == 0)
1025 break;
1026 key.offset--;
1027 btrfs_release_path(root, path);
1028 }
1029 btrfs_free_path(path);
1030 if (nlink != inode->i_nlink) {
1031 inode->i_nlink = nlink;
1032 btrfs_update_inode(trans, root, inode);
1033 }
1034 BTRFS_I(inode)->index_cnt = (u64)-1;
1035
1036 return 0;
1037}
1038
1039static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1040 struct btrfs_root *root,
1041 struct btrfs_path *path)
1042{
1043 int ret;
1044 struct btrfs_key key;
1045 struct inode *inode;
1046
1047 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1048 key.type = BTRFS_ORPHAN_ITEM_KEY;
1049 key.offset = (u64)-1;
1050 while (1) {
1051 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1052 if (ret < 0)
1053 break;
1054
1055 if (ret == 1) {
1056 if (path->slots[0] == 0)
1057 break;
1058 path->slots[0]--;
1059 }
1060
1061 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1062 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1063 key.type != BTRFS_ORPHAN_ITEM_KEY)
1064 break;
1065
1066 ret = btrfs_del_item(trans, root, path);
1067 BUG_ON(ret);
1068
1069 btrfs_release_path(root, path);
1070 inode = read_one_inode(root, key.offset);
1071 BUG_ON(!inode);
1072
1073 ret = fixup_inode_link_count(trans, root, inode);
1074 BUG_ON(ret);
1075
1076 iput(inode);
1077
1078 if (key.offset == 0)
1079 break;
1080 key.offset--;
1081 }
1082 btrfs_release_path(root, path);
1083 return 0;
1084}
1085
1086
1087/*
1088 * record a given inode in the fixup dir so we can check its link
1089 * count when replay is done. The link count is incremented here
1090 * so the inode won't go away until we check it
1091 */
1092static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1093 struct btrfs_root *root,
1094 struct btrfs_path *path,
1095 u64 objectid)
1096{
1097 struct btrfs_key key;
1098 int ret = 0;
1099 struct inode *inode;
1100
1101 inode = read_one_inode(root, objectid);
1102 BUG_ON(!inode);
1103
1104 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1105 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1106 key.offset = objectid;
1107
1108 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1109
1110 btrfs_release_path(root, path);
1111 if (ret == 0) {
1112 btrfs_inc_nlink(inode);
1113 btrfs_update_inode(trans, root, inode);
1114 } else if (ret == -EEXIST) {
1115 ret = 0;
1116 } else {
1117 BUG();
1118 }
1119 iput(inode);
1120
1121 return ret;
1122}
1123
1124/*
1125 * when replaying the log for a directory, we only insert names
1126 * for inodes that actually exist. This means an fsync on a directory
1127 * does not implicitly fsync all the new files in it
1128 */
1129static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1130 struct btrfs_root *root,
1131 struct btrfs_path *path,
1132 u64 dirid, u64 index,
1133 char *name, int name_len, u8 type,
1134 struct btrfs_key *location)
1135{
1136 struct inode *inode;
1137 struct inode *dir;
1138 int ret;
1139
1140 inode = read_one_inode(root, location->objectid);
1141 if (!inode)
1142 return -ENOENT;
1143
1144 dir = read_one_inode(root, dirid);
1145 if (!dir) {
1146 iput(inode);
1147 return -EIO;
1148 }
1149 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1150
1151 /* FIXME, put inode into FIXUP list */
1152
1153 iput(inode);
1154 iput(dir);
1155 return ret;
1156}
1157
1158/*
1159 * take a single entry in a log directory item and replay it into
1160 * the subvolume.
1161 *
1162 * if a conflicting item exists in the subdirectory already,
1163 * the inode it points to is unlinked and put into the link count
1164 * fix up tree.
1165 *
1166 * If a name from the log points to a file or directory that does
1167 * not exist in the FS, it is skipped. fsyncs on directories
1168 * do not force down inodes inside that directory, just changes to the
1169 * names or unlinks in a directory.
1170 */
1171static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1172 struct btrfs_root *root,
1173 struct btrfs_path *path,
1174 struct extent_buffer *eb,
1175 struct btrfs_dir_item *di,
1176 struct btrfs_key *key)
1177{
1178 char *name;
1179 int name_len;
1180 struct btrfs_dir_item *dst_di;
1181 struct btrfs_key found_key;
1182 struct btrfs_key log_key;
1183 struct inode *dir;
1184 u8 log_type;
1185 int exists;
1186 int ret;
1187
1188 dir = read_one_inode(root, key->objectid);
1189 BUG_ON(!dir);
1190
1191 name_len = btrfs_dir_name_len(eb, di);
1192 name = kmalloc(name_len, GFP_NOFS);
1193 log_type = btrfs_dir_type(eb, di);
1194 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1195 name_len);
1196
1197 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1198 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1199 if (exists == 0)
1200 exists = 1;
1201 else
1202 exists = 0;
1203 btrfs_release_path(root, path);
1204
1205 if (key->type == BTRFS_DIR_ITEM_KEY) {
1206 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1207 name, name_len, 1);
1208 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1209 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1210 key->objectid,
1211 key->offset, name,
1212 name_len, 1);
1213 } else {
1214 BUG();
1215 }
1216 if (!dst_di || IS_ERR(dst_di)) {
1217 /* we need a sequence number to insert, so we only
1218 * do inserts for the BTRFS_DIR_INDEX_KEY types
1219 */
1220 if (key->type != BTRFS_DIR_INDEX_KEY)
1221 goto out;
1222 goto insert;
1223 }
1224
1225 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1226 /* the existing item matches the logged item */
1227 if (found_key.objectid == log_key.objectid &&
1228 found_key.type == log_key.type &&
1229 found_key.offset == log_key.offset &&
1230 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1231 goto out;
1232 }
1233
1234 /*
1235 * don't drop the conflicting directory entry if the inode
1236 * for the new entry doesn't exist
1237 */
1238 if (!exists)
1239 goto out;
1240
1241 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1242 BUG_ON(ret);
1243
1244 if (key->type == BTRFS_DIR_INDEX_KEY)
1245 goto insert;
1246out:
1247 btrfs_release_path(root, path);
1248 kfree(name);
1249 iput(dir);
1250 return 0;
1251
1252insert:
1253 btrfs_release_path(root, path);
1254 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1255 name, name_len, log_type, &log_key);
1256
1257 if (ret && ret != -ENOENT)
1258 BUG();
1259 goto out;
1260}
1261
1262/*
1263 * find all the names in a directory item and reconcile them into
1264 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1265 * one name in a directory item, but the same code gets used for
1266 * both directory index types
1267 */
1268static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1269 struct btrfs_root *root,
1270 struct btrfs_path *path,
1271 struct extent_buffer *eb, int slot,
1272 struct btrfs_key *key)
1273{
1274 int ret;
1275 u32 item_size = btrfs_item_size_nr(eb, slot);
1276 struct btrfs_dir_item *di;
1277 int name_len;
1278 unsigned long ptr;
1279 unsigned long ptr_end;
1280
1281 ptr = btrfs_item_ptr_offset(eb, slot);
1282 ptr_end = ptr + item_size;
1283 while (ptr < ptr_end) {
1284 di = (struct btrfs_dir_item *)ptr;
1285 name_len = btrfs_dir_name_len(eb, di);
1286 ret = replay_one_name(trans, root, path, eb, di, key);
1287 BUG_ON(ret);
1288 ptr = (unsigned long)(di + 1);
1289 ptr += name_len;
1290 }
1291 return 0;
1292}
1293
1294/*
1295 * directory replay has two parts. There are the standard directory
1296 * items in the log copied from the subvolume, and range items
1297 * created in the log while the subvolume was logged.
1298 *
1299 * The range items tell us which parts of the key space the log
1300 * is authoritative for. During replay, if a key in the subvolume
1301 * directory is in a logged range item, but not actually in the log
1302 * that means it was deleted from the directory before the fsync
1303 * and should be removed.
1304 */
1305static noinline int find_dir_range(struct btrfs_root *root,
1306 struct btrfs_path *path,
1307 u64 dirid, int key_type,
1308 u64 *start_ret, u64 *end_ret)
1309{
1310 struct btrfs_key key;
1311 u64 found_end;
1312 struct btrfs_dir_log_item *item;
1313 int ret;
1314 int nritems;
1315
1316 if (*start_ret == (u64)-1)
1317 return 1;
1318
1319 key.objectid = dirid;
1320 key.type = key_type;
1321 key.offset = *start_ret;
1322
1323 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1324 if (ret < 0)
1325 goto out;
1326 if (ret > 0) {
1327 if (path->slots[0] == 0)
1328 goto out;
1329 path->slots[0]--;
1330 }
1331 if (ret != 0)
1332 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1333
1334 if (key.type != key_type || key.objectid != dirid) {
1335 ret = 1;
1336 goto next;
1337 }
1338 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1339 struct btrfs_dir_log_item);
1340 found_end = btrfs_dir_log_end(path->nodes[0], item);
1341
1342 if (*start_ret >= key.offset && *start_ret <= found_end) {
1343 ret = 0;
1344 *start_ret = key.offset;
1345 *end_ret = found_end;
1346 goto out;
1347 }
1348 ret = 1;
1349next:
1350 /* check the next slot in the tree to see if it is a valid item */
1351 nritems = btrfs_header_nritems(path->nodes[0]);
1352 if (path->slots[0] >= nritems) {
1353 ret = btrfs_next_leaf(root, path);
1354 if (ret)
1355 goto out;
1356 } else {
1357 path->slots[0]++;
1358 }
1359
1360 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1361
1362 if (key.type != key_type || key.objectid != dirid) {
1363 ret = 1;
1364 goto out;
1365 }
1366 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1367 struct btrfs_dir_log_item);
1368 found_end = btrfs_dir_log_end(path->nodes[0], item);
1369 *start_ret = key.offset;
1370 *end_ret = found_end;
1371 ret = 0;
1372out:
1373 btrfs_release_path(root, path);
1374 return ret;
1375}
1376
1377/*
1378 * this looks for a given directory item in the log. If the directory
1379 * item is not in the log, the item is removed and the inode it points
1380 * to is unlinked
1381 */
1382static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1383 struct btrfs_root *root,
1384 struct btrfs_root *log,
1385 struct btrfs_path *path,
1386 struct btrfs_path *log_path,
1387 struct inode *dir,
1388 struct btrfs_key *dir_key)
1389{
1390 int ret;
1391 struct extent_buffer *eb;
1392 int slot;
1393 u32 item_size;
1394 struct btrfs_dir_item *di;
1395 struct btrfs_dir_item *log_di;
1396 int name_len;
1397 unsigned long ptr;
1398 unsigned long ptr_end;
1399 char *name;
1400 struct inode *inode;
1401 struct btrfs_key location;
1402
1403again:
1404 eb = path->nodes[0];
1405 slot = path->slots[0];
1406 item_size = btrfs_item_size_nr(eb, slot);
1407 ptr = btrfs_item_ptr_offset(eb, slot);
1408 ptr_end = ptr + item_size;
1409 while (ptr < ptr_end) {
1410 di = (struct btrfs_dir_item *)ptr;
1411 name_len = btrfs_dir_name_len(eb, di);
1412 name = kmalloc(name_len, GFP_NOFS);
1413 if (!name) {
1414 ret = -ENOMEM;
1415 goto out;
1416 }
1417 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1418 name_len);
1419 log_di = NULL;
1420 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1421 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1422 dir_key->objectid,
1423 name, name_len, 0);
1424 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1425 log_di = btrfs_lookup_dir_index_item(trans, log,
1426 log_path,
1427 dir_key->objectid,
1428 dir_key->offset,
1429 name, name_len, 0);
1430 }
1431 if (!log_di || IS_ERR(log_di)) {
1432 btrfs_dir_item_key_to_cpu(eb, di, &location);
1433 btrfs_release_path(root, path);
1434 btrfs_release_path(log, log_path);
1435 inode = read_one_inode(root, location.objectid);
1436 BUG_ON(!inode);
1437
1438 ret = link_to_fixup_dir(trans, root,
1439 path, location.objectid);
1440 BUG_ON(ret);
1441 btrfs_inc_nlink(inode);
1442 ret = btrfs_unlink_inode(trans, root, dir, inode,
1443 name, name_len);
1444 BUG_ON(ret);
1445 kfree(name);
1446 iput(inode);
1447
1448 /* there might still be more names under this key
1449 * check and repeat if required
1450 */
1451 ret = btrfs_search_slot(NULL, root, dir_key, path,
1452 0, 0);
1453 if (ret == 0)
1454 goto again;
1455 ret = 0;
1456 goto out;
1457 }
1458 btrfs_release_path(log, log_path);
1459 kfree(name);
1460
1461 ptr = (unsigned long)(di + 1);
1462 ptr += name_len;
1463 }
1464 ret = 0;
1465out:
1466 btrfs_release_path(root, path);
1467 btrfs_release_path(log, log_path);
1468 return ret;
1469}
1470
1471/*
1472 * deletion replay happens before we copy any new directory items
1473 * out of the log or out of backreferences from inodes. It
1474 * scans the log to find ranges of keys that log is authoritative for,
1475 * and then scans the directory to find items in those ranges that are
1476 * not present in the log.
1477 *
1478 * Anything we don't find in the log is unlinked and removed from the
1479 * directory.
1480 */
1481static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1482 struct btrfs_root *root,
1483 struct btrfs_root *log,
1484 struct btrfs_path *path,
1485 u64 dirid)
1486{
1487 u64 range_start;
1488 u64 range_end;
1489 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1490 int ret = 0;
1491 struct btrfs_key dir_key;
1492 struct btrfs_key found_key;
1493 struct btrfs_path *log_path;
1494 struct inode *dir;
1495
1496 dir_key.objectid = dirid;
1497 dir_key.type = BTRFS_DIR_ITEM_KEY;
1498 log_path = btrfs_alloc_path();
1499 if (!log_path)
1500 return -ENOMEM;
1501
1502 dir = read_one_inode(root, dirid);
1503 /* it isn't an error if the inode isn't there, that can happen
1504 * because we replay the deletes before we copy in the inode item
1505 * from the log
1506 */
1507 if (!dir) {
1508 btrfs_free_path(log_path);
1509 return 0;
1510 }
1511again:
1512 range_start = 0;
1513 range_end = 0;
1514 while (1) {
1515 ret = find_dir_range(log, path, dirid, key_type,
1516 &range_start, &range_end);
1517 if (ret != 0)
1518 break;
1519
1520 dir_key.offset = range_start;
1521 while (1) {
1522 int nritems;
1523 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1524 0, 0);
1525 if (ret < 0)
1526 goto out;
1527
1528 nritems = btrfs_header_nritems(path->nodes[0]);
1529 if (path->slots[0] >= nritems) {
1530 ret = btrfs_next_leaf(root, path);
1531 if (ret)
1532 break;
1533 }
1534 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1535 path->slots[0]);
1536 if (found_key.objectid != dirid ||
1537 found_key.type != dir_key.type)
1538 goto next_type;
1539
1540 if (found_key.offset > range_end)
1541 break;
1542
1543 ret = check_item_in_log(trans, root, log, path,
1544 log_path, dir, &found_key);
1545 BUG_ON(ret);
1546 if (found_key.offset == (u64)-1)
1547 break;
1548 dir_key.offset = found_key.offset + 1;
1549 }
1550 btrfs_release_path(root, path);
1551 if (range_end == (u64)-1)
1552 break;
1553 range_start = range_end + 1;
1554 }
1555
1556next_type:
1557 ret = 0;
1558 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1559 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1560 dir_key.type = BTRFS_DIR_INDEX_KEY;
1561 btrfs_release_path(root, path);
1562 goto again;
1563 }
1564out:
1565 btrfs_release_path(root, path);
1566 btrfs_free_path(log_path);
1567 iput(dir);
1568 return ret;
1569}
1570
1571/*
1572 * the process_func used to replay items from the log tree. This
1573 * gets called in two different stages. The first stage just looks
1574 * for inodes and makes sure they are all copied into the subvolume.
1575 *
1576 * The second stage copies all the other item types from the log into
1577 * the subvolume. The two stage approach is slower, but gets rid of
1578 * lots of complexity around inodes referencing other inodes that exist
1579 * only in the log (references come from either directory items or inode
1580 * back refs).
1581 */
1582static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1583 struct walk_control *wc, u64 gen)
1584{
1585 int nritems;
1586 struct btrfs_path *path;
1587 struct btrfs_root *root = wc->replay_dest;
1588 struct btrfs_key key;
1589 u32 item_size;
1590 int level;
1591 int i;
1592 int ret;
1593
1594 btrfs_read_buffer(eb, gen);
1595
1596 level = btrfs_header_level(eb);
1597
1598 if (level != 0)
1599 return 0;
1600
1601 path = btrfs_alloc_path();
1602 BUG_ON(!path);
1603
1604 nritems = btrfs_header_nritems(eb);
1605 for (i = 0; i < nritems; i++) {
1606 btrfs_item_key_to_cpu(eb, &key, i);
1607 item_size = btrfs_item_size_nr(eb, i);
1608
1609 /* inode keys are done during the first stage */
1610 if (key.type == BTRFS_INODE_ITEM_KEY &&
1611 wc->stage == LOG_WALK_REPLAY_INODES) {
1612 struct inode *inode;
1613 struct btrfs_inode_item *inode_item;
1614 u32 mode;
1615
1616 inode_item = btrfs_item_ptr(eb, i,
1617 struct btrfs_inode_item);
1618 mode = btrfs_inode_mode(eb, inode_item);
1619 if (S_ISDIR(mode)) {
1620 ret = replay_dir_deletes(wc->trans,
1621 root, log, path, key.objectid);
1622 BUG_ON(ret);
1623 }
1624 ret = overwrite_item(wc->trans, root, path,
1625 eb, i, &key);
1626 BUG_ON(ret);
1627
1628 /* for regular files, truncate away
1629 * extents past the new EOF
1630 */
1631 if (S_ISREG(mode)) {
1632 inode = read_one_inode(root,
1633 key.objectid);
1634 BUG_ON(!inode);
1635
1636 ret = btrfs_truncate_inode_items(wc->trans,
1637 root, inode, inode->i_size,
1638 BTRFS_EXTENT_DATA_KEY);
1639 BUG_ON(ret);
1640 iput(inode);
1641 }
1642 ret = link_to_fixup_dir(wc->trans, root,
1643 path, key.objectid);
1644 BUG_ON(ret);
1645 }
1646 if (wc->stage < LOG_WALK_REPLAY_ALL)
1647 continue;
1648
1649 /* these keys are simply copied */
1650 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1651 ret = overwrite_item(wc->trans, root, path,
1652 eb, i, &key);
1653 BUG_ON(ret);
1654 } else if (key.type == BTRFS_INODE_REF_KEY) {
1655 ret = add_inode_ref(wc->trans, root, log, path,
1656 eb, i, &key);
1657 BUG_ON(ret && ret != -ENOENT);
1658 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1659 ret = replay_one_extent(wc->trans, root, path,
1660 eb, i, &key);
1661 BUG_ON(ret);
1662 } else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
1663 ret = replay_one_csum(wc->trans, root, path,
1664 eb, i, &key);
1665 BUG_ON(ret);
1666 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1667 key.type == BTRFS_DIR_INDEX_KEY) {
1668 ret = replay_one_dir_item(wc->trans, root, path,
1669 eb, i, &key);
1670 BUG_ON(ret);
1671 }
1672 }
1673 btrfs_free_path(path);
1674 return 0;
1675}
1676
1677static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1678 struct btrfs_root *root,
1679 struct btrfs_path *path, int *level,
1680 struct walk_control *wc)
1681{
1682 u64 root_owner;
1683 u64 root_gen;
1684 u64 bytenr;
1685 u64 ptr_gen;
1686 struct extent_buffer *next;
1687 struct extent_buffer *cur;
1688 struct extent_buffer *parent;
1689 u32 blocksize;
1690 int ret = 0;
1691
1692 WARN_ON(*level < 0);
1693 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1694
1695 while (*level > 0) {
1696 WARN_ON(*level < 0);
1697 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1698 cur = path->nodes[*level];
1699
1700 if (btrfs_header_level(cur) != *level)
1701 WARN_ON(1);
1702
1703 if (path->slots[*level] >=
1704 btrfs_header_nritems(cur))
1705 break;
1706
1707 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1708 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1709 blocksize = btrfs_level_size(root, *level - 1);
1710
1711 parent = path->nodes[*level];
1712 root_owner = btrfs_header_owner(parent);
1713 root_gen = btrfs_header_generation(parent);
1714
1715 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1716
1717 wc->process_func(root, next, wc, ptr_gen);
1718
1719 if (*level == 1) {
1720 path->slots[*level]++;
1721 if (wc->free) {
1722 btrfs_read_buffer(next, ptr_gen);
1723
1724 btrfs_tree_lock(next);
1725 clean_tree_block(trans, root, next);
1726 btrfs_wait_tree_block_writeback(next);
1727 btrfs_tree_unlock(next);
1728
1729 ret = btrfs_drop_leaf_ref(trans, root, next);
1730 BUG_ON(ret);
1731
1732 WARN_ON(root_owner !=
1733 BTRFS_TREE_LOG_OBJECTID);
1734 ret = btrfs_free_reserved_extent(root,
1735 bytenr, blocksize);
1736 BUG_ON(ret);
1737 }
1738 free_extent_buffer(next);
1739 continue;
1740 }
1741 btrfs_read_buffer(next, ptr_gen);
1742
1743 WARN_ON(*level <= 0);
1744 if (path->nodes[*level-1])
1745 free_extent_buffer(path->nodes[*level-1]);
1746 path->nodes[*level-1] = next;
1747 *level = btrfs_header_level(next);
1748 path->slots[*level] = 0;
1749 cond_resched();
1750 }
1751 WARN_ON(*level < 0);
1752 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1753
1754 if (path->nodes[*level] == root->node)
1755 parent = path->nodes[*level];
1756 else
1757 parent = path->nodes[*level + 1];
1758
1759 bytenr = path->nodes[*level]->start;
1760
1761 blocksize = btrfs_level_size(root, *level);
1762 root_owner = btrfs_header_owner(parent);
1763 root_gen = btrfs_header_generation(parent);
1764
1765 wc->process_func(root, path->nodes[*level], wc,
1766 btrfs_header_generation(path->nodes[*level]));
1767
1768 if (wc->free) {
1769 next = path->nodes[*level];
1770 btrfs_tree_lock(next);
1771 clean_tree_block(trans, root, next);
1772 btrfs_wait_tree_block_writeback(next);
1773 btrfs_tree_unlock(next);
1774
1775 if (*level == 0) {
1776 ret = btrfs_drop_leaf_ref(trans, root, next);
1777 BUG_ON(ret);
1778 }
1779 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1780 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1781 BUG_ON(ret);
1782 }
1783 free_extent_buffer(path->nodes[*level]);
1784 path->nodes[*level] = NULL;
1785 *level += 1;
1786
1787 cond_resched();
1788 return 0;
1789}
1790
1791static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1792 struct btrfs_root *root,
1793 struct btrfs_path *path, int *level,
1794 struct walk_control *wc)
1795{
1796 u64 root_owner;
1797 u64 root_gen;
1798 int i;
1799 int slot;
1800 int ret;
1801
1802 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1803 slot = path->slots[i];
1804 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1805 struct extent_buffer *node;
1806 node = path->nodes[i];
1807 path->slots[i]++;
1808 *level = i;
1809 WARN_ON(*level == 0);
1810 return 0;
1811 } else {
1812 struct extent_buffer *parent;
1813 if (path->nodes[*level] == root->node)
1814 parent = path->nodes[*level];
1815 else
1816 parent = path->nodes[*level + 1];
1817
1818 root_owner = btrfs_header_owner(parent);
1819 root_gen = btrfs_header_generation(parent);
1820 wc->process_func(root, path->nodes[*level], wc,
1821 btrfs_header_generation(path->nodes[*level]));
1822 if (wc->free) {
1823 struct extent_buffer *next;
1824
1825 next = path->nodes[*level];
1826
1827 btrfs_tree_lock(next);
1828 clean_tree_block(trans, root, next);
1829 btrfs_wait_tree_block_writeback(next);
1830 btrfs_tree_unlock(next);
1831
1832 if (*level == 0) {
1833 ret = btrfs_drop_leaf_ref(trans, root,
1834 next);
1835 BUG_ON(ret);
1836 }
1837
1838 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1839 ret = btrfs_free_reserved_extent(root,
1840 path->nodes[*level]->start,
1841 path->nodes[*level]->len);
1842 BUG_ON(ret);
1843 }
1844 free_extent_buffer(path->nodes[*level]);
1845 path->nodes[*level] = NULL;
1846 *level = i + 1;
1847 }
1848 }
1849 return 1;
1850}
1851
1852/*
1853 * drop the reference count on the tree rooted at 'snap'. This traverses
1854 * the tree freeing any blocks that have a ref count of zero after being
1855 * decremented.
1856 */
1857static int walk_log_tree(struct btrfs_trans_handle *trans,
1858 struct btrfs_root *log, struct walk_control *wc)
1859{
1860 int ret = 0;
1861 int wret;
1862 int level;
1863 struct btrfs_path *path;
1864 int i;
1865 int orig_level;
1866
1867 path = btrfs_alloc_path();
1868 BUG_ON(!path);
1869
1870 level = btrfs_header_level(log->node);
1871 orig_level = level;
1872 path->nodes[level] = log->node;
1873 extent_buffer_get(log->node);
1874 path->slots[level] = 0;
1875
1876 while (1) {
1877 wret = walk_down_log_tree(trans, log, path, &level, wc);
1878 if (wret > 0)
1879 break;
1880 if (wret < 0)
1881 ret = wret;
1882
1883 wret = walk_up_log_tree(trans, log, path, &level, wc);
1884 if (wret > 0)
1885 break;
1886 if (wret < 0)
1887 ret = wret;
1888 }
1889
1890 /* was the root node processed? if not, catch it here */
1891 if (path->nodes[orig_level]) {
1892 wc->process_func(log, path->nodes[orig_level], wc,
1893 btrfs_header_generation(path->nodes[orig_level]));
1894 if (wc->free) {
1895 struct extent_buffer *next;
1896
1897 next = path->nodes[orig_level];
1898
1899 btrfs_tree_lock(next);
1900 clean_tree_block(trans, log, next);
1901 btrfs_wait_tree_block_writeback(next);
1902 btrfs_tree_unlock(next);
1903
1904 if (orig_level == 0) {
1905 ret = btrfs_drop_leaf_ref(trans, log,
1906 next);
1907 BUG_ON(ret);
1908 }
1909 WARN_ON(log->root_key.objectid !=
1910 BTRFS_TREE_LOG_OBJECTID);
1911 ret = btrfs_free_reserved_extent(log, next->start,
1912 next->len);
1913 BUG_ON(ret);
1914 }
1915 }
1916
1917 for (i = 0; i <= orig_level; i++) {
1918 if (path->nodes[i]) {
1919 free_extent_buffer(path->nodes[i]);
1920 path->nodes[i] = NULL;
1921 }
1922 }
1923 btrfs_free_path(path);
1924 if (wc->free)
1925 free_extent_buffer(log->node);
1926 return ret;
1927}
1928
1929static int wait_log_commit(struct btrfs_root *log)
1930{
1931 DEFINE_WAIT(wait);
1932 u64 transid = log->fs_info->tree_log_transid;
1933
1934 do {
1935 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1936 TASK_UNINTERRUPTIBLE);
1937 mutex_unlock(&log->fs_info->tree_log_mutex);
1938 if (atomic_read(&log->fs_info->tree_log_commit))
1939 schedule();
1940 finish_wait(&log->fs_info->tree_log_wait, &wait);
1941 mutex_lock(&log->fs_info->tree_log_mutex);
1942 } while (transid == log->fs_info->tree_log_transid &&
1943 atomic_read(&log->fs_info->tree_log_commit));
1944 return 0;
1945}
1946
1947/*
1948 * btrfs_sync_log does sends a given tree log down to the disk and
1949 * updates the super blocks to record it. When this call is done,
1950 * you know that any inodes previously logged are safely on disk
1951 */
1952int btrfs_sync_log(struct btrfs_trans_handle *trans,
1953 struct btrfs_root *root)
1954{
1955 int ret;
1956 unsigned long batch;
1957 struct btrfs_root *log = root->log_root;
1958
1959 mutex_lock(&log->fs_info->tree_log_mutex);
1960 if (atomic_read(&log->fs_info->tree_log_commit)) {
1961 wait_log_commit(log);
1962 goto out;
1963 }
1964 atomic_set(&log->fs_info->tree_log_commit, 1);
1965
1966 while (1) {
1967 batch = log->fs_info->tree_log_batch;
1968 mutex_unlock(&log->fs_info->tree_log_mutex);
1969 schedule_timeout_uninterruptible(1);
1970 mutex_lock(&log->fs_info->tree_log_mutex);
1971
1972 while (atomic_read(&log->fs_info->tree_log_writers)) {
1973 DEFINE_WAIT(wait);
1974 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1975 TASK_UNINTERRUPTIBLE);
1976 mutex_unlock(&log->fs_info->tree_log_mutex);
1977 if (atomic_read(&log->fs_info->tree_log_writers))
1978 schedule();
1979 mutex_lock(&log->fs_info->tree_log_mutex);
1980 finish_wait(&log->fs_info->tree_log_wait, &wait);
1981 }
1982 if (batch == log->fs_info->tree_log_batch)
1983 break;
1984 }
1985
1986 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1987 BUG_ON(ret);
1988 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1989 &root->fs_info->log_root_tree->dirty_log_pages);
1990 BUG_ON(ret);
1991
1992 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1993 log->fs_info->log_root_tree->node->start);
1994 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1995 btrfs_header_level(log->fs_info->log_root_tree->node));
1996
1997 write_ctree_super(trans, log->fs_info->tree_root, 2);
1998 log->fs_info->tree_log_transid++;
1999 log->fs_info->tree_log_batch = 0;
2000 atomic_set(&log->fs_info->tree_log_commit, 0);
2001 smp_mb();
2002 if (waitqueue_active(&log->fs_info->tree_log_wait))
2003 wake_up(&log->fs_info->tree_log_wait);
2004out:
2005 mutex_unlock(&log->fs_info->tree_log_mutex);
2006 return 0;
2007}
2008
2009/* * free all the extents used by the tree log. This should be called
2010 * at commit time of the full transaction
2011 */
2012int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2013{
2014 int ret;
2015 struct btrfs_root *log;
2016 struct key;
2017 u64 start;
2018 u64 end;
2019 struct walk_control wc = {
2020 .free = 1,
2021 .process_func = process_one_buffer
2022 };
2023
2024 if (!root->log_root)
2025 return 0;
2026
2027 log = root->log_root;
2028 ret = walk_log_tree(trans, log, &wc);
2029 BUG_ON(ret);
2030
2031 while (1) {
2032 ret = find_first_extent_bit(&log->dirty_log_pages,
2033 0, &start, &end, EXTENT_DIRTY);
2034 if (ret)
2035 break;
2036
2037 clear_extent_dirty(&log->dirty_log_pages,
2038 start, end, GFP_NOFS);
2039 }
2040
2041 log = root->log_root;
2042 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2043 &log->root_key);
2044 BUG_ON(ret);
2045 root->log_root = NULL;
2046 kfree(root->log_root);
2047 return 0;
2048}
2049
2050/*
2051 * helper function to update the item for a given subvolumes log root
2052 * in the tree of log roots
2053 */
2054static int update_log_root(struct btrfs_trans_handle *trans,
2055 struct btrfs_root *log)
2056{
2057 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2058 int ret;
2059
2060 if (log->node->start == bytenr)
2061 return 0;
2062
2063 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2064 btrfs_set_root_generation(&log->root_item, trans->transid);
2065 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2066 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2067 &log->root_key, &log->root_item);
2068 BUG_ON(ret);
2069 return ret;
2070}
2071
2072/*
2073 * If both a file and directory are logged, and unlinks or renames are
2074 * mixed in, we have a few interesting corners:
2075 *
2076 * create file X in dir Y
2077 * link file X to X.link in dir Y
2078 * fsync file X
2079 * unlink file X but leave X.link
2080 * fsync dir Y
2081 *
2082 * After a crash we would expect only X.link to exist. But file X
2083 * didn't get fsync'd again so the log has back refs for X and X.link.
2084 *
2085 * We solve this by removing directory entries and inode backrefs from the
2086 * log when a file that was logged in the current transaction is
2087 * unlinked. Any later fsync will include the updated log entries, and
2088 * we'll be able to reconstruct the proper directory items from backrefs.
2089 *
2090 * This optimizations allows us to avoid relogging the entire inode
2091 * or the entire directory.
2092 */
2093int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root,
2095 const char *name, int name_len,
2096 struct inode *dir, u64 index)
2097{
2098 struct btrfs_root *log;
2099 struct btrfs_dir_item *di;
2100 struct btrfs_path *path;
2101 int ret;
2102 int bytes_del = 0;
2103
2104 if (BTRFS_I(dir)->logged_trans < trans->transid)
2105 return 0;
2106
2107 ret = join_running_log_trans(root);
2108 if (ret)
2109 return 0;
2110
2111 mutex_lock(&BTRFS_I(dir)->log_mutex);
2112
2113 log = root->log_root;
2114 path = btrfs_alloc_path();
2115 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2116 name, name_len, -1);
2117 if (di && !IS_ERR(di)) {
2118 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2119 bytes_del += name_len;
2120 BUG_ON(ret);
2121 }
2122 btrfs_release_path(log, path);
2123 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2124 index, name, name_len, -1);
2125 if (di && !IS_ERR(di)) {
2126 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2127 bytes_del += name_len;
2128 BUG_ON(ret);
2129 }
2130
2131 /* update the directory size in the log to reflect the names
2132 * we have removed
2133 */
2134 if (bytes_del) {
2135 struct btrfs_key key;
2136
2137 key.objectid = dir->i_ino;
2138 key.offset = 0;
2139 key.type = BTRFS_INODE_ITEM_KEY;
2140 btrfs_release_path(log, path);
2141
2142 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2143 if (ret == 0) {
2144 struct btrfs_inode_item *item;
2145 u64 i_size;
2146
2147 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2148 struct btrfs_inode_item);
2149 i_size = btrfs_inode_size(path->nodes[0], item);
2150 if (i_size > bytes_del)
2151 i_size -= bytes_del;
2152 else
2153 i_size = 0;
2154 btrfs_set_inode_size(path->nodes[0], item, i_size);
2155 btrfs_mark_buffer_dirty(path->nodes[0]);
2156 } else
2157 ret = 0;
2158 btrfs_release_path(log, path);
2159 }
2160
2161 btrfs_free_path(path);
2162 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2163 end_log_trans(root);
2164
2165 return 0;
2166}
2167
2168/* see comments for btrfs_del_dir_entries_in_log */
2169int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2170 struct btrfs_root *root,
2171 const char *name, int name_len,
2172 struct inode *inode, u64 dirid)
2173{
2174 struct btrfs_root *log;
2175 u64 index;
2176 int ret;
2177
2178 if (BTRFS_I(inode)->logged_trans < trans->transid)
2179 return 0;
2180
2181 ret = join_running_log_trans(root);
2182 if (ret)
2183 return 0;
2184 log = root->log_root;
2185 mutex_lock(&BTRFS_I(inode)->log_mutex);
2186
2187 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2188 dirid, &index);
2189 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2190 end_log_trans(root);
2191
2192 return ret;
2193}
2194
2195/*
2196 * creates a range item in the log for 'dirid'. first_offset and
2197 * last_offset tell us which parts of the key space the log should
2198 * be considered authoritative for.
2199 */
2200static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2201 struct btrfs_root *log,
2202 struct btrfs_path *path,
2203 int key_type, u64 dirid,
2204 u64 first_offset, u64 last_offset)
2205{
2206 int ret;
2207 struct btrfs_key key;
2208 struct btrfs_dir_log_item *item;
2209
2210 key.objectid = dirid;
2211 key.offset = first_offset;
2212 if (key_type == BTRFS_DIR_ITEM_KEY)
2213 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2214 else
2215 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2216 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2217 BUG_ON(ret);
2218
2219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2220 struct btrfs_dir_log_item);
2221 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2222 btrfs_mark_buffer_dirty(path->nodes[0]);
2223 btrfs_release_path(log, path);
2224 return 0;
2225}
2226
2227/*
2228 * log all the items included in the current transaction for a given
2229 * directory. This also creates the range items in the log tree required
2230 * to replay anything deleted before the fsync
2231 */
2232static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2233 struct btrfs_root *root, struct inode *inode,
2234 struct btrfs_path *path,
2235 struct btrfs_path *dst_path, int key_type,
2236 u64 min_offset, u64 *last_offset_ret)
2237{
2238 struct btrfs_key min_key;
2239 struct btrfs_key max_key;
2240 struct btrfs_root *log = root->log_root;
2241 struct extent_buffer *src;
2242 int ret;
2243 int i;
2244 int nritems;
2245 u64 first_offset = min_offset;
2246 u64 last_offset = (u64)-1;
2247
2248 log = root->log_root;
2249 max_key.objectid = inode->i_ino;
2250 max_key.offset = (u64)-1;
2251 max_key.type = key_type;
2252
2253 min_key.objectid = inode->i_ino;
2254 min_key.type = key_type;
2255 min_key.offset = min_offset;
2256
2257 path->keep_locks = 1;
2258
2259 ret = btrfs_search_forward(root, &min_key, &max_key,
2260 path, 0, trans->transid);
2261
2262 /*
2263 * we didn't find anything from this transaction, see if there
2264 * is anything at all
2265 */
2266 if (ret != 0 || min_key.objectid != inode->i_ino ||
2267 min_key.type != key_type) {
2268 min_key.objectid = inode->i_ino;
2269 min_key.type = key_type;
2270 min_key.offset = (u64)-1;
2271 btrfs_release_path(root, path);
2272 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2273 if (ret < 0) {
2274 btrfs_release_path(root, path);
2275 return ret;
2276 }
2277 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2278
2279 /* if ret == 0 there are items for this type,
2280 * create a range to tell us the last key of this type.
2281 * otherwise, there are no items in this directory after
2282 * *min_offset, and we create a range to indicate that.
2283 */
2284 if (ret == 0) {
2285 struct btrfs_key tmp;
2286 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2287 path->slots[0]);
2288 if (key_type == tmp.type)
2289 first_offset = max(min_offset, tmp.offset) + 1;
2290 }
2291 goto done;
2292 }
2293
2294 /* go backward to find any previous key */
2295 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2296 if (ret == 0) {
2297 struct btrfs_key tmp;
2298 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2299 if (key_type == tmp.type) {
2300 first_offset = tmp.offset;
2301 ret = overwrite_item(trans, log, dst_path,
2302 path->nodes[0], path->slots[0],
2303 &tmp);
2304 }
2305 }
2306 btrfs_release_path(root, path);
2307
2308 /* find the first key from this transaction again */
2309 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2310 if (ret != 0) {
2311 WARN_ON(1);
2312 goto done;
2313 }
2314
2315 /*
2316 * we have a block from this transaction, log every item in it
2317 * from our directory
2318 */
2319 while (1) {
2320 struct btrfs_key tmp;
2321 src = path->nodes[0];
2322 nritems = btrfs_header_nritems(src);
2323 for (i = path->slots[0]; i < nritems; i++) {
2324 btrfs_item_key_to_cpu(src, &min_key, i);
2325
2326 if (min_key.objectid != inode->i_ino ||
2327 min_key.type != key_type)
2328 goto done;
2329 ret = overwrite_item(trans, log, dst_path, src, i,
2330 &min_key);
2331 BUG_ON(ret);
2332 }
2333 path->slots[0] = nritems;
2334
2335 /*
2336 * look ahead to the next item and see if it is also
2337 * from this directory and from this transaction
2338 */
2339 ret = btrfs_next_leaf(root, path);
2340 if (ret == 1) {
2341 last_offset = (u64)-1;
2342 goto done;
2343 }
2344 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2345 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2346 last_offset = (u64)-1;
2347 goto done;
2348 }
2349 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2350 ret = overwrite_item(trans, log, dst_path,
2351 path->nodes[0], path->slots[0],
2352 &tmp);
2353
2354 BUG_ON(ret);
2355 last_offset = tmp.offset;
2356 goto done;
2357 }
2358 }
2359done:
2360 *last_offset_ret = last_offset;
2361 btrfs_release_path(root, path);
2362 btrfs_release_path(log, dst_path);
2363
2364 /* insert the log range keys to indicate where the log is valid */
2365 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2366 first_offset, last_offset);
2367 BUG_ON(ret);
2368 return 0;
2369}
2370
2371/*
2372 * logging directories is very similar to logging inodes, We find all the items
2373 * from the current transaction and write them to the log.
2374 *
2375 * The recovery code scans the directory in the subvolume, and if it finds a
2376 * key in the range logged that is not present in the log tree, then it means
2377 * that dir entry was unlinked during the transaction.
2378 *
2379 * In order for that scan to work, we must include one key smaller than
2380 * the smallest logged by this transaction and one key larger than the largest
2381 * key logged by this transaction.
2382 */
2383static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2384 struct btrfs_root *root, struct inode *inode,
2385 struct btrfs_path *path,
2386 struct btrfs_path *dst_path)
2387{
2388 u64 min_key;
2389 u64 max_key;
2390 int ret;
2391 int key_type = BTRFS_DIR_ITEM_KEY;
2392
2393again:
2394 min_key = 0;
2395 max_key = 0;
2396 while (1) {
2397 ret = log_dir_items(trans, root, inode, path,
2398 dst_path, key_type, min_key,
2399 &max_key);
2400 BUG_ON(ret);
2401 if (max_key == (u64)-1)
2402 break;
2403 min_key = max_key + 1;
2404 }
2405
2406 if (key_type == BTRFS_DIR_ITEM_KEY) {
2407 key_type = BTRFS_DIR_INDEX_KEY;
2408 goto again;
2409 }
2410 return 0;
2411}
2412
2413/*
2414 * a helper function to drop items from the log before we relog an
2415 * inode. max_key_type indicates the highest item type to remove.
2416 * This cannot be run for file data extents because it does not
2417 * free the extents they point to.
2418 */
2419static int drop_objectid_items(struct btrfs_trans_handle *trans,
2420 struct btrfs_root *log,
2421 struct btrfs_path *path,
2422 u64 objectid, int max_key_type)
2423{
2424 int ret;
2425 struct btrfs_key key;
2426 struct btrfs_key found_key;
2427
2428 key.objectid = objectid;
2429 key.type = max_key_type;
2430 key.offset = (u64)-1;
2431
2432 while (1) {
2433 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2434
2435 if (ret != 1)
2436 break;
2437
2438 if (path->slots[0] == 0)
2439 break;
2440
2441 path->slots[0]--;
2442 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2443 path->slots[0]);
2444
2445 if (found_key.objectid != objectid)
2446 break;
2447
2448 ret = btrfs_del_item(trans, log, path);
2449 BUG_ON(ret);
2450 btrfs_release_path(log, path);
2451 }
2452 btrfs_release_path(log, path);
2453 return 0;
2454}
2455
2456static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
2457 struct list_head *list,
2458 struct btrfs_root *root,
2459 u64 disk_bytenr, u64 len)
2460{
2461 struct btrfs_ordered_sum *sums;
2462 struct btrfs_sector_sum *sector_sum;
2463 int ret;
2464 struct btrfs_path *path;
2465 struct btrfs_csum_item *item = NULL;
2466 u64 end = disk_bytenr + len;
2467 u64 item_start_offset = 0;
2468 u64 item_last_offset = 0;
2469 u32 diff;
2470 u32 sum;
2471 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
2472
2473 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
2474
2475 sector_sum = sums->sums;
2476 sums->bytenr = disk_bytenr;
2477 sums->len = len;
2478 list_add_tail(&sums->list, list);
2479
2480 path = btrfs_alloc_path();
2481 while (disk_bytenr < end) {
2482 if (!item || disk_bytenr < item_start_offset ||
2483 disk_bytenr >= item_last_offset) {
2484 struct btrfs_key found_key;
2485 u32 item_size;
2486
2487 if (item)
2488 btrfs_release_path(root, path);
2489 item = btrfs_lookup_csum(NULL, root, path,
2490 disk_bytenr, 0);
2491 if (IS_ERR(item)) {
2492 ret = PTR_ERR(item);
2493 if (ret == -ENOENT || ret == -EFBIG)
2494 ret = 0;
2495 sum = 0;
2496 printk(KERN_INFO "log no csum found for "
2497 "byte %llu\n",
2498 (unsigned long long)disk_bytenr);
2499 item = NULL;
2500 btrfs_release_path(root, path);
2501 goto found;
2502 }
2503 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2504 path->slots[0]);
2505
2506 item_start_offset = found_key.offset;
2507 item_size = btrfs_item_size_nr(path->nodes[0],
2508 path->slots[0]);
2509 item_last_offset = item_start_offset +
2510 (item_size / csum_size) *
2511 root->sectorsize;
2512 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2513 struct btrfs_csum_item);
2514 }
2515 /*
2516 * this byte range must be able to fit inside
2517 * a single leaf so it will also fit inside a u32
2518 */
2519 diff = disk_bytenr - item_start_offset;
2520 diff = diff / root->sectorsize;
2521 diff = diff * csum_size;
2522
2523 read_extent_buffer(path->nodes[0], &sum,
2524 ((unsigned long)item) + diff,
2525 csum_size);
2526found:
2527 sector_sum->bytenr = disk_bytenr;
2528 sector_sum->sum = sum;
2529 disk_bytenr += root->sectorsize;
2530 sector_sum++;
2531 }
2532 btrfs_free_path(path);
2533 return 0;
2534}
2535
2536static noinline int copy_items(struct btrfs_trans_handle *trans,
2537 struct btrfs_root *log,
2538 struct btrfs_path *dst_path,
2539 struct extent_buffer *src,
2540 int start_slot, int nr, int inode_only)
2541{
2542 unsigned long src_offset;
2543 unsigned long dst_offset;
2544 struct btrfs_file_extent_item *extent;
2545 struct btrfs_inode_item *inode_item;
2546 int ret;
2547 struct btrfs_key *ins_keys;
2548 u32 *ins_sizes;
2549 char *ins_data;
2550 int i;
2551 struct list_head ordered_sums;
2552
2553 INIT_LIST_HEAD(&ordered_sums);
2554
2555 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2556 nr * sizeof(u32), GFP_NOFS);
2557 ins_sizes = (u32 *)ins_data;
2558 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2559
2560 for (i = 0; i < nr; i++) {
2561 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2562 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2563 }
2564 ret = btrfs_insert_empty_items(trans, log, dst_path,
2565 ins_keys, ins_sizes, nr);
2566 BUG_ON(ret);
2567
2568 for (i = 0; i < nr; i++) {
2569 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2570 dst_path->slots[0]);
2571
2572 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2573
2574 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2575 src_offset, ins_sizes[i]);
2576
2577 if (inode_only == LOG_INODE_EXISTS &&
2578 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2579 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2580 dst_path->slots[0],
2581 struct btrfs_inode_item);
2582 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2583
2584 /* set the generation to zero so the recover code
2585 * can tell the difference between an logging
2586 * just to say 'this inode exists' and a logging
2587 * to say 'update this inode with these values'
2588 */
2589 btrfs_set_inode_generation(dst_path->nodes[0],
2590 inode_item, 0);
2591 }
2592 /* take a reference on file data extents so that truncates
2593 * or deletes of this inode don't have to relog the inode
2594 * again
2595 */
2596 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2597 int found_type;
2598 extent = btrfs_item_ptr(src, start_slot + i,
2599 struct btrfs_file_extent_item);
2600
2601 found_type = btrfs_file_extent_type(src, extent);
2602 if (found_type == BTRFS_FILE_EXTENT_REG ||
2603 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2604 u64 ds = btrfs_file_extent_disk_bytenr(src,
2605 extent);
2606 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2607 extent);
2608 u64 cs = btrfs_file_extent_offset(src, extent);
2609 u64 cl = btrfs_file_extent_num_bytes(src,
2610 extent);;
2611 if (btrfs_file_extent_compression(src,
2612 extent)) {
2613 cs = 0;
2614 cl = dl;
2615 }
2616 /* ds == 0 is a hole */
2617 if (ds != 0) {
2618 ret = btrfs_inc_extent_ref(trans, log,
2619 ds, dl,
2620 dst_path->nodes[0]->start,
2621 BTRFS_TREE_LOG_OBJECTID,
2622 trans->transid,
2623 ins_keys[i].objectid);
2624 BUG_ON(ret);
2625 ret = copy_extent_csums(trans,
2626 &ordered_sums,
2627 log->fs_info->csum_root,
2628 ds + cs, cl);
2629 BUG_ON(ret);
2630 }
2631 }
2632 }
2633 dst_path->slots[0]++;
2634 }
2635
2636 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2637 btrfs_release_path(log, dst_path);
2638 kfree(ins_data);
2639
2640 /*
2641 * we have to do this after the loop above to avoid changing the
2642 * log tree while trying to change the log tree.
2643 */
2644 while (!list_empty(&ordered_sums)) {
2645 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2646 struct btrfs_ordered_sum,
2647 list);
2648 ret = btrfs_csum_file_blocks(trans, log, sums);
2649 BUG_ON(ret);
2650 list_del(&sums->list);
2651 kfree(sums);
2652 }
2653 return 0;
2654}
2655
2656/* log a single inode in the tree log.
2657 * At least one parent directory for this inode must exist in the tree
2658 * or be logged already.
2659 *
2660 * Any items from this inode changed by the current transaction are copied
2661 * to the log tree. An extra reference is taken on any extents in this
2662 * file, allowing us to avoid a whole pile of corner cases around logging
2663 * blocks that have been removed from the tree.
2664 *
2665 * See LOG_INODE_ALL and related defines for a description of what inode_only
2666 * does.
2667 *
2668 * This handles both files and directories.
2669 */
2670static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2671 struct btrfs_root *root, struct inode *inode,
2672 int inode_only)
2673{
2674 struct btrfs_path *path;
2675 struct btrfs_path *dst_path;
2676 struct btrfs_key min_key;
2677 struct btrfs_key max_key;
2678 struct btrfs_root *log = root->log_root;
2679 struct extent_buffer *src = NULL;
2680 u32 size;
2681 int ret;
2682 int nritems;
2683 int ins_start_slot = 0;
2684 int ins_nr;
2685
2686 log = root->log_root;
2687
2688 path = btrfs_alloc_path();
2689 dst_path = btrfs_alloc_path();
2690
2691 min_key.objectid = inode->i_ino;
2692 min_key.type = BTRFS_INODE_ITEM_KEY;
2693 min_key.offset = 0;
2694
2695 max_key.objectid = inode->i_ino;
2696 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2697 max_key.type = BTRFS_XATTR_ITEM_KEY;
2698 else
2699 max_key.type = (u8)-1;
2700 max_key.offset = (u64)-1;
2701
2702 /*
2703 * if this inode has already been logged and we're in inode_only
2704 * mode, we don't want to delete the things that have already
2705 * been written to the log.
2706 *
2707 * But, if the inode has been through an inode_only log,
2708 * the logged_trans field is not set. This allows us to catch
2709 * any new names for this inode in the backrefs by logging it
2710 * again
2711 */
2712 if (inode_only == LOG_INODE_EXISTS &&
2713 BTRFS_I(inode)->logged_trans == trans->transid) {
2714 btrfs_free_path(path);
2715 btrfs_free_path(dst_path);
2716 goto out;
2717 }
2718 mutex_lock(&BTRFS_I(inode)->log_mutex);
2719
2720 /*
2721 * a brute force approach to making sure we get the most uptodate
2722 * copies of everything.
2723 */
2724 if (S_ISDIR(inode->i_mode)) {
2725 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2726
2727 if (inode_only == LOG_INODE_EXISTS)
2728 max_key_type = BTRFS_XATTR_ITEM_KEY;
2729 ret = drop_objectid_items(trans, log, path,
2730 inode->i_ino, max_key_type);
2731 } else {
2732 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2733 }
2734 BUG_ON(ret);
2735 path->keep_locks = 1;
2736
2737 while (1) {
2738 ins_nr = 0;
2739 ret = btrfs_search_forward(root, &min_key, &max_key,
2740 path, 0, trans->transid);
2741 if (ret != 0)
2742 break;
2743again:
2744 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2745 if (min_key.objectid != inode->i_ino)
2746 break;
2747 if (min_key.type > max_key.type)
2748 break;
2749
2750 src = path->nodes[0];
2751 size = btrfs_item_size_nr(src, path->slots[0]);
2752 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2753 ins_nr++;
2754 goto next_slot;
2755 } else if (!ins_nr) {
2756 ins_start_slot = path->slots[0];
2757 ins_nr = 1;
2758 goto next_slot;
2759 }
2760
2761 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2762 ins_nr, inode_only);
2763 BUG_ON(ret);
2764 ins_nr = 1;
2765 ins_start_slot = path->slots[0];
2766next_slot:
2767
2768 nritems = btrfs_header_nritems(path->nodes[0]);
2769 path->slots[0]++;
2770 if (path->slots[0] < nritems) {
2771 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2772 path->slots[0]);
2773 goto again;
2774 }
2775 if (ins_nr) {
2776 ret = copy_items(trans, log, dst_path, src,
2777 ins_start_slot,
2778 ins_nr, inode_only);
2779 BUG_ON(ret);
2780 ins_nr = 0;
2781 }
2782 btrfs_release_path(root, path);
2783
2784 if (min_key.offset < (u64)-1)
2785 min_key.offset++;
2786 else if (min_key.type < (u8)-1)
2787 min_key.type++;
2788 else if (min_key.objectid < (u64)-1)
2789 min_key.objectid++;
2790 else
2791 break;
2792 }
2793 if (ins_nr) {
2794 ret = copy_items(trans, log, dst_path, src,
2795 ins_start_slot,
2796 ins_nr, inode_only);
2797 BUG_ON(ret);
2798 ins_nr = 0;
2799 }
2800 WARN_ON(ins_nr);
2801 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2802 btrfs_release_path(root, path);
2803 btrfs_release_path(log, dst_path);
2804 BTRFS_I(inode)->log_dirty_trans = 0;
2805 ret = log_directory_changes(trans, root, inode, path, dst_path);
2806 BUG_ON(ret);
2807 }
2808 BTRFS_I(inode)->logged_trans = trans->transid;
2809 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2810
2811 btrfs_free_path(path);
2812 btrfs_free_path(dst_path);
2813
2814 mutex_lock(&root->fs_info->tree_log_mutex);
2815 ret = update_log_root(trans, log);
2816 BUG_ON(ret);
2817 mutex_unlock(&root->fs_info->tree_log_mutex);
2818out:
2819 return 0;
2820}
2821
/*
 * Log a single inode: wraps __btrfs_log_inode() in a
 * start_log_trans()/end_log_trans() pair and returns its result.
 */
int btrfs_log_inode(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct inode *inode,
		    int inode_only)
{
	int err;

	start_log_trans(trans, root);
	err = __btrfs_log_inode(trans, root, inode, inode_only);
	end_log_trans(root);

	return err;
}
2833
2834/*
2835 * helper function around btrfs_log_inode to make sure newly created
2836 * parent directories also end up in the log. A minimal inode and backref
2837 * only logging is done of any parent directories that are older than
2838 * the last committed transaction
2839 */
2840int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2841 struct btrfs_root *root, struct dentry *dentry)
2842{
2843 int inode_only = LOG_INODE_ALL;
2844 struct super_block *sb;
2845 int ret;
2846
2847 start_log_trans(trans, root);
2848 sb = dentry->d_inode->i_sb;
2849 while (1) {
2850 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2851 inode_only);
2852 BUG_ON(ret);
2853 inode_only = LOG_INODE_EXISTS;
2854
2855 dentry = dentry->d_parent;
2856 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2857 break;
2858
2859 if (BTRFS_I(dentry->d_inode)->generation <=
2860 root->fs_info->last_trans_committed)
2861 break;
2862 }
2863 end_log_trans(root);
2864 return 0;
2865}
2866
2867/*
2868 * it is not safe to log dentry if the chunk root has added new
2869 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2870 * If this returns 1, you must commit the transaction to safely get your
2871 * data on disk.
2872 */
2873int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2874 struct btrfs_root *root, struct dentry *dentry)
2875{
2876 u64 gen;
2877 gen = root->fs_info->last_trans_new_blockgroup;
2878 if (gen > root->fs_info->last_trans_committed)
2879 return 1;
2880 else
2881 return btrfs_log_dentry(trans, root, dentry);
2882}
2883
2884/*
2885 * should be called during mount to recover any replay any log trees
2886 * from the FS
2887 */
2888int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2889{
2890 int ret;
2891 struct btrfs_path *path;
2892 struct btrfs_trans_handle *trans;
2893 struct btrfs_key key;
2894 struct btrfs_key found_key;
2895 struct btrfs_key tmp_key;
2896 struct btrfs_root *log;
2897 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2898 u64 highest_inode;
2899 struct walk_control wc = {
2900 .process_func = process_one_buffer,
2901 .stage = 0,
2902 };
2903
2904 fs_info->log_root_recovering = 1;
2905 path = btrfs_alloc_path();
2906 BUG_ON(!path);
2907
2908 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2909
2910 wc.trans = trans;
2911 wc.pin = 1;
2912
2913 walk_log_tree(trans, log_root_tree, &wc);
2914
2915again:
2916 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2917 key.offset = (u64)-1;
2918 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2919
2920 while (1) {
2921 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2922 if (ret < 0)
2923 break;
2924 if (ret > 0) {
2925 if (path->slots[0] == 0)
2926 break;
2927 path->slots[0]--;
2928 }
2929 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2930 path->slots[0]);
2931 btrfs_release_path(log_root_tree, path);
2932 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2933 break;
2934
2935 log = btrfs_read_fs_root_no_radix(log_root_tree,
2936 &found_key);
2937 BUG_ON(!log);
2938
2939
2940 tmp_key.objectid = found_key.offset;
2941 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2942 tmp_key.offset = (u64)-1;
2943
2944 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2945
2946 BUG_ON(!wc.replay_dest);
2947
2948 btrfs_record_root_in_trans(wc.replay_dest);
2949 ret = walk_log_tree(trans, log, &wc);
2950 BUG_ON(ret);
2951
2952 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2953 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2954 path);
2955 BUG_ON(ret);
2956 }
2957 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2958 if (ret == 0) {
2959 wc.replay_dest->highest_inode = highest_inode;
2960 wc.replay_dest->last_inode_alloc = highest_inode;
2961 }
2962
2963 key.offset = found_key.offset - 1;
2964 free_extent_buffer(log->node);
2965 kfree(log);
2966
2967 if (found_key.offset == 0)
2968 break;
2969 }
2970 btrfs_release_path(log_root_tree, path);
2971
2972 /* step one is to pin it all, step two is to replay just inodes */
2973 if (wc.pin) {
2974 wc.pin = 0;
2975 wc.process_func = replay_one_buffer;
2976 wc.stage = LOG_WALK_REPLAY_INODES;
2977 goto again;
2978 }
2979 /* step three is to replay everything */
2980 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2981 wc.stage++;
2982 goto again;
2983 }
2984
2985 btrfs_free_path(path);
2986
2987 free_extent_buffer(log_root_tree->node);
2988 log_root_tree->log_root = NULL;
2989 fs_info->log_root_recovering = 0;
2990
2991 /* step 4: commit the transaction, which also unpins the blocks */
2992 btrfs_commit_transaction(trans, fs_info->tree_root);
2993
2994 kfree(log_root_tree);
2995 return 0;
2996}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
/*
 * Default build version string.  version.sh writes a header defining
 * BTRFS_BUILD_VERSION as "Btrfs <version>" and moves it over this file
 * when the two differ.
 */
#ifndef __BTRFS_VERSION_H
#define __BTRFS_VERSION_H
#define BTRFS_BUILD_VERSION "Btrfs"
#endif /* __BTRFS_VERSION_H */
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
#!/bin/bash
#
# determine-version -- report a useful version for releases
#
# Copyright 2008, Aron Griffis <agriffis@n01se.net>
# Copyright 2008, Oracle
# Released under the GNU GPLv2

# Fallback version, used when git (or a git checkout) is not available.
v="v0.16"

# Only consult git when the tool exists and we are inside a repository.
if which git &> /dev/null && git branch >& /dev/null; then
	if head=$(git rev-parse --verify HEAD 2>/dev/null); then
		# Prefer the nearest tag description over the fallback.
		if tag=$(git describe --tags 2>/dev/null); then
			v="$tag"
		fi

		# Are there uncommitted changes?
		git update-index --refresh --unmerged > /dev/null
		if git diff-index --name-only HEAD |
			grep -v "^scripts/package" |
			read dummy; then
			v="$v"-dirty
		fi
	fi
fi

# Generate the candidate header in a scratch file.
{
	echo "#ifndef __BUILD_VERSION"
	echo "#define __BUILD_VERSION"
	echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\""
	echo "#endif"
} > .build-version.h

# Leave version.h untouched when nothing changed.
if diff -q version.h .build-version.h >& /dev/null; then
	rm .build-version.h
	exit 0
fi

mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b187b537888e
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
34struct map_lookup {
35 u64 type;
36 int io_align;
37 int io_width;
38 int stripe_len;
39 int sector_size;
40 int num_stripes;
41 int sub_stripes;
42 struct btrfs_bio_stripe stripes[];
43};
44
/* Prototypes for static helpers defined elsewhere in this file. */
static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

/* Bytes needed for a struct map_lookup followed by n stripe entries. */
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))

/*
 * fs_uuids links every known btrfs_fs_devices through its ->list member
 * (see find_fsid() and device_list_add()).  uuid_mutex is taken by the
 * scan/open/close paths in this file and exposed to other files through
 * btrfs_lock_volumes() / btrfs_unlock_volumes().
 */
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
55
56void btrfs_lock_volumes(void)
57{
58 mutex_lock(&uuid_mutex);
59}
60
61void btrfs_unlock_volumes(void)
62{
63 mutex_unlock(&uuid_mutex);
64}
65
66static void lock_chunks(struct btrfs_root *root)
67{
68 mutex_lock(&root->fs_info->chunk_mutex);
69}
70
71static void unlock_chunks(struct btrfs_root *root)
72{
73 mutex_unlock(&root->fs_info->chunk_mutex);
74}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
132/*
133 * we try to collect pending bios for a device so we don't get a large
134 * number of procs sending bios down to the same device. This greatly
135 * improves the schedulers ability to collect and merge the bios.
136 *
137 * But, it also turns into a long list of bios to process and that is sure
138 * to eventually make the worker thread block. The solution here is to
139 * make some progress and then put this work struct back at the end of
140 * the list if the block device is congested. This way, multiple devices
141 * can make progress from a single worker thread.
142 */
143static noinline int run_scheduled_bios(struct btrfs_device *device)
144{
145 struct bio *pending;
146 struct backing_dev_info *bdi;
147 struct btrfs_fs_info *fs_info;
148 struct bio *tail;
149 struct bio *cur;
150 int again = 0;
151 unsigned long num_run = 0;
152 unsigned long limit;
153
154 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
155 fs_info = device->dev_root->fs_info;
156 limit = btrfs_async_submit_limit(fs_info);
157 limit = limit * 2 / 3;
158
159loop:
160 spin_lock(&device->io_lock);
161
162 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted
165 * into the list if we hit congestion
166 */
167 pending = device->pending_bios;
168 tail = device->pending_bio_tail;
169 WARN_ON(pending && !tail);
170 device->pending_bios = NULL;
171 device->pending_bio_tail = NULL;
172
173 /*
174 * if pending was null this time around, no bios need processing
175 * at all and we can stop. Otherwise it'll loop back up again
176 * and do an additional check so no bios are missed.
177 *
178 * device->running_pending is used to synchronize with the
179 * schedule_bio code.
180 */
181 if (pending) {
182 again = 1;
183 device->running_pending = 1;
184 } else {
185 again = 0;
186 device->running_pending = 0;
187 }
188 spin_unlock(&device->io_lock);
189
190 while (pending) {
191 cur = pending;
192 pending = pending->bi_next;
193 cur->bi_next = NULL;
194 atomic_dec(&fs_info->nr_async_bios);
195
196 if (atomic_read(&fs_info->nr_async_bios) < limit &&
197 waitqueue_active(&fs_info->async_submit_wait))
198 wake_up(&fs_info->async_submit_wait);
199
200 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
201 bio_get(cur);
202 submit_bio(cur->bi_rw, cur);
203 bio_put(cur);
204 num_run++;
205
206 /*
207 * we made progress, there is more work to do and the bdi
208 * is now congested. Back off and let other work structs
209 * run instead
210 */
211 if (pending && bdi_write_congested(bdi) &&
212 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head;
214
215 spin_lock(&device->io_lock);
216
217 old_head = device->pending_bios;
218 device->pending_bios = pending;
219 if (device->pending_bio_tail)
220 tail->bi_next = old_head;
221 else
222 device->pending_bio_tail = tail;
223
224 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work);
226 goto done;
227 }
228 }
229 if (again)
230 goto loop;
231done:
232 return 0;
233}
234
235static void pending_bios_fn(struct btrfs_work *work)
236{
237 struct btrfs_device *device;
238
239 device = container_of(work, struct btrfs_device, work);
240 run_scheduled_bios(device);
241}
242
243static noinline int device_list_add(const char *path,
244 struct btrfs_super_block *disk_super,
245 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
246{
247 struct btrfs_device *device;
248 struct btrfs_fs_devices *fs_devices;
249 u64 found_transid = btrfs_super_generation(disk_super);
250
251 fs_devices = find_fsid(disk_super->fsid);
252 if (!fs_devices) {
253 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
254 if (!fs_devices)
255 return -ENOMEM;
256 INIT_LIST_HEAD(&fs_devices->devices);
257 INIT_LIST_HEAD(&fs_devices->alloc_list);
258 list_add(&fs_devices->list, &fs_uuids);
259 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
260 fs_devices->latest_devid = devid;
261 fs_devices->latest_trans = found_transid;
262 device = NULL;
263 } else {
264 device = __find_device(&fs_devices->devices, devid,
265 disk_super->dev_item.uuid);
266 }
267 if (!device) {
268 if (fs_devices->opened)
269 return -EBUSY;
270
271 device = kzalloc(sizeof(*device), GFP_NOFS);
272 if (!device) {
273 /* we can safely leave the fs_devices entry around */
274 return -ENOMEM;
275 }
276 device->devid = devid;
277 device->work.func = pending_bios_fn;
278 memcpy(device->uuid, disk_super->dev_item.uuid,
279 BTRFS_UUID_SIZE);
280 device->barriers = 1;
281 spin_lock_init(&device->io_lock);
282 device->name = kstrdup(path, GFP_NOFS);
283 if (!device->name) {
284 kfree(device);
285 return -ENOMEM;
286 }
287 INIT_LIST_HEAD(&device->dev_alloc_list);
288 list_add(&device->dev_list, &fs_devices->devices);
289 device->fs_devices = fs_devices;
290 fs_devices->num_devices++;
291 }
292
293 if (found_transid > fs_devices->latest_trans) {
294 fs_devices->latest_devid = devid;
295 fs_devices->latest_trans = found_transid;
296 }
297 *fs_devices_ret = fs_devices;
298 return 0;
299}
300
301static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
302{
303 struct btrfs_fs_devices *fs_devices;
304 struct btrfs_device *device;
305 struct btrfs_device *orig_dev;
306
307 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
308 if (!fs_devices)
309 return ERR_PTR(-ENOMEM);
310
311 INIT_LIST_HEAD(&fs_devices->devices);
312 INIT_LIST_HEAD(&fs_devices->alloc_list);
313 INIT_LIST_HEAD(&fs_devices->list);
314 fs_devices->latest_devid = orig->latest_devid;
315 fs_devices->latest_trans = orig->latest_trans;
316 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
317
318 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
319 device = kzalloc(sizeof(*device), GFP_NOFS);
320 if (!device)
321 goto error;
322
323 device->name = kstrdup(orig_dev->name, GFP_NOFS);
324 if (!device->name)
325 goto error;
326
327 device->devid = orig_dev->devid;
328 device->work.func = pending_bios_fn;
329 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
330 device->barriers = 1;
331 spin_lock_init(&device->io_lock);
332 INIT_LIST_HEAD(&device->dev_list);
333 INIT_LIST_HEAD(&device->dev_alloc_list);
334
335 list_add(&device->dev_list, &fs_devices->devices);
336 device->fs_devices = fs_devices;
337 fs_devices->num_devices++;
338 }
339 return fs_devices;
340error:
341 free_fs_devices(fs_devices);
342 return ERR_PTR(-ENOMEM);
343}
344
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{
347 struct list_head *tmp;
348 struct list_head *cur;
349 struct btrfs_device *device;
350
351 mutex_lock(&uuid_mutex);
352again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata)
356 continue;
357
358 if (device->bdev) {
359 close_bdev_exclusive(device->bdev, device->mode);
360 device->bdev = NULL;
361 fs_devices->open_devices--;
362 }
363 if (device->writeable) {
364 list_del_init(&device->dev_alloc_list);
365 device->writeable = 0;
366 fs_devices->rw_devices--;
367 }
368 list_del_init(&device->dev_list);
369 fs_devices->num_devices--;
370 kfree(device->name);
371 kfree(device);
372 }
373
374 if (fs_devices->seed) {
375 fs_devices = fs_devices->seed;
376 goto again;
377 }
378
379 mutex_unlock(&uuid_mutex);
380 return 0;
381}
382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{
385 struct list_head *cur;
386 struct btrfs_device *device;
387
388 if (--fs_devices->opened > 0)
389 return 0;
390
391 list_for_each(cur, &fs_devices->devices) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--;
396 }
397 if (device->writeable) {
398 list_del_init(&device->dev_alloc_list);
399 fs_devices->rw_devices--;
400 }
401
402 device->bdev = NULL;
403 device->writeable = 0;
404 device->in_fs_metadata = 0;
405 }
406 WARN_ON(fs_devices->open_devices);
407 WARN_ON(fs_devices->rw_devices);
408 fs_devices->opened = 0;
409 fs_devices->seeding = 0;
410
411 return 0;
412}
413
414int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
415{
416 struct btrfs_fs_devices *seed_devices = NULL;
417 int ret;
418
419 mutex_lock(&uuid_mutex);
420 ret = __btrfs_close_devices(fs_devices);
421 if (!fs_devices->opened) {
422 seed_devices = fs_devices->seed;
423 fs_devices->seed = NULL;
424 }
425 mutex_unlock(&uuid_mutex);
426
427 while (seed_devices) {
428 fs_devices = seed_devices;
429 seed_devices = fs_devices->seed;
430 __btrfs_close_devices(fs_devices);
431 free_fs_devices(fs_devices);
432 }
433 return ret;
434}
435
436static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
437 fmode_t flags, void *holder)
438{
439 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh;
445 struct btrfs_super_block *disk_super;
446 u64 latest_devid = 0;
447 u64 latest_transid = 0;
448 u64 devid;
449 int seeding = 1;
450 int ret = 0;
451
452 list_for_each(cur, head) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev)
455 continue;
456 if (!device->name)
457 continue;
458
459 bdev = open_bdev_exclusive(device->name, flags, holder);
460 if (IS_ERR(bdev)) {
461 printk(KERN_INFO "open %s failed\n", device->name);
462 goto error;
463 }
464 set_blocksize(bdev, 4096);
465
466 bh = btrfs_read_dev_super(bdev);
467 if (!bh)
468 goto error_close;
469
470 disk_super = (struct btrfs_super_block *)bh->b_data;
471 devid = le64_to_cpu(disk_super->dev_item.devid);
472 if (devid != device->devid)
473 goto error_brelse;
474
475 if (memcmp(device->uuid, disk_super->dev_item.uuid,
476 BTRFS_UUID_SIZE))
477 goto error_brelse;
478
479 device->generation = btrfs_super_generation(disk_super);
480 if (!latest_transid || device->generation > latest_transid) {
481 latest_devid = devid;
482 latest_transid = device->generation;
483 latest_bdev = bdev;
484 }
485
486 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
487 device->writeable = 0;
488 } else {
489 device->writeable = !bdev_read_only(bdev);
490 seeding = 0;
491 }
492
493 device->bdev = bdev;
494 device->in_fs_metadata = 0;
495 device->mode = flags;
496
497 fs_devices->open_devices++;
498 if (device->writeable) {
499 fs_devices->rw_devices++;
500 list_add(&device->dev_alloc_list,
501 &fs_devices->alloc_list);
502 }
503 continue;
504
505error_brelse:
506 brelse(bh);
507error_close:
508 close_bdev_exclusive(bdev, FMODE_READ);
509error:
510 continue;
511 }
512 if (fs_devices->open_devices == 0) {
513 ret = -EIO;
514 goto out;
515 }
516 fs_devices->seeding = seeding;
517 fs_devices->opened = 1;
518 fs_devices->latest_bdev = latest_bdev;
519 fs_devices->latest_devid = latest_devid;
520 fs_devices->latest_trans = latest_transid;
521 fs_devices->total_rw_bytes = 0;
522out:
523 return ret;
524}
525
526int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
527 fmode_t flags, void *holder)
528{
529 int ret;
530
531 mutex_lock(&uuid_mutex);
532 if (fs_devices->opened) {
533 fs_devices->opened++;
534 ret = 0;
535 } else {
536 ret = __btrfs_open_devices(fs_devices, flags, holder);
537 }
538 mutex_unlock(&uuid_mutex);
539 return ret;
540}
541
542int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
543 struct btrfs_fs_devices **fs_devices_ret)
544{
545 struct btrfs_super_block *disk_super;
546 struct block_device *bdev;
547 struct buffer_head *bh;
548 int ret;
549 u64 devid;
550 u64 transid;
551
552 mutex_lock(&uuid_mutex);
553
554 bdev = open_bdev_exclusive(path, flags, holder);
555
556 if (IS_ERR(bdev)) {
557 ret = PTR_ERR(bdev);
558 goto error;
559 }
560
561 ret = set_blocksize(bdev, 4096);
562 if (ret)
563 goto error_close;
564 bh = btrfs_read_dev_super(bdev);
565 if (!bh) {
566 ret = -EIO;
567 goto error_close;
568 }
569 disk_super = (struct btrfs_super_block *)bh->b_data;
570 devid = le64_to_cpu(disk_super->dev_item.devid);
571 transid = btrfs_super_generation(disk_super);
572 if (disk_super->label[0])
573 printk(KERN_INFO "device label %s ", disk_super->label);
574 else {
575 /* FIXME, make a readl uuid parser */
576 printk(KERN_INFO "device fsid %llx-%llx ",
577 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8));
579 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583
584 brelse(bh);
585error_close:
586 close_bdev_exclusive(bdev, flags);
587error:
588 mutex_unlock(&uuid_mutex);
589 return ret;
590}
591
592/*
593 * this uses a pretty simple search, the expectation is that it is
594 * called very infrequently and that a given device has a small number
595 * of extents
596 */
597static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
598 struct btrfs_device *device,
599 u64 num_bytes, u64 *start)
600{
601 struct btrfs_key key;
602 struct btrfs_root *root = device->dev_root;
603 struct btrfs_dev_extent *dev_extent = NULL;
604 struct btrfs_path *path;
605 u64 hole_size = 0;
606 u64 last_byte = 0;
607 u64 search_start = 0;
608 u64 search_end = device->total_bytes;
609 int ret;
610 int slot = 0;
611 int start_found;
612 struct extent_buffer *l;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617 path->reada = 2;
618 start_found = 0;
619
620 /* FIXME use last free of some kind */
621
622 /* we don't want to overwrite the superblock on the drive,
623 * so we make sure to start at an offset of at least 1MB
624 */
625 search_start = max((u64)1024 * 1024, search_start);
626
627 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
628 search_start = max(root->fs_info->alloc_start, search_start);
629
630 key.objectid = device->devid;
631 key.offset = search_start;
632 key.type = BTRFS_DEV_EXTENT_KEY;
633 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
634 if (ret < 0)
635 goto error;
636 ret = btrfs_previous_item(root, path, 0, key.type);
637 if (ret < 0)
638 goto error;
639 l = path->nodes[0];
640 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
641 while (1) {
642 l = path->nodes[0];
643 slot = path->slots[0];
644 if (slot >= btrfs_header_nritems(l)) {
645 ret = btrfs_next_leaf(root, path);
646 if (ret == 0)
647 continue;
648 if (ret < 0)
649 goto error;
650no_more_items:
651 if (!start_found) {
652 if (search_start >= search_end) {
653 ret = -ENOSPC;
654 goto error;
655 }
656 *start = search_start;
657 start_found = 1;
658 goto check_pending;
659 }
660 *start = last_byte > search_start ?
661 last_byte : search_start;
662 if (search_end <= *start) {
663 ret = -ENOSPC;
664 goto error;
665 }
666 goto check_pending;
667 }
668 btrfs_item_key_to_cpu(l, &key, slot);
669
670 if (key.objectid < device->devid)
671 goto next;
672
673 if (key.objectid > device->devid)
674 goto no_more_items;
675
676 if (key.offset >= search_start && key.offset > last_byte &&
677 start_found) {
678 if (last_byte < search_start)
679 last_byte = search_start;
680 hole_size = key.offset - last_byte;
681 if (key.offset > last_byte &&
682 hole_size >= num_bytes) {
683 *start = last_byte;
684 goto check_pending;
685 }
686 }
687 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
688 goto next;
689
690 start_found = 1;
691 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
692 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
693next:
694 path->slots[0]++;
695 cond_resched();
696 }
697check_pending:
698 /* we have to make sure we didn't find an extent that has already
699 * been allocated by the map tree or the original allocation
700 */
701 BUG_ON(*start < search_start);
702
703 if (*start + num_bytes > search_end) {
704 ret = -ENOSPC;
705 goto error;
706 }
707 /* check for pending inserts here */
708 ret = 0;
709
710error:
711 btrfs_free_path(path);
712 return ret;
713}
714
715static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
716 struct btrfs_device *device,
717 u64 start)
718{
719 int ret;
720 struct btrfs_path *path;
721 struct btrfs_root *root = device->dev_root;
722 struct btrfs_key key;
723 struct btrfs_key found_key;
724 struct extent_buffer *leaf = NULL;
725 struct btrfs_dev_extent *extent = NULL;
726
727 path = btrfs_alloc_path();
728 if (!path)
729 return -ENOMEM;
730
731 key.objectid = device->devid;
732 key.offset = start;
733 key.type = BTRFS_DEV_EXTENT_KEY;
734
735 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid,
738 BTRFS_DEV_EXTENT_KEY);
739 BUG_ON(ret);
740 leaf = path->nodes[0];
741 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
742 extent = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_dev_extent);
744 BUG_ON(found_key.offset > start || found_key.offset +
745 btrfs_dev_extent_length(leaf, extent) < start);
746 ret = 0;
747 } else if (ret == 0) {
748 leaf = path->nodes[0];
749 extent = btrfs_item_ptr(leaf, path->slots[0],
750 struct btrfs_dev_extent);
751 }
752 BUG_ON(ret);
753
754 if (device->bytes_used > 0)
755 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
756 ret = btrfs_del_item(trans, root, path);
757 BUG_ON(ret);
758
759 btrfs_free_path(path);
760 return ret;
761}
762
763int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
764 struct btrfs_device *device,
765 u64 chunk_tree, u64 chunk_objectid,
766 u64 chunk_offset, u64 start, u64 num_bytes)
767{
768 int ret;
769 struct btrfs_path *path;
770 struct btrfs_root *root = device->dev_root;
771 struct btrfs_dev_extent *extent;
772 struct extent_buffer *leaf;
773 struct btrfs_key key;
774
775 WARN_ON(!device->in_fs_metadata);
776 path = btrfs_alloc_path();
777 if (!path)
778 return -ENOMEM;
779
780 key.objectid = device->devid;
781 key.offset = start;
782 key.type = BTRFS_DEV_EXTENT_KEY;
783 ret = btrfs_insert_empty_item(trans, root, path, &key,
784 sizeof(*extent));
785 BUG_ON(ret);
786
787 leaf = path->nodes[0];
788 extent = btrfs_item_ptr(leaf, path->slots[0],
789 struct btrfs_dev_extent);
790 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
791 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
792 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
793
794 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
795 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
796 BTRFS_UUID_SIZE);
797
798 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
799 btrfs_mark_buffer_dirty(leaf);
800 btrfs_free_path(path);
801 return ret;
802}
803
804static noinline int find_next_chunk(struct btrfs_root *root,
805 u64 objectid, u64 *offset)
806{
807 struct btrfs_path *path;
808 int ret;
809 struct btrfs_key key;
810 struct btrfs_chunk *chunk;
811 struct btrfs_key found_key;
812
813 path = btrfs_alloc_path();
814 BUG_ON(!path);
815
816 key.objectid = objectid;
817 key.offset = (u64)-1;
818 key.type = BTRFS_CHUNK_ITEM_KEY;
819
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret < 0)
822 goto error;
823
824 BUG_ON(ret == 0);
825
826 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
827 if (ret) {
828 *offset = 0;
829 } else {
830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
831 path->slots[0]);
832 if (found_key.objectid != objectid)
833 *offset = 0;
834 else {
835 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
836 struct btrfs_chunk);
837 *offset = found_key.offset +
838 btrfs_chunk_length(path->nodes[0], chunk);
839 }
840 }
841 ret = 0;
842error:
843 btrfs_free_path(path);
844 return ret;
845}
846
847static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
848{
849 int ret;
850 struct btrfs_key key;
851 struct btrfs_key found_key;
852 struct btrfs_path *path;
853
854 root = root->fs_info->chunk_root;
855
856 path = btrfs_alloc_path();
857 if (!path)
858 return -ENOMEM;
859
860 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
861 key.type = BTRFS_DEV_ITEM_KEY;
862 key.offset = (u64)-1;
863
864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
865 if (ret < 0)
866 goto error;
867
868 BUG_ON(ret == 0);
869
870 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
871 BTRFS_DEV_ITEM_KEY);
872 if (ret) {
873 *objectid = 1;
874 } else {
875 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
876 path->slots[0]);
877 *objectid = found_key.offset + 1;
878 }
879 ret = 0;
880error:
881 btrfs_free_path(path);
882 return ret;
883}
884
885/*
886 * the device information is stored in the chunk root
887 * the btrfs_device struct should be fully filled in
888 */
889int btrfs_add_device(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root,
891 struct btrfs_device *device)
892{
893 int ret;
894 struct btrfs_path *path;
895 struct btrfs_dev_item *dev_item;
896 struct extent_buffer *leaf;
897 struct btrfs_key key;
898 unsigned long ptr;
899
900 root = root->fs_info->chunk_root;
901
902 path = btrfs_alloc_path();
903 if (!path)
904 return -ENOMEM;
905
906 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
907 key.type = BTRFS_DEV_ITEM_KEY;
908 key.offset = device->devid;
909
910 ret = btrfs_insert_empty_item(trans, root, path, &key,
911 sizeof(*dev_item));
912 if (ret)
913 goto out;
914
915 leaf = path->nodes[0];
916 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
917
918 btrfs_set_device_id(leaf, dev_item, device->devid);
919 btrfs_set_device_generation(leaf, dev_item, 0);
920 btrfs_set_device_type(leaf, dev_item, device->type);
921 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
922 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
923 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
924 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
925 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
926 btrfs_set_device_group(leaf, dev_item, 0);
927 btrfs_set_device_seek_speed(leaf, dev_item, 0);
928 btrfs_set_device_bandwidth(leaf, dev_item, 0);
929 btrfs_set_device_start_offset(leaf, dev_item, 0);
930
931 ptr = (unsigned long)btrfs_device_uuid(dev_item);
932 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
933 ptr = (unsigned long)btrfs_device_fsid(dev_item);
934 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
935 btrfs_mark_buffer_dirty(leaf);
936
937 ret = 0;
938out:
939 btrfs_free_path(path);
940 return ret;
941}
942
943static int btrfs_rm_dev_item(struct btrfs_root *root,
944 struct btrfs_device *device)
945{
946 int ret;
947 struct btrfs_path *path;
948 struct btrfs_key key;
949 struct btrfs_trans_handle *trans;
950
951 root = root->fs_info->chunk_root;
952
953 path = btrfs_alloc_path();
954 if (!path)
955 return -ENOMEM;
956
957 trans = btrfs_start_transaction(root, 1);
958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
959 key.type = BTRFS_DEV_ITEM_KEY;
960 key.offset = device->devid;
961 lock_chunks(root);
962
963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
964 if (ret < 0)
965 goto out;
966
967 if (ret > 0) {
968 ret = -ENOENT;
969 goto out;
970 }
971
972 ret = btrfs_del_item(trans, root, path);
973 if (ret)
974 goto out;
975out:
976 btrfs_free_path(path);
977 unlock_chunks(root);
978 btrfs_commit_transaction(trans, root);
979 return ret;
980}
981
982int btrfs_rm_device(struct btrfs_root *root, char *device_path)
983{
984 struct btrfs_device *device;
985 struct btrfs_device *next_device;
986 struct block_device *bdev;
987 struct buffer_head *bh = NULL;
988 struct btrfs_super_block *disk_super;
989 u64 all_avail;
990 u64 devid;
991 u64 num_devices;
992 u8 *dev_uuid;
993 int ret = 0;
994
995 mutex_lock(&uuid_mutex);
996 mutex_lock(&root->fs_info->volume_mutex);
997
998 all_avail = root->fs_info->avail_data_alloc_bits |
999 root->fs_info->avail_system_alloc_bits |
1000 root->fs_info->avail_metadata_alloc_bits;
1001
1002 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1003 root->fs_info->fs_devices->rw_devices <= 4) {
1004 printk(KERN_ERR "btrfs: unable to go below four devices "
1005 "on raid10\n");
1006 ret = -EINVAL;
1007 goto out;
1008 }
1009
1010 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1011 root->fs_info->fs_devices->rw_devices <= 2) {
1012 printk(KERN_ERR "btrfs: unable to go below two "
1013 "devices on raid1\n");
1014 ret = -EINVAL;
1015 goto out;
1016 }
1017
1018 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices;
1021 struct btrfs_device *tmp;
1022
1023 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp;
1029 break;
1030 }
1031 }
1032 bdev = NULL;
1033 bh = NULL;
1034 disk_super = NULL;
1035 if (!device) {
1036 printk(KERN_ERR "btrfs: no missing devices found to "
1037 "remove\n");
1038 goto out;
1039 }
1040 } else {
1041 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1042 root->fs_info->bdev_holder);
1043 if (IS_ERR(bdev)) {
1044 ret = PTR_ERR(bdev);
1045 goto out;
1046 }
1047
1048 set_blocksize(bdev, 4096);
1049 bh = btrfs_read_dev_super(bdev);
1050 if (!bh) {
1051 ret = -EIO;
1052 goto error_close;
1053 }
1054 disk_super = (struct btrfs_super_block *)bh->b_data;
1055 devid = le64_to_cpu(disk_super->dev_item.devid);
1056 dev_uuid = disk_super->dev_item.uuid;
1057 device = btrfs_find_device(root, devid, dev_uuid,
1058 disk_super->fsid);
1059 if (!device) {
1060 ret = -ENOENT;
1061 goto error_brelse;
1062 }
1063 }
1064
1065 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1066 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1067 "device\n");
1068 ret = -EINVAL;
1069 goto error_brelse;
1070 }
1071
1072 if (device->writeable) {
1073 list_del_init(&device->dev_alloc_list);
1074 root->fs_info->fs_devices->rw_devices--;
1075 }
1076
1077 ret = btrfs_shrink_device(device, 0);
1078 if (ret)
1079 goto error_brelse;
1080
1081 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1082 if (ret)
1083 goto error_brelse;
1084
1085 device->in_fs_metadata = 0;
1086 list_del_init(&device->dev_list);
1087 device->fs_devices->num_devices--;
1088
1089 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1090 struct btrfs_device, dev_list);
1091 if (device->bdev == root->fs_info->sb->s_bdev)
1092 root->fs_info->sb->s_bdev = next_device->bdev;
1093 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1094 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1095
1096 if (device->bdev) {
1097 close_bdev_exclusive(device->bdev, device->mode);
1098 device->bdev = NULL;
1099 device->fs_devices->open_devices--;
1100 }
1101
1102 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1103 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1104
1105 if (device->fs_devices->open_devices == 0) {
1106 struct btrfs_fs_devices *fs_devices;
1107 fs_devices = root->fs_info->fs_devices;
1108 while (fs_devices) {
1109 if (fs_devices->seed == device->fs_devices)
1110 break;
1111 fs_devices = fs_devices->seed;
1112 }
1113 fs_devices->seed = device->fs_devices->seed;
1114 device->fs_devices->seed = NULL;
1115 __btrfs_close_devices(device->fs_devices);
1116 free_fs_devices(device->fs_devices);
1117 }
1118
1119 /*
1120 * at this point, the device is zero sized. We want to
1121 * remove it from the devices list and zero out the old super
1122 */
1123 if (device->writeable) {
1124 /* make sure this device isn't detected as part of
1125 * the FS anymore
1126 */
1127 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1128 set_buffer_dirty(bh);
1129 sync_dirty_buffer(bh);
1130 }
1131
1132 kfree(device->name);
1133 kfree(device);
1134 ret = 0;
1135
1136error_brelse:
1137 brelse(bh);
1138error_close:
1139 if (bdev)
1140 close_bdev_exclusive(bdev, FMODE_READ);
1141out:
1142 mutex_unlock(&root->fs_info->volume_mutex);
1143 mutex_unlock(&uuid_mutex);
1144 return ret;
1145}
1146
1147/*
1148 * does all the dirty work required for changing file system's UUID.
1149 */
1150static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root)
1152{
1153 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1154 struct btrfs_fs_devices *old_devices;
1155 struct btrfs_fs_devices *seed_devices;
1156 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1157 struct btrfs_device *device;
1158 u64 super_flags;
1159
1160 BUG_ON(!mutex_is_locked(&uuid_mutex));
1161 if (!fs_devices->seeding)
1162 return -EINVAL;
1163
1164 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1165 if (!seed_devices)
1166 return -ENOMEM;
1167
1168 old_devices = clone_fs_devices(fs_devices);
1169 if (IS_ERR(old_devices)) {
1170 kfree(seed_devices);
1171 return PTR_ERR(old_devices);
1172 }
1173
1174 list_add(&old_devices->list, &fs_uuids);
1175
1176 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1177 seed_devices->opened = 1;
1178 INIT_LIST_HEAD(&seed_devices->devices);
1179 INIT_LIST_HEAD(&seed_devices->alloc_list);
1180 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1181 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1182 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1183 device->fs_devices = seed_devices;
1184 }
1185
1186 fs_devices->seeding = 0;
1187 fs_devices->num_devices = 0;
1188 fs_devices->open_devices = 0;
1189 fs_devices->seed = seed_devices;
1190
1191 generate_random_uuid(fs_devices->fsid);
1192 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1193 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 super_flags = btrfs_super_flags(disk_super) &
1195 ~BTRFS_SUPER_FLAG_SEEDING;
1196 btrfs_set_super_flags(disk_super, super_flags);
1197
1198 return 0;
1199}
1200
1201/*
1202 * strore the expected generation for seed devices in device items.
1203 */
1204static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1205 struct btrfs_root *root)
1206{
1207 struct btrfs_path *path;
1208 struct extent_buffer *leaf;
1209 struct btrfs_dev_item *dev_item;
1210 struct btrfs_device *device;
1211 struct btrfs_key key;
1212 u8 fs_uuid[BTRFS_UUID_SIZE];
1213 u8 dev_uuid[BTRFS_UUID_SIZE];
1214 u64 devid;
1215 int ret;
1216
1217 path = btrfs_alloc_path();
1218 if (!path)
1219 return -ENOMEM;
1220
1221 root = root->fs_info->chunk_root;
1222 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1223 key.offset = 0;
1224 key.type = BTRFS_DEV_ITEM_KEY;
1225
1226 while (1) {
1227 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1228 if (ret < 0)
1229 goto error;
1230
1231 leaf = path->nodes[0];
1232next_slot:
1233 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1234 ret = btrfs_next_leaf(root, path);
1235 if (ret > 0)
1236 break;
1237 if (ret < 0)
1238 goto error;
1239 leaf = path->nodes[0];
1240 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1241 btrfs_release_path(root, path);
1242 continue;
1243 }
1244
1245 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1246 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1247 key.type != BTRFS_DEV_ITEM_KEY)
1248 break;
1249
1250 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1251 struct btrfs_dev_item);
1252 devid = btrfs_device_id(leaf, dev_item);
1253 read_extent_buffer(leaf, dev_uuid,
1254 (unsigned long)btrfs_device_uuid(dev_item),
1255 BTRFS_UUID_SIZE);
1256 read_extent_buffer(leaf, fs_uuid,
1257 (unsigned long)btrfs_device_fsid(dev_item),
1258 BTRFS_UUID_SIZE);
1259 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1260 BUG_ON(!device);
1261
1262 if (device->fs_devices->seeding) {
1263 btrfs_set_device_generation(leaf, dev_item,
1264 device->generation);
1265 btrfs_mark_buffer_dirty(leaf);
1266 }
1267
1268 path->slots[0]++;
1269 goto next_slot;
1270 }
1271 ret = 0;
1272error:
1273 btrfs_free_path(path);
1274 return ret;
1275}
1276
1277int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1278{
1279 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device;
1281 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes;
1286 int seeding_dev = 0;
1287 int ret = 0;
1288
1289 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1290 return -EINVAL;
1291
1292 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1293 if (!bdev)
1294 return -EIO;
1295
1296 if (root->fs_info->fs_devices->seeding) {
1297 seeding_dev = 1;
1298 down_write(&sb->s_umount);
1299 mutex_lock(&uuid_mutex);
1300 }
1301
1302 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1303 mutex_lock(&root->fs_info->volume_mutex);
1304
1305 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) {
1309 ret = -EEXIST;
1310 goto error;
1311 }
1312 }
1313
1314 device = kzalloc(sizeof(*device), GFP_NOFS);
1315 if (!device) {
1316 /* we can safely leave the fs_devices entry around */
1317 ret = -ENOMEM;
1318 goto error;
1319 }
1320
1321 device->name = kstrdup(device_path, GFP_NOFS);
1322 if (!device->name) {
1323 kfree(device);
1324 ret = -ENOMEM;
1325 goto error;
1326 }
1327
1328 ret = find_next_devid(root, &device->devid);
1329 if (ret) {
1330 kfree(device);
1331 goto error;
1332 }
1333
1334 trans = btrfs_start_transaction(root, 1);
1335 lock_chunks(root);
1336
1337 device->barriers = 1;
1338 device->writeable = 1;
1339 device->work.func = pending_bios_fn;
1340 generate_random_uuid(device->uuid);
1341 spin_lock_init(&device->io_lock);
1342 device->generation = trans->transid;
1343 device->io_width = root->sectorsize;
1344 device->io_align = root->sectorsize;
1345 device->sector_size = root->sectorsize;
1346 device->total_bytes = i_size_read(bdev->bd_inode);
1347 device->dev_root = root->fs_info->dev_root;
1348 device->bdev = bdev;
1349 device->in_fs_metadata = 1;
1350 device->mode = 0;
1351 set_blocksize(device->bdev, 4096);
1352
1353 if (seeding_dev) {
1354 sb->s_flags &= ~MS_RDONLY;
1355 ret = btrfs_prepare_sprout(trans, root);
1356 BUG_ON(ret);
1357 }
1358
1359 device->fs_devices = root->fs_info->fs_devices;
1360 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1361 list_add(&device->dev_alloc_list,
1362 &root->fs_info->fs_devices->alloc_list);
1363 root->fs_info->fs_devices->num_devices++;
1364 root->fs_info->fs_devices->open_devices++;
1365 root->fs_info->fs_devices->rw_devices++;
1366 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1367
1368 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1369 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1370 total_bytes + device->total_bytes);
1371
1372 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1373 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1374 total_bytes + 1);
1375
1376 if (seeding_dev) {
1377 ret = init_first_rw_device(trans, root, device);
1378 BUG_ON(ret);
1379 ret = btrfs_finish_sprout(trans, root);
1380 BUG_ON(ret);
1381 } else {
1382 ret = btrfs_add_device(trans, root, device);
1383 }
1384
1385 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root);
1387
1388 if (seeding_dev) {
1389 mutex_unlock(&uuid_mutex);
1390 up_write(&sb->s_umount);
1391
1392 ret = btrfs_relocate_sys_chunks(root);
1393 BUG_ON(ret);
1394 }
1395out:
1396 mutex_unlock(&root->fs_info->volume_mutex);
1397 return ret;
1398error:
1399 close_bdev_exclusive(bdev, 0);
1400 if (seeding_dev) {
1401 mutex_unlock(&uuid_mutex);
1402 up_write(&sb->s_umount);
1403 }
1404 goto out;
1405}
1406
1407static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1408 struct btrfs_device *device)
1409{
1410 int ret;
1411 struct btrfs_path *path;
1412 struct btrfs_root *root;
1413 struct btrfs_dev_item *dev_item;
1414 struct extent_buffer *leaf;
1415 struct btrfs_key key;
1416
1417 root = device->dev_root->fs_info->chunk_root;
1418
1419 path = btrfs_alloc_path();
1420 if (!path)
1421 return -ENOMEM;
1422
1423 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1424 key.type = BTRFS_DEV_ITEM_KEY;
1425 key.offset = device->devid;
1426
1427 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1428 if (ret < 0)
1429 goto out;
1430
1431 if (ret > 0) {
1432 ret = -ENOENT;
1433 goto out;
1434 }
1435
1436 leaf = path->nodes[0];
1437 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1438
1439 btrfs_set_device_id(leaf, dev_item, device->devid);
1440 btrfs_set_device_type(leaf, dev_item, device->type);
1441 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1442 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1443 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1444 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1445 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1446 btrfs_mark_buffer_dirty(leaf);
1447
1448out:
1449 btrfs_free_path(path);
1450 return ret;
1451}
1452
1453static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1454 struct btrfs_device *device, u64 new_size)
1455{
1456 struct btrfs_super_block *super_copy =
1457 &device->dev_root->fs_info->super_copy;
1458 u64 old_total = btrfs_super_total_bytes(super_copy);
1459 u64 diff = new_size - device->total_bytes;
1460
1461 if (!device->writeable)
1462 return -EACCES;
1463 if (new_size <= device->total_bytes)
1464 return -EINVAL;
1465
1466 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1467 device->fs_devices->total_rw_bytes += diff;
1468
1469 device->total_bytes = new_size;
1470 return btrfs_update_device(trans, device);
1471}
1472
1473int btrfs_grow_device(struct btrfs_trans_handle *trans,
1474 struct btrfs_device *device, u64 new_size)
1475{
1476 int ret;
1477 lock_chunks(device->dev_root);
1478 ret = __btrfs_grow_device(trans, device, new_size);
1479 unlock_chunks(device->dev_root);
1480 return ret;
1481}
1482
1483static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1484 struct btrfs_root *root,
1485 u64 chunk_tree, u64 chunk_objectid,
1486 u64 chunk_offset)
1487{
1488 int ret;
1489 struct btrfs_path *path;
1490 struct btrfs_key key;
1491
1492 root = root->fs_info->chunk_root;
1493 path = btrfs_alloc_path();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = chunk_objectid;
1498 key.offset = chunk_offset;
1499 key.type = BTRFS_CHUNK_ITEM_KEY;
1500
1501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1502 BUG_ON(ret);
1503
1504 ret = btrfs_del_item(trans, root, path);
1505 BUG_ON(ret);
1506
1507 btrfs_free_path(path);
1508 return 0;
1509}
1510
1511static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1512 chunk_offset)
1513{
1514 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1515 struct btrfs_disk_key *disk_key;
1516 struct btrfs_chunk *chunk;
1517 u8 *ptr;
1518 int ret = 0;
1519 u32 num_stripes;
1520 u32 array_size;
1521 u32 len = 0;
1522 u32 cur;
1523 struct btrfs_key key;
1524
1525 array_size = btrfs_super_sys_array_size(super_copy);
1526
1527 ptr = super_copy->sys_chunk_array;
1528 cur = 0;
1529
1530 while (cur < array_size) {
1531 disk_key = (struct btrfs_disk_key *)ptr;
1532 btrfs_disk_key_to_cpu(&key, disk_key);
1533
1534 len = sizeof(*disk_key);
1535
1536 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1537 chunk = (struct btrfs_chunk *)(ptr + len);
1538 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1539 len += btrfs_chunk_item_size(num_stripes);
1540 } else {
1541 ret = -EIO;
1542 break;
1543 }
1544 if (key.objectid == chunk_objectid &&
1545 key.offset == chunk_offset) {
1546 memmove(ptr, ptr + len, array_size - (cur + len));
1547 array_size -= len;
1548 btrfs_set_super_sys_array_size(super_copy, array_size);
1549 } else {
1550 ptr += len;
1551 cur += len;
1552 }
1553 }
1554 return ret;
1555}
1556
1557static int btrfs_relocate_chunk(struct btrfs_root *root,
1558 u64 chunk_tree, u64 chunk_objectid,
1559 u64 chunk_offset)
1560{
1561 struct extent_map_tree *em_tree;
1562 struct btrfs_root *extent_root;
1563 struct btrfs_trans_handle *trans;
1564 struct extent_map *em;
1565 struct map_lookup *map;
1566 int ret;
1567 int i;
1568
1569 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1570 (unsigned long long)chunk_offset);
1571 root = root->fs_info->chunk_root;
1572 extent_root = root->fs_info->extent_root;
1573 em_tree = &root->fs_info->mapping_tree.map_tree;
1574
1575 /* step one, relocate all the extents inside this chunk */
1576 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1577 BUG_ON(ret);
1578
1579 trans = btrfs_start_transaction(root, 1);
1580 BUG_ON(!trans);
1581
1582 lock_chunks(root);
1583
1584 /*
1585 * step two, delete the device extents and the
1586 * chunk tree entries
1587 */
1588 spin_lock(&em_tree->lock);
1589 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1590 spin_unlock(&em_tree->lock);
1591
1592 BUG_ON(em->start > chunk_offset ||
1593 em->start + em->len < chunk_offset);
1594 map = (struct map_lookup *)em->bdev;
1595
1596 for (i = 0; i < map->num_stripes; i++) {
1597 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1598 map->stripes[i].physical);
1599 BUG_ON(ret);
1600
1601 if (map->stripes[i].dev) {
1602 ret = btrfs_update_device(trans, map->stripes[i].dev);
1603 BUG_ON(ret);
1604 }
1605 }
1606 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1607 chunk_offset);
1608
1609 BUG_ON(ret);
1610
1611 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1612 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1613 BUG_ON(ret);
1614 }
1615
1616 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1617 BUG_ON(ret);
1618
1619 spin_lock(&em_tree->lock);
1620 remove_extent_mapping(em_tree, em);
1621 spin_unlock(&em_tree->lock);
1622
1623 kfree(map);
1624 em->bdev = NULL;
1625
1626 /* once for the tree */
1627 free_extent_map(em);
1628 /* once for us */
1629 free_extent_map(em);
1630
1631 unlock_chunks(root);
1632 btrfs_end_transaction(trans, root);
1633 return 0;
1634}
1635
1636static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1637{
1638 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1639 struct btrfs_path *path;
1640 struct extent_buffer *leaf;
1641 struct btrfs_chunk *chunk;
1642 struct btrfs_key key;
1643 struct btrfs_key found_key;
1644 u64 chunk_tree = chunk_root->root_key.objectid;
1645 u64 chunk_type;
1646 int ret;
1647
1648 path = btrfs_alloc_path();
1649 if (!path)
1650 return -ENOMEM;
1651
1652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1653 key.offset = (u64)-1;
1654 key.type = BTRFS_CHUNK_ITEM_KEY;
1655
1656 while (1) {
1657 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1658 if (ret < 0)
1659 goto error;
1660 BUG_ON(ret == 0);
1661
1662 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1663 key.type);
1664 if (ret < 0)
1665 goto error;
1666 if (ret > 0)
1667 break;
1668
1669 leaf = path->nodes[0];
1670 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671
1672 chunk = btrfs_item_ptr(leaf, path->slots[0],
1673 struct btrfs_chunk);
1674 chunk_type = btrfs_chunk_type(leaf, chunk);
1675 btrfs_release_path(chunk_root, path);
1676
1677 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1678 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1679 found_key.objectid,
1680 found_key.offset);
1681 BUG_ON(ret);
1682 }
1683
1684 if (found_key.offset == 0)
1685 break;
1686 key.offset = found_key.offset - 1;
1687 }
1688 ret = 0;
1689error:
1690 btrfs_free_path(path);
1691 return ret;
1692}
1693
1694static u64 div_factor(u64 num, int factor)
1695{
1696 if (factor == 10)
1697 return num;
1698 num *= factor;
1699 do_div(num, 10);
1700 return num;
1701}
1702
1703int btrfs_balance(struct btrfs_root *dev_root)
1704{
1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device;
1709 u64 old_size;
1710 u64 size_to_free;
1711 struct btrfs_path *path;
1712 struct btrfs_key key;
1713 struct btrfs_chunk *chunk;
1714 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1715 struct btrfs_trans_handle *trans;
1716 struct btrfs_key found_key;
1717
1718 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1719 return -EROFS;
1720
1721 mutex_lock(&dev_root->fs_info->volume_mutex);
1722 dev_root = dev_root->fs_info->dev_root;
1723
1724 /* step one make some room on all the devices */
1725 list_for_each(cur, devices) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1730 if (!device->writeable ||
1731 device->total_bytes - device->bytes_used > size_to_free)
1732 continue;
1733
1734 ret = btrfs_shrink_device(device, old_size - size_to_free);
1735 BUG_ON(ret);
1736
1737 trans = btrfs_start_transaction(dev_root, 1);
1738 BUG_ON(!trans);
1739
1740 ret = btrfs_grow_device(trans, device, old_size);
1741 BUG_ON(ret);
1742
1743 btrfs_end_transaction(trans, dev_root);
1744 }
1745
1746 /* step two, relocate all the chunks */
1747 path = btrfs_alloc_path();
1748 BUG_ON(!path);
1749
1750 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1751 key.offset = (u64)-1;
1752 key.type = BTRFS_CHUNK_ITEM_KEY;
1753
1754 while (1) {
1755 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1756 if (ret < 0)
1757 goto error;
1758
1759 /*
1760 * this shouldn't happen, it means the last relocate
1761 * failed
1762 */
1763 if (ret == 0)
1764 break;
1765
1766 ret = btrfs_previous_item(chunk_root, path, 0,
1767 BTRFS_CHUNK_ITEM_KEY);
1768 if (ret)
1769 break;
1770
1771 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1772 path->slots[0]);
1773 if (found_key.objectid != key.objectid)
1774 break;
1775
1776 chunk = btrfs_item_ptr(path->nodes[0],
1777 path->slots[0],
1778 struct btrfs_chunk);
1779 key.offset = found_key.offset;
1780 /* chunk zero is special */
1781 if (key.offset == 0)
1782 break;
1783
1784 btrfs_release_path(chunk_root, path);
1785 ret = btrfs_relocate_chunk(chunk_root,
1786 chunk_root->root_key.objectid,
1787 found_key.objectid,
1788 found_key.offset);
1789 BUG_ON(ret);
1790 }
1791 ret = 0;
1792error:
1793 btrfs_free_path(path);
1794 mutex_unlock(&dev_root->fs_info->volume_mutex);
1795 return ret;
1796}
1797
1798/*
1799 * shrinking a device means finding all of the device extents past
1800 * the new size, and then following the back refs to the chunks.
1801 * The chunk relocation code actually frees the device extent
1802 */
1803int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1804{
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_root *root = device->dev_root;
1807 struct btrfs_dev_extent *dev_extent = NULL;
1808 struct btrfs_path *path;
1809 u64 length;
1810 u64 chunk_tree;
1811 u64 chunk_objectid;
1812 u64 chunk_offset;
1813 int ret;
1814 int slot;
1815 struct extent_buffer *l;
1816 struct btrfs_key key;
1817 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1818 u64 old_total = btrfs_super_total_bytes(super_copy);
1819 u64 diff = device->total_bytes - new_size;
1820
1821 if (new_size >= device->total_bytes)
1822 return -EINVAL;
1823
1824 path = btrfs_alloc_path();
1825 if (!path)
1826 return -ENOMEM;
1827
1828 trans = btrfs_start_transaction(root, 1);
1829 if (!trans) {
1830 ret = -ENOMEM;
1831 goto done;
1832 }
1833
1834 path->reada = 2;
1835
1836 lock_chunks(root);
1837
1838 device->total_bytes = new_size;
1839 if (device->writeable)
1840 device->fs_devices->total_rw_bytes -= diff;
1841 ret = btrfs_update_device(trans, device);
1842 if (ret) {
1843 unlock_chunks(root);
1844 btrfs_end_transaction(trans, root);
1845 goto done;
1846 }
1847 WARN_ON(diff > old_total);
1848 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1849 unlock_chunks(root);
1850 btrfs_end_transaction(trans, root);
1851
1852 key.objectid = device->devid;
1853 key.offset = (u64)-1;
1854 key.type = BTRFS_DEV_EXTENT_KEY;
1855
1856 while (1) {
1857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1858 if (ret < 0)
1859 goto done;
1860
1861 ret = btrfs_previous_item(root, path, 0, key.type);
1862 if (ret < 0)
1863 goto done;
1864 if (ret) {
1865 ret = 0;
1866 goto done;
1867 }
1868
1869 l = path->nodes[0];
1870 slot = path->slots[0];
1871 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1872
1873 if (key.objectid != device->devid)
1874 goto done;
1875
1876 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1877 length = btrfs_dev_extent_length(l, dev_extent);
1878
1879 if (key.offset + length <= new_size)
1880 goto done;
1881
1882 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1883 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1884 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1885 btrfs_release_path(root, path);
1886
1887 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1888 chunk_offset);
1889 if (ret)
1890 goto done;
1891 }
1892
1893done:
1894 btrfs_free_path(path);
1895 return ret;
1896}
1897
1898static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1899 struct btrfs_root *root,
1900 struct btrfs_key *key,
1901 struct btrfs_chunk *chunk, int item_size)
1902{
1903 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1904 struct btrfs_disk_key disk_key;
1905 u32 array_size;
1906 u8 *ptr;
1907
1908 array_size = btrfs_super_sys_array_size(super_copy);
1909 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1910 return -EFBIG;
1911
1912 ptr = super_copy->sys_chunk_array + array_size;
1913 btrfs_cpu_key_to_disk(&disk_key, key);
1914 memcpy(ptr, &disk_key, sizeof(disk_key));
1915 ptr += sizeof(disk_key);
1916 memcpy(ptr, chunk, item_size);
1917 item_size += sizeof(disk_key);
1918 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1919 return 0;
1920}
1921
1922static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1923 int num_stripes, int sub_stripes)
1924{
1925 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1926 return calc_size;
1927 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1928 return calc_size * (num_stripes / sub_stripes);
1929 else
1930 return calc_size * num_stripes;
1931}
1932
1933static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *extent_root,
1935 struct map_lookup **map_ret,
1936 u64 *num_bytes, u64 *stripe_size,
1937 u64 start, u64 type)
1938{
1939 struct btrfs_fs_info *info = extent_root->fs_info;
1940 struct btrfs_device *device = NULL;
1941 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1942 struct list_head *cur;
1943 struct map_lookup *map = NULL;
1944 struct extent_map_tree *em_tree;
1945 struct extent_map *em;
1946 struct list_head private_devs;
1947 int min_stripe_size = 1 * 1024 * 1024;
1948 u64 calc_size = 1024 * 1024 * 1024;
1949 u64 max_chunk_size = calc_size;
1950 u64 min_free;
1951 u64 avail;
1952 u64 max_avail = 0;
1953 u64 dev_offset;
1954 int num_stripes = 1;
1955 int min_stripes = 1;
1956 int sub_stripes = 0;
1957 int looped = 0;
1958 int ret;
1959 int index;
1960 int stripe_len = 64 * 1024;
1961
1962 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1963 (type & BTRFS_BLOCK_GROUP_DUP)) {
1964 WARN_ON(1);
1965 type &= ~BTRFS_BLOCK_GROUP_DUP;
1966 }
1967 if (list_empty(&fs_devices->alloc_list))
1968 return -ENOSPC;
1969
1970 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1971 num_stripes = fs_devices->rw_devices;
1972 min_stripes = 2;
1973 }
1974 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1975 num_stripes = 2;
1976 min_stripes = 2;
1977 }
1978 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1979 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1980 if (num_stripes < 2)
1981 return -ENOSPC;
1982 min_stripes = 2;
1983 }
1984 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1985 num_stripes = fs_devices->rw_devices;
1986 if (num_stripes < 4)
1987 return -ENOSPC;
1988 num_stripes &= ~(u32)1;
1989 sub_stripes = 2;
1990 min_stripes = 4;
1991 }
1992
1993 if (type & BTRFS_BLOCK_GROUP_DATA) {
1994 max_chunk_size = 10 * calc_size;
1995 min_stripe_size = 64 * 1024 * 1024;
1996 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1997 max_chunk_size = 4 * calc_size;
1998 min_stripe_size = 32 * 1024 * 1024;
1999 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2000 calc_size = 8 * 1024 * 1024;
2001 max_chunk_size = calc_size * 2;
2002 min_stripe_size = 1 * 1024 * 1024;
2003 }
2004
2005 /* we don't want a chunk larger than 10% of writeable space */
2006 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2007 max_chunk_size);
2008
2009again:
2010 if (!map || map->num_stripes != num_stripes) {
2011 kfree(map);
2012 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2013 if (!map)
2014 return -ENOMEM;
2015 map->num_stripes = num_stripes;
2016 }
2017
2018 if (calc_size * num_stripes > max_chunk_size) {
2019 calc_size = max_chunk_size;
2020 do_div(calc_size, num_stripes);
2021 do_div(calc_size, stripe_len);
2022 calc_size *= stripe_len;
2023 }
2024 /* we don't want tiny stripes */
2025 calc_size = max_t(u64, min_stripe_size, calc_size);
2026
2027 do_div(calc_size, stripe_len);
2028 calc_size *= stripe_len;
2029
2030 cur = fs_devices->alloc_list.next;
2031 index = 0;
2032
2033 if (type & BTRFS_BLOCK_GROUP_DUP)
2034 min_free = calc_size * 2;
2035 else
2036 min_free = calc_size;
2037
2038 /*
2039 * we add 1MB because we never use the first 1MB of the device, unless
2040 * we've looped, then we are likely allocating the maximum amount of
2041 * space left already
2042 */
2043 if (!looped)
2044 min_free += 1024 * 1024;
2045
2046 INIT_LIST_HEAD(&private_devs);
2047 while (index < num_stripes) {
2048 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2049 BUG_ON(!device->writeable);
2050 if (device->total_bytes > device->bytes_used)
2051 avail = device->total_bytes - device->bytes_used;
2052 else
2053 avail = 0;
2054 cur = cur->next;
2055
2056 if (device->in_fs_metadata && avail >= min_free) {
2057 ret = find_free_dev_extent(trans, device,
2058 min_free, &dev_offset);
2059 if (ret == 0) {
2060 list_move_tail(&device->dev_alloc_list,
2061 &private_devs);
2062 map->stripes[index].dev = device;
2063 map->stripes[index].physical = dev_offset;
2064 index++;
2065 if (type & BTRFS_BLOCK_GROUP_DUP) {
2066 map->stripes[index].dev = device;
2067 map->stripes[index].physical =
2068 dev_offset + calc_size;
2069 index++;
2070 }
2071 }
2072 } else if (device->in_fs_metadata && avail > max_avail)
2073 max_avail = avail;
2074 if (cur == &fs_devices->alloc_list)
2075 break;
2076 }
2077 list_splice(&private_devs, &fs_devices->alloc_list);
2078 if (index < num_stripes) {
2079 if (index >= min_stripes) {
2080 num_stripes = index;
2081 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2082 num_stripes /= sub_stripes;
2083 num_stripes *= sub_stripes;
2084 }
2085 looped = 1;
2086 goto again;
2087 }
2088 if (!looped && max_avail > 0) {
2089 looped = 1;
2090 calc_size = max_avail;
2091 goto again;
2092 }
2093 kfree(map);
2094 return -ENOSPC;
2095 }
2096 map->sector_size = extent_root->sectorsize;
2097 map->stripe_len = stripe_len;
2098 map->io_align = stripe_len;
2099 map->io_width = stripe_len;
2100 map->type = type;
2101 map->num_stripes = num_stripes;
2102 map->sub_stripes = sub_stripes;
2103
2104 *map_ret = map;
2105 *stripe_size = calc_size;
2106 *num_bytes = chunk_bytes_by_type(type, calc_size,
2107 num_stripes, sub_stripes);
2108
2109 em = alloc_extent_map(GFP_NOFS);
2110 if (!em) {
2111 kfree(map);
2112 return -ENOMEM;
2113 }
2114 em->bdev = (struct block_device *)map;
2115 em->start = start;
2116 em->len = *num_bytes;
2117 em->block_start = 0;
2118 em->block_len = em->len;
2119
2120 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2121 spin_lock(&em_tree->lock);
2122 ret = add_extent_mapping(em_tree, em);
2123 spin_unlock(&em_tree->lock);
2124 BUG_ON(ret);
2125 free_extent_map(em);
2126
2127 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2128 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129 start, *num_bytes);
2130 BUG_ON(ret);
2131
2132 index = 0;
2133 while (index < map->num_stripes) {
2134 device = map->stripes[index].dev;
2135 dev_offset = map->stripes[index].physical;
2136
2137 ret = btrfs_alloc_dev_extent(trans, device,
2138 info->chunk_root->root_key.objectid,
2139 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2140 start, dev_offset, calc_size);
2141 BUG_ON(ret);
2142 index++;
2143 }
2144
2145 return 0;
2146}
2147
2148static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2149 struct btrfs_root *extent_root,
2150 struct map_lookup *map, u64 chunk_offset,
2151 u64 chunk_size, u64 stripe_size)
2152{
2153 u64 dev_offset;
2154 struct btrfs_key key;
2155 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2156 struct btrfs_device *device;
2157 struct btrfs_chunk *chunk;
2158 struct btrfs_stripe *stripe;
2159 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2160 int index = 0;
2161 int ret;
2162
2163 chunk = kzalloc(item_size, GFP_NOFS);
2164 if (!chunk)
2165 return -ENOMEM;
2166
2167 index = 0;
2168 while (index < map->num_stripes) {
2169 device = map->stripes[index].dev;
2170 device->bytes_used += stripe_size;
2171 ret = btrfs_update_device(trans, device);
2172 BUG_ON(ret);
2173 index++;
2174 }
2175
2176 index = 0;
2177 stripe = &chunk->stripe;
2178 while (index < map->num_stripes) {
2179 device = map->stripes[index].dev;
2180 dev_offset = map->stripes[index].physical;
2181
2182 btrfs_set_stack_stripe_devid(stripe, device->devid);
2183 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2184 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2185 stripe++;
2186 index++;
2187 }
2188
2189 btrfs_set_stack_chunk_length(chunk, chunk_size);
2190 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2191 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2192 btrfs_set_stack_chunk_type(chunk, map->type);
2193 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2194 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2195 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2197 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2198
2199 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2200 key.type = BTRFS_CHUNK_ITEM_KEY;
2201 key.offset = chunk_offset;
2202
2203 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2204 BUG_ON(ret);
2205
2206 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2207 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2208 item_size);
2209 BUG_ON(ret);
2210 }
2211 kfree(chunk);
2212 return 0;
2213}
2214
2215/*
2216 * Chunk allocation falls into two parts. The first part does works
2217 * that make the new allocated chunk useable, but not do any operation
2218 * that modifies the chunk tree. The second part does the works that
2219 * require modifying the chunk tree. This division is important for the
2220 * bootstrap process of adding storage to a seed btrfs.
2221 */
2222int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2223 struct btrfs_root *extent_root, u64 type)
2224{
2225 u64 chunk_offset;
2226 u64 chunk_size;
2227 u64 stripe_size;
2228 struct map_lookup *map;
2229 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2230 int ret;
2231
2232 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2233 &chunk_offset);
2234 if (ret)
2235 return ret;
2236
2237 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2238 &stripe_size, chunk_offset, type);
2239 if (ret)
2240 return ret;
2241
2242 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2243 chunk_size, stripe_size);
2244 BUG_ON(ret);
2245 return 0;
2246}
2247
2248static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root,
2250 struct btrfs_device *device)
2251{
2252 u64 chunk_offset;
2253 u64 sys_chunk_offset;
2254 u64 chunk_size;
2255 u64 sys_chunk_size;
2256 u64 stripe_size;
2257 u64 sys_stripe_size;
2258 u64 alloc_profile;
2259 struct map_lookup *map;
2260 struct map_lookup *sys_map;
2261 struct btrfs_fs_info *fs_info = root->fs_info;
2262 struct btrfs_root *extent_root = fs_info->extent_root;
2263 int ret;
2264
2265 ret = find_next_chunk(fs_info->chunk_root,
2266 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2267 BUG_ON(ret);
2268
2269 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2270 (fs_info->metadata_alloc_profile &
2271 fs_info->avail_metadata_alloc_bits);
2272 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2275 &stripe_size, chunk_offset, alloc_profile);
2276 BUG_ON(ret);
2277
2278 sys_chunk_offset = chunk_offset + chunk_size;
2279
2280 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2281 (fs_info->system_alloc_profile &
2282 fs_info->avail_system_alloc_bits);
2283 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2284
2285 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2286 &sys_chunk_size, &sys_stripe_size,
2287 sys_chunk_offset, alloc_profile);
2288 BUG_ON(ret);
2289
2290 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2291 BUG_ON(ret);
2292
2293 /*
2294 * Modifying chunk tree needs allocating new blocks from both
2295 * system block group and metadata block group. So we only can
2296 * do operations require modifying the chunk tree after both
2297 * block groups were created.
2298 */
2299 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2300 chunk_size, stripe_size);
2301 BUG_ON(ret);
2302
2303 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2304 sys_chunk_offset, sys_chunk_size,
2305 sys_stripe_size);
2306 BUG_ON(ret);
2307 return 0;
2308}
2309
2310int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2311{
2312 struct extent_map *em;
2313 struct map_lookup *map;
2314 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2315 int readonly = 0;
2316 int i;
2317
2318 spin_lock(&map_tree->map_tree.lock);
2319 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2320 spin_unlock(&map_tree->map_tree.lock);
2321 if (!em)
2322 return 1;
2323
2324 map = (struct map_lookup *)em->bdev;
2325 for (i = 0; i < map->num_stripes; i++) {
2326 if (!map->stripes[i].dev->writeable) {
2327 readonly = 1;
2328 break;
2329 }
2330 }
2331 free_extent_map(em);
2332 return readonly;
2333}
2334
2335void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2336{
2337 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2338}
2339
2340void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2341{
2342 struct extent_map *em;
2343
2344 while (1) {
2345 spin_lock(&tree->map_tree.lock);
2346 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2347 if (em)
2348 remove_extent_mapping(&tree->map_tree, em);
2349 spin_unlock(&tree->map_tree.lock);
2350 if (!em)
2351 break;
2352 kfree(em->bdev);
2353 /* once for us */
2354 free_extent_map(em);
2355 /* once for the tree */
2356 free_extent_map(em);
2357 }
2358}
2359
2360int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2361{
2362 struct extent_map *em;
2363 struct map_lookup *map;
2364 struct extent_map_tree *em_tree = &map_tree->map_tree;
2365 int ret;
2366
2367 spin_lock(&em_tree->lock);
2368 em = lookup_extent_mapping(em_tree, logical, len);
2369 spin_unlock(&em_tree->lock);
2370 BUG_ON(!em);
2371
2372 BUG_ON(em->start > logical || em->start + em->len < logical);
2373 map = (struct map_lookup *)em->bdev;
2374 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2375 ret = map->num_stripes;
2376 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2377 ret = map->sub_stripes;
2378 else
2379 ret = 1;
2380 free_extent_map(em);
2381 return ret;
2382}
2383
2384static int find_live_mirror(struct map_lookup *map, int first, int num,
2385 int optimal)
2386{
2387 int i;
2388 if (map->stripes[optimal].dev->bdev)
2389 return optimal;
2390 for (i = first; i < first + num; i++) {
2391 if (map->stripes[i].dev->bdev)
2392 return i;
2393 }
2394 /* we couldn't find one that doesn't fail. Just return something
2395 * and the io error handling code will clean up eventually
2396 */
2397 return optimal;
2398}
2399
2400static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2401 u64 logical, u64 *length,
2402 struct btrfs_multi_bio **multi_ret,
2403 int mirror_num, struct page *unplug_page)
2404{
2405 struct extent_map *em;
2406 struct map_lookup *map;
2407 struct extent_map_tree *em_tree = &map_tree->map_tree;
2408 u64 offset;
2409 u64 stripe_offset;
2410 u64 stripe_nr;
2411 int stripes_allocated = 8;
2412 int stripes_required = 1;
2413 int stripe_index;
2414 int i;
2415 int num_stripes;
2416 int max_errors = 0;
2417 struct btrfs_multi_bio *multi = NULL;
2418
2419 if (multi_ret && !(rw & (1 << BIO_RW)))
2420 stripes_allocated = 1;
2421again:
2422 if (multi_ret) {
2423 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2424 GFP_NOFS);
2425 if (!multi)
2426 return -ENOMEM;
2427
2428 atomic_set(&multi->error, 0);
2429 }
2430
2431 spin_lock(&em_tree->lock);
2432 em = lookup_extent_mapping(em_tree, logical, *length);
2433 spin_unlock(&em_tree->lock);
2434
2435 if (!em && unplug_page)
2436 return 0;
2437
2438 if (!em) {
2439 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2440 (unsigned long long)logical,
2441 (unsigned long long)*length);
2442 BUG();
2443 }
2444
2445 BUG_ON(em->start > logical || em->start + em->len < logical);
2446 map = (struct map_lookup *)em->bdev;
2447 offset = logical - em->start;
2448
2449 if (mirror_num > map->num_stripes)
2450 mirror_num = 0;
2451
2452 /* if our multi bio struct is too small, back off and try again */
2453 if (rw & (1 << BIO_RW)) {
2454 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2455 BTRFS_BLOCK_GROUP_DUP)) {
2456 stripes_required = map->num_stripes;
2457 max_errors = 1;
2458 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2459 stripes_required = map->sub_stripes;
2460 max_errors = 1;
2461 }
2462 }
2463 if (multi_ret && rw == WRITE &&
2464 stripes_allocated < stripes_required) {
2465 stripes_allocated = map->num_stripes;
2466 free_extent_map(em);
2467 kfree(multi);
2468 goto again;
2469 }
2470 stripe_nr = offset;
2471 /*
2472 * stripe_nr counts the total number of stripes we have to stride
2473 * to get to this block
2474 */
2475 do_div(stripe_nr, map->stripe_len);
2476
2477 stripe_offset = stripe_nr * map->stripe_len;
2478 BUG_ON(offset < stripe_offset);
2479
2480 /* stripe_offset is the offset of this block in its stripe*/
2481 stripe_offset = offset - stripe_offset;
2482
2483 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2484 BTRFS_BLOCK_GROUP_RAID10 |
2485 BTRFS_BLOCK_GROUP_DUP)) {
2486 /* we limit the length of each bio to what fits in a stripe */
2487 *length = min_t(u64, em->len - offset,
2488 map->stripe_len - stripe_offset);
2489 } else {
2490 *length = em->len - offset;
2491 }
2492
2493 if (!multi_ret && !unplug_page)
2494 goto out;
2495
2496 num_stripes = 1;
2497 stripe_index = 0;
2498 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2499 if (unplug_page || (rw & (1 << BIO_RW)))
2500 num_stripes = map->num_stripes;
2501 else if (mirror_num)
2502 stripe_index = mirror_num - 1;
2503 else {
2504 stripe_index = find_live_mirror(map, 0,
2505 map->num_stripes,
2506 current->pid % map->num_stripes);
2507 }
2508
2509 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2510 if (rw & (1 << BIO_RW))
2511 num_stripes = map->num_stripes;
2512 else if (mirror_num)
2513 stripe_index = mirror_num - 1;
2514
2515 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2516 int factor = map->num_stripes / map->sub_stripes;
2517
2518 stripe_index = do_div(stripe_nr, factor);
2519 stripe_index *= map->sub_stripes;
2520
2521 if (unplug_page || (rw & (1 << BIO_RW)))
2522 num_stripes = map->sub_stripes;
2523 else if (mirror_num)
2524 stripe_index += mirror_num - 1;
2525 else {
2526 stripe_index = find_live_mirror(map, stripe_index,
2527 map->sub_stripes, stripe_index +
2528 current->pid % map->sub_stripes);
2529 }
2530 } else {
2531 /*
2532 * after this do_div call, stripe_nr is the number of stripes
2533 * on this device we have to walk to find the data, and
2534 * stripe_index is the number of our device in the stripe array
2535 */
2536 stripe_index = do_div(stripe_nr, map->num_stripes);
2537 }
2538 BUG_ON(stripe_index >= map->num_stripes);
2539
2540 for (i = 0; i < num_stripes; i++) {
2541 if (unplug_page) {
2542 struct btrfs_device *device;
2543 struct backing_dev_info *bdi;
2544
2545 device = map->stripes[stripe_index].dev;
2546 if (device->bdev) {
2547 bdi = blk_get_backing_dev_info(device->bdev);
2548 if (bdi->unplug_io_fn)
2549 bdi->unplug_io_fn(bdi, unplug_page);
2550 }
2551 } else {
2552 multi->stripes[i].physical =
2553 map->stripes[stripe_index].physical +
2554 stripe_offset + stripe_nr * map->stripe_len;
2555 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2556 }
2557 stripe_index++;
2558 }
2559 if (multi_ret) {
2560 *multi_ret = multi;
2561 multi->num_stripes = num_stripes;
2562 multi->max_errors = max_errors;
2563 }
2564out:
2565 free_extent_map(em);
2566 return 0;
2567}
2568
2569int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2570 u64 logical, u64 *length,
2571 struct btrfs_multi_bio **multi_ret, int mirror_num)
2572{
2573 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2574 mirror_num, NULL);
2575}
2576
2577int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2578 u64 chunk_start, u64 physical, u64 devid,
2579 u64 **logical, int *naddrs, int *stripe_len)
2580{
2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2582 struct extent_map *em;
2583 struct map_lookup *map;
2584 u64 *buf;
2585 u64 bytenr;
2586 u64 length;
2587 u64 stripe_nr;
2588 int i, j, nr = 0;
2589
2590 spin_lock(&em_tree->lock);
2591 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2592 spin_unlock(&em_tree->lock);
2593
2594 BUG_ON(!em || em->start != chunk_start);
2595 map = (struct map_lookup *)em->bdev;
2596
2597 length = em->len;
2598 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2599 do_div(length, map->num_stripes / map->sub_stripes);
2600 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2601 do_div(length, map->num_stripes);
2602
2603 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2604 BUG_ON(!buf);
2605
2606 for (i = 0; i < map->num_stripes; i++) {
2607 if (devid && map->stripes[i].dev->devid != devid)
2608 continue;
2609 if (map->stripes[i].physical > physical ||
2610 map->stripes[i].physical + length <= physical)
2611 continue;
2612
2613 stripe_nr = physical - map->stripes[i].physical;
2614 do_div(stripe_nr, map->stripe_len);
2615
2616 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2617 stripe_nr = stripe_nr * map->num_stripes + i;
2618 do_div(stripe_nr, map->sub_stripes);
2619 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2620 stripe_nr = stripe_nr * map->num_stripes + i;
2621 }
2622 bytenr = chunk_start + stripe_nr * map->stripe_len;
2623 WARN_ON(nr >= map->num_stripes);
2624 for (j = 0; j < nr; j++) {
2625 if (buf[j] == bytenr)
2626 break;
2627 }
2628 if (j == nr) {
2629 WARN_ON(nr >= map->num_stripes);
2630 buf[nr++] = bytenr;
2631 }
2632 }
2633
2634 for (i = 0; i > nr; i++) {
2635 struct btrfs_multi_bio *multi;
2636 struct btrfs_bio_stripe *stripe;
2637 int ret;
2638
2639 length = 1;
2640 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2641 &length, &multi, 0);
2642 BUG_ON(ret);
2643
2644 stripe = multi->stripes;
2645 for (j = 0; j < multi->num_stripes; j++) {
2646 if (stripe->physical >= physical &&
2647 physical < stripe->physical + length)
2648 break;
2649 }
2650 BUG_ON(j >= multi->num_stripes);
2651 kfree(multi);
2652 }
2653
2654 *logical = buf;
2655 *naddrs = nr;
2656 *stripe_len = map->stripe_len;
2657
2658 free_extent_map(em);
2659 return 0;
2660}
2661
2662int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2663 u64 logical, struct page *page)
2664{
2665 u64 length = PAGE_CACHE_SIZE;
2666 return __btrfs_map_block(map_tree, READ, logical, &length,
2667 NULL, 0, page);
2668}
2669
2670static void end_bio_multi_stripe(struct bio *bio, int err)
2671{
2672 struct btrfs_multi_bio *multi = bio->bi_private;
2673 int is_orig_bio = 0;
2674
2675 if (err)
2676 atomic_inc(&multi->error);
2677
2678 if (bio == multi->orig_bio)
2679 is_orig_bio = 1;
2680
2681 if (atomic_dec_and_test(&multi->stripes_pending)) {
2682 if (!is_orig_bio) {
2683 bio_put(bio);
2684 bio = multi->orig_bio;
2685 }
2686 bio->bi_private = multi->private;
2687 bio->bi_end_io = multi->end_io;
2688 /* only send an error to the higher layers if it is
2689 * beyond the tolerance of the multi-bio
2690 */
2691 if (atomic_read(&multi->error) > multi->max_errors) {
2692 err = -EIO;
2693 } else if (err) {
2694 /*
2695 * this bio is actually up to date, we didn't
2696 * go over the max number of errors
2697 */
2698 set_bit(BIO_UPTODATE, &bio->bi_flags);
2699 err = 0;
2700 }
2701 kfree(multi);
2702
2703 bio_endio(bio, err);
2704 } else if (!is_orig_bio) {
2705 bio_put(bio);
2706 }
2707}
2708
2709struct async_sched {
2710 struct bio *bio;
2711 int rw;
2712 struct btrfs_fs_info *info;
2713 struct btrfs_work work;
2714};
2715
2716/*
2717 * see run_scheduled_bios for a description of why bios are collected for
2718 * async submit.
2719 *
2720 * This will add one bio to the pending list for a device and make sure
2721 * the work struct is scheduled.
2722 */
2723static noinline int schedule_bio(struct btrfs_root *root,
2724 struct btrfs_device *device,
2725 int rw, struct bio *bio)
2726{
2727 int should_queue = 1;
2728
2729 /* don't bother with additional async steps for reads, right now */
2730 if (!(rw & (1 << BIO_RW))) {
2731 bio_get(bio);
2732 submit_bio(rw, bio);
2733 bio_put(bio);
2734 return 0;
2735 }
2736
2737 /*
2738 * nr_async_bios allows us to reliably return congestion to the
2739 * higher layers. Otherwise, the async bio makes it appear we have
2740 * made progress against dirty pages when we've really just put it
2741 * on a queue for later
2742 */
2743 atomic_inc(&root->fs_info->nr_async_bios);
2744 WARN_ON(bio->bi_next);
2745 bio->bi_next = NULL;
2746 bio->bi_rw |= rw;
2747
2748 spin_lock(&device->io_lock);
2749
2750 if (device->pending_bio_tail)
2751 device->pending_bio_tail->bi_next = bio;
2752
2753 device->pending_bio_tail = bio;
2754 if (!device->pending_bios)
2755 device->pending_bios = bio;
2756 if (device->running_pending)
2757 should_queue = 0;
2758
2759 spin_unlock(&device->io_lock);
2760
2761 if (should_queue)
2762 btrfs_queue_worker(&root->fs_info->submit_workers,
2763 &device->work);
2764 return 0;
2765}
2766
2767int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2768 int mirror_num, int async_submit)
2769{
2770 struct btrfs_mapping_tree *map_tree;
2771 struct btrfs_device *dev;
2772 struct bio *first_bio = bio;
2773 u64 logical = (u64)bio->bi_sector << 9;
2774 u64 length = 0;
2775 u64 map_length;
2776 struct btrfs_multi_bio *multi = NULL;
2777 int ret;
2778 int dev_nr = 0;
2779 int total_devs = 1;
2780
2781 length = bio->bi_size;
2782 map_tree = &root->fs_info->mapping_tree;
2783 map_length = length;
2784
2785 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2786 mirror_num);
2787 BUG_ON(ret);
2788
2789 total_devs = multi->num_stripes;
2790 if (map_length < length) {
2791 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2792 "len %llu\n", (unsigned long long)logical,
2793 (unsigned long long)length,
2794 (unsigned long long)map_length);
2795 BUG();
2796 }
2797 multi->end_io = first_bio->bi_end_io;
2798 multi->private = first_bio->bi_private;
2799 multi->orig_bio = first_bio;
2800 atomic_set(&multi->stripes_pending, multi->num_stripes);
2801
2802 while (dev_nr < total_devs) {
2803 if (total_devs > 1) {
2804 if (dev_nr < total_devs - 1) {
2805 bio = bio_clone(first_bio, GFP_NOFS);
2806 BUG_ON(!bio);
2807 } else {
2808 bio = first_bio;
2809 }
2810 bio->bi_private = multi;
2811 bio->bi_end_io = end_bio_multi_stripe;
2812 }
2813 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2814 dev = multi->stripes[dev_nr].dev;
2815 BUG_ON(rw == WRITE && !dev->writeable);
2816 if (dev && dev->bdev) {
2817 bio->bi_bdev = dev->bdev;
2818 if (async_submit)
2819 schedule_bio(root, dev, rw, bio);
2820 else
2821 submit_bio(rw, bio);
2822 } else {
2823 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2824 bio->bi_sector = logical >> 9;
2825 bio_endio(bio, -EIO);
2826 }
2827 dev_nr++;
2828 }
2829 if (total_devs == 1)
2830 kfree(multi);
2831 return 0;
2832}
2833
2834struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2835 u8 *uuid, u8 *fsid)
2836{
2837 struct btrfs_device *device;
2838 struct btrfs_fs_devices *cur_devices;
2839
2840 cur_devices = root->fs_info->fs_devices;
2841 while (cur_devices) {
2842 if (!fsid ||
2843 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2844 device = __find_device(&cur_devices->devices,
2845 devid, uuid);
2846 if (device)
2847 return device;
2848 }
2849 cur_devices = cur_devices->seed;
2850 }
2851 return NULL;
2852}
2853
2854static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2855 u64 devid, u8 *dev_uuid)
2856{
2857 struct btrfs_device *device;
2858 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2859
2860 device = kzalloc(sizeof(*device), GFP_NOFS);
2861 if (!device)
2862 return NULL;
2863 list_add(&device->dev_list,
2864 &fs_devices->devices);
2865 device->barriers = 1;
2866 device->dev_root = root->fs_info->dev_root;
2867 device->devid = devid;
2868 device->work.func = pending_bios_fn;
2869 device->fs_devices = fs_devices;
2870 fs_devices->num_devices++;
2871 spin_lock_init(&device->io_lock);
2872 INIT_LIST_HEAD(&device->dev_alloc_list);
2873 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2874 return device;
2875}
2876
2877static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2878 struct extent_buffer *leaf,
2879 struct btrfs_chunk *chunk)
2880{
2881 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2882 struct map_lookup *map;
2883 struct extent_map *em;
2884 u64 logical;
2885 u64 length;
2886 u64 devid;
2887 u8 uuid[BTRFS_UUID_SIZE];
2888 int num_stripes;
2889 int ret;
2890 int i;
2891
2892 logical = key->offset;
2893 length = btrfs_chunk_length(leaf, chunk);
2894
2895 spin_lock(&map_tree->map_tree.lock);
2896 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2897 spin_unlock(&map_tree->map_tree.lock);
2898
2899 /* already mapped? */
2900 if (em && em->start <= logical && em->start + em->len > logical) {
2901 free_extent_map(em);
2902 return 0;
2903 } else if (em) {
2904 free_extent_map(em);
2905 }
2906
2907 map = kzalloc(sizeof(*map), GFP_NOFS);
2908 if (!map)
2909 return -ENOMEM;
2910
2911 em = alloc_extent_map(GFP_NOFS);
2912 if (!em)
2913 return -ENOMEM;
2914 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2915 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2916 if (!map) {
2917 free_extent_map(em);
2918 return -ENOMEM;
2919 }
2920
2921 em->bdev = (struct block_device *)map;
2922 em->start = logical;
2923 em->len = length;
2924 em->block_start = 0;
2925 em->block_len = em->len;
2926
2927 map->num_stripes = num_stripes;
2928 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2929 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2930 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2931 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2932 map->type = btrfs_chunk_type(leaf, chunk);
2933 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2934 for (i = 0; i < num_stripes; i++) {
2935 map->stripes[i].physical =
2936 btrfs_stripe_offset_nr(leaf, chunk, i);
2937 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2938 read_extent_buffer(leaf, uuid, (unsigned long)
2939 btrfs_stripe_dev_uuid_nr(chunk, i),
2940 BTRFS_UUID_SIZE);
2941 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2942 NULL);
2943 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2944 kfree(map);
2945 free_extent_map(em);
2946 return -EIO;
2947 }
2948 if (!map->stripes[i].dev) {
2949 map->stripes[i].dev =
2950 add_missing_dev(root, devid, uuid);
2951 if (!map->stripes[i].dev) {
2952 kfree(map);
2953 free_extent_map(em);
2954 return -EIO;
2955 }
2956 }
2957 map->stripes[i].dev->in_fs_metadata = 1;
2958 }
2959
2960 spin_lock(&map_tree->map_tree.lock);
2961 ret = add_extent_mapping(&map_tree->map_tree, em);
2962 spin_unlock(&map_tree->map_tree.lock);
2963 BUG_ON(ret);
2964 free_extent_map(em);
2965
2966 return 0;
2967}
2968
2969static int fill_device_from_item(struct extent_buffer *leaf,
2970 struct btrfs_dev_item *dev_item,
2971 struct btrfs_device *device)
2972{
2973 unsigned long ptr;
2974
2975 device->devid = btrfs_device_id(leaf, dev_item);
2976 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2977 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2978 device->type = btrfs_device_type(leaf, dev_item);
2979 device->io_align = btrfs_device_io_align(leaf, dev_item);
2980 device->io_width = btrfs_device_io_width(leaf, dev_item);
2981 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2982
2983 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2984 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2985
2986 return 0;
2987}
2988
2989static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2990{
2991 struct btrfs_fs_devices *fs_devices;
2992 int ret;
2993
2994 mutex_lock(&uuid_mutex);
2995
2996 fs_devices = root->fs_info->fs_devices->seed;
2997 while (fs_devices) {
2998 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2999 ret = 0;
3000 goto out;
3001 }
3002 fs_devices = fs_devices->seed;
3003 }
3004
3005 fs_devices = find_fsid(fsid);
3006 if (!fs_devices) {
3007 ret = -ENOENT;
3008 goto out;
3009 }
3010
3011 fs_devices = clone_fs_devices(fs_devices);
3012 if (IS_ERR(fs_devices)) {
3013 ret = PTR_ERR(fs_devices);
3014 goto out;
3015 }
3016
3017 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3018 root->fs_info->bdev_holder);
3019 if (ret)
3020 goto out;
3021
3022 if (!fs_devices->seeding) {
3023 __btrfs_close_devices(fs_devices);
3024 free_fs_devices(fs_devices);
3025 ret = -EINVAL;
3026 goto out;
3027 }
3028
3029 fs_devices->seed = root->fs_info->fs_devices->seed;
3030 root->fs_info->fs_devices->seed = fs_devices;
3031out:
3032 mutex_unlock(&uuid_mutex);
3033 return ret;
3034}
3035
3036static int read_one_dev(struct btrfs_root *root,
3037 struct extent_buffer *leaf,
3038 struct btrfs_dev_item *dev_item)
3039{
3040 struct btrfs_device *device;
3041 u64 devid;
3042 int ret;
3043 u8 fs_uuid[BTRFS_UUID_SIZE];
3044 u8 dev_uuid[BTRFS_UUID_SIZE];
3045
3046 devid = btrfs_device_id(leaf, dev_item);
3047 read_extent_buffer(leaf, dev_uuid,
3048 (unsigned long)btrfs_device_uuid(dev_item),
3049 BTRFS_UUID_SIZE);
3050 read_extent_buffer(leaf, fs_uuid,
3051 (unsigned long)btrfs_device_fsid(dev_item),
3052 BTRFS_UUID_SIZE);
3053
3054 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3055 ret = open_seed_devices(root, fs_uuid);
3056 if (ret && !btrfs_test_opt(root, DEGRADED))
3057 return ret;
3058 }
3059
3060 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3061 if (!device || !device->bdev) {
3062 if (!btrfs_test_opt(root, DEGRADED))
3063 return -EIO;
3064
3065 if (!device) {
3066 printk(KERN_WARNING "warning devid %llu missing\n",
3067 (unsigned long long)devid);
3068 device = add_missing_dev(root, devid, dev_uuid);
3069 if (!device)
3070 return -ENOMEM;
3071 }
3072 }
3073
3074 if (device->fs_devices != root->fs_info->fs_devices) {
3075 BUG_ON(device->writeable);
3076 if (device->generation !=
3077 btrfs_device_generation(leaf, dev_item))
3078 return -EINVAL;
3079 }
3080
3081 fill_device_from_item(leaf, dev_item, device);
3082 device->dev_root = root->fs_info->dev_root;
3083 device->in_fs_metadata = 1;
3084 if (device->writeable)
3085 device->fs_devices->total_rw_bytes += device->total_bytes;
3086 ret = 0;
3087 return ret;
3088}
3089
3090int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3091{
3092 struct btrfs_dev_item *dev_item;
3093
3094 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3095 dev_item);
3096 return read_one_dev(root, buf, dev_item);
3097}
3098
3099int btrfs_read_sys_array(struct btrfs_root *root)
3100{
3101 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3102 struct extent_buffer *sb;
3103 struct btrfs_disk_key *disk_key;
3104 struct btrfs_chunk *chunk;
3105 u8 *ptr;
3106 unsigned long sb_ptr;
3107 int ret = 0;
3108 u32 num_stripes;
3109 u32 array_size;
3110 u32 len = 0;
3111 u32 cur;
3112 struct btrfs_key key;
3113
3114 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3115 BTRFS_SUPER_INFO_SIZE);
3116 if (!sb)
3117 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb);
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy);
3121
3122 ptr = super_copy->sys_chunk_array;
3123 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3124 cur = 0;
3125
3126 while (cur < array_size) {
3127 disk_key = (struct btrfs_disk_key *)ptr;
3128 btrfs_disk_key_to_cpu(&key, disk_key);
3129
3130 len = sizeof(*disk_key); ptr += len;
3131 sb_ptr += len;
3132 cur += len;
3133
3134 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3135 chunk = (struct btrfs_chunk *)sb_ptr;
3136 ret = read_one_chunk(root, &key, sb, chunk);
3137 if (ret)
3138 break;
3139 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3140 len = btrfs_chunk_item_size(num_stripes);
3141 } else {
3142 ret = -EIO;
3143 break;
3144 }
3145 ptr += len;
3146 sb_ptr += len;
3147 cur += len;
3148 }
3149 free_extent_buffer(sb);
3150 return ret;
3151}
3152
3153int btrfs_read_chunk_tree(struct btrfs_root *root)
3154{
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_key key;
3158 struct btrfs_key found_key;
3159 int ret;
3160 int slot;
3161
3162 root = root->fs_info->chunk_root;
3163
3164 path = btrfs_alloc_path();
3165 if (!path)
3166 return -ENOMEM;
3167
3168 /* first we search for all of the device items, and then we
3169 * read in all of the chunk items. This way we can create chunk
3170 * mappings that reference all of the devices that are afound
3171 */
3172 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3173 key.offset = 0;
3174 key.type = 0;
3175again:
3176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177 while (1) {
3178 leaf = path->nodes[0];
3179 slot = path->slots[0];
3180 if (slot >= btrfs_header_nritems(leaf)) {
3181 ret = btrfs_next_leaf(root, path);
3182 if (ret == 0)
3183 continue;
3184 if (ret < 0)
3185 goto error;
3186 break;
3187 }
3188 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3189 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3190 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3191 break;
3192 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3193 struct btrfs_dev_item *dev_item;
3194 dev_item = btrfs_item_ptr(leaf, slot,
3195 struct btrfs_dev_item);
3196 ret = read_one_dev(root, leaf, dev_item);
3197 if (ret)
3198 goto error;
3199 }
3200 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3201 struct btrfs_chunk *chunk;
3202 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3203 ret = read_one_chunk(root, &found_key, leaf, chunk);
3204 if (ret)
3205 goto error;
3206 }
3207 path->slots[0]++;
3208 }
3209 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3210 key.objectid = 0;
3211 btrfs_release_path(root, path);
3212 goto again;
3213 }
3214 ret = 0;
3215error:
3216 btrfs_free_path(path);
3217 return ret;
3218}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int writeable;
38 int in_fs_metadata;
39
40 spinlock_t io_lock;
41
42 struct block_device *bdev;
43
44 /* the mode sent to open_bdev_exclusive */
45 fmode_t mode;
46
47 char *name;
48
49 /* the internal btrfs device id */
50 u64 devid;
51
52 /* size of the device */
53 u64 total_bytes;
54
55 /* bytes used */
56 u64 bytes_used;
57
58 /* optimal io alignment for this device */
59 u32 io_align;
60
61 /* optimal io width for this device */
62 u32 io_width;
63
64 /* minimal io size for this device */
65 u32 sector_size;
66
67 /* type and info about this device */
68 u64 type;
69
70 /* physical drive uuid (or lvm uuid) */
71 u8 uuid[BTRFS_UUID_SIZE];
72
73 struct btrfs_work work;
74};
75
76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78
79 /* the device with this id has the most recent coyp of the super */
80 u64 latest_devid;
81 u64 latest_trans;
82 u64 num_devices;
83 u64 open_devices;
84 u64 rw_devices;
85 u64 total_rw_bytes;
86 struct block_device *latest_bdev;
87 /* all of the devices in the FS */
88 struct list_head devices;
89
90 /* devices not currently being allocated */
91 struct list_head alloc_list;
92 struct list_head list;
93
94 struct btrfs_fs_devices *seed;
95 int seeding;
96
97 int opened;
98};
99
100struct btrfs_bio_stripe {
101 struct btrfs_device *dev;
102 u64 physical;
103};
104
105struct btrfs_multi_bio {
106 atomic_t stripes_pending;
107 bio_end_io_t *end_io;
108 struct bio *orig_bio;
109 void *private;
110 atomic_t error;
111 int max_errors;
112 int num_stripes;
113 struct btrfs_bio_stripe stripes[];
114};
115
/* bytes taken by a struct btrfs_multi_bio that carries n stripes */
#define btrfs_multi_bio_size(n)					\
	(sizeof(struct btrfs_multi_bio) +			\
	 (n) * sizeof(struct btrfs_bio_stripe))
118
119int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
120 struct btrfs_device *device,
121 u64 chunk_tree, u64 chunk_objectid,
122 u64 chunk_offset, u64 start, u64 num_bytes);
123int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
124 u64 logical, u64 *length,
125 struct btrfs_multi_bio **multi_ret, int mirror_num);
126int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
127 u64 chunk_start, u64 physical, u64 devid,
128 u64 **logical, int *naddrs, int *stripe_len);
129int btrfs_read_sys_array(struct btrfs_root *root);
130int btrfs_read_chunk_tree(struct btrfs_root *root);
131int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
132 struct btrfs_root *extent_root, u64 type);
133void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
134void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
135int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
136 int mirror_num, int async_submit);
137int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
138int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
139 fmode_t flags, void *holder);
140int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
141 struct btrfs_fs_devices **fs_devices_ret);
142int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
143int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
144int btrfs_add_device(struct btrfs_trans_handle *trans,
145 struct btrfs_root *root,
146 struct btrfs_device *device);
147int btrfs_rm_device(struct btrfs_root *root, char *device_path);
148int btrfs_cleanup_fs_uuids(void);
149int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
150int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
151 u64 logical, struct page *page);
152int btrfs_grow_device(struct btrfs_trans_handle *trans,
153 struct btrfs_device *device, u64 new_size);
154struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
155 u8 *uuid, u8 *fsid);
156int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
157int btrfs_init_new_device(struct btrfs_root *root, char *path);
158int btrfs_balance(struct btrfs_root *dev_root);
159void btrfs_unlock_volumes(void);
160void btrfs_lock_volumes(void);
161int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first lets see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, lets remove it */
101 if (di) {
102 /* if we want create only exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
/*
 * Plan: call deflate() with avail_in == *sourcelen,
 * avail_out = *dstlen - 12 and flush == Z_FINISH.
 * If it doesn't manage to finish, call it again with
 * avail_in == 0 and avail_out set to the remaining 12
 * bytes for it to clean up.
 *
 * Q: Is 12 bytes sufficient?
 */
#define STREAM_END_SPACE 12
43
44struct workspace {
45 z_stream inf_strm;
46 z_stream def_strm;
47 char *buf;
48 struct list_head list;
49};
50
51static LIST_HEAD(idle_workspace);
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (!workspace)
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (!workspace)
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the last byte of the workspace buffer */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (!workspace)
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
/* teardown hook: release every idle zlib workspace */
void btrfs_zlib_exit(void)
{
	free_workspaces();
}