author	Chris Mason <chris.mason@oracle.com>	2008-10-16 10:12:58 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-10-16 10:12:58 -0400
commit	26ce34a9c47334ff7984769e4661b2f1883594ff (patch)
tree	fbc1da082a0b9f4adc7a2560811bbd6e6c47e420
parent	3fa8749e584b55f1180411ab1b51117190bac1e5 (diff)
parent	37d3cdddf5378606f6eab99982ca16819745d8f4 (diff)
Merge branch 'master' of ssh://mason@master.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
-rw-r--r--	fs/Kconfig	17
-rw-r--r--	fs/Makefile	1
-rw-r--r--	fs/btrfs/COPYING	356
-rw-r--r--	fs/btrfs/INSTALL	48
-rw-r--r--	fs/btrfs/Makefile	24
-rw-r--r--	fs/btrfs/acl.c	352
-rw-r--r--	fs/btrfs/async-thread.c	357
-rw-r--r--	fs/btrfs/async-thread.h	85
-rw-r--r--	fs/btrfs/btrfs_inode.h	133
-rw-r--r--	fs/btrfs/compat.h	24
-rw-r--r--	fs/btrfs/crc32c.h	120
-rw-r--r--	fs/btrfs/ctree.c	3716
-rw-r--r--	fs/btrfs/ctree.h	1891
-rw-r--r--	fs/btrfs/dir-item.c	386
-rw-r--r--	fs/btrfs/disk-io.c	2078
-rw-r--r--	fs/btrfs/disk-io.h	84
-rw-r--r--	fs/btrfs/export.c	201
-rw-r--r--	fs/btrfs/export.h	19
-rw-r--r--	fs/btrfs/extent-tree.c	5253
-rw-r--r--	fs/btrfs/extent_io.c	3416
-rw-r--r--	fs/btrfs/extent_io.h	248
-rw-r--r--	fs/btrfs/extent_map.c	342
-rw-r--r--	fs/btrfs/extent_map.h	57
-rw-r--r--	fs/btrfs/file-item.c	512
-rw-r--r--	fs/btrfs/file.c	1178
-rw-r--r--	fs/btrfs/free-space-cache.c	449
-rw-r--r--	fs/btrfs/hash.h	27
-rw-r--r--	fs/btrfs/inode-item.c	206
-rw-r--r--	fs/btrfs/inode-map.c	145
-rw-r--r--	fs/btrfs/inode.c	3908
-rw-r--r--	fs/btrfs/ioctl.c	863
-rw-r--r--	fs/btrfs/ioctl.h	55
-rw-r--r--	fs/btrfs/locking.c	87
-rw-r--r--	fs/btrfs/locking.h	27
-rw-r--r--	fs/btrfs/ordered-data.c	727
-rw-r--r--	fs/btrfs/ordered-data.h	149
-rw-r--r--	fs/btrfs/orphan.c	67
-rw-r--r--	fs/btrfs/print-tree.c	200
-rw-r--r--	fs/btrfs/print-tree.h	23
-rw-r--r--	fs/btrfs/ref-cache.c	230
-rw-r--r--	fs/btrfs/ref-cache.h	77
-rw-r--r--	fs/btrfs/root-tree.c	277
-rw-r--r--	fs/btrfs/struct-funcs.c	132
-rw-r--r--	fs/btrfs/super.c	659
-rw-r--r--	fs/btrfs/sysfs.c	268
-rw-r--r--	fs/btrfs/transaction.c	1023
-rw-r--r--	fs/btrfs/transaction.h	104
-rw-r--r--	fs/btrfs/tree-defrag.c	149
-rw-r--r--	fs/btrfs/tree-log.c	2890
-rw-r--r--	fs/btrfs/tree-log.h	41
-rw-r--r--	fs/btrfs/version.h	4
-rw-r--r--	fs/btrfs/version.sh	43
-rw-r--r--	fs/btrfs/volumes.c	2549
-rw-r--r--	fs/btrfs/volumes.h	150
-rw-r--r--	fs/btrfs/xattr.c	321
-rw-r--r--	fs/btrfs/xattr.h	39
56 files changed, 36787 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5dab9a8..18f5a85b47c6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -497,6 +497,23 @@ config OCFS2_DEBUG_FS
497	  this option for debugging only as it is likely to decrease
498	  performance of the filesystem.
499
500config BTRFS_FS
501 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
502 depends on EXPERIMENTAL
503 select LIBCRC32C
504 help
505 Btrfs is a new filesystem with extents, writable snapshotting,
506 support for multiple devices and many more features.
507
508 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
509 FINALIZED. You should say N here unless you are interested in
510 testing Btrfs with non-critical data.
511
512 To compile this file system support as a module, choose M here. The
513 module will be called btrfs.
514
515 If unsure, say N.
516
517endif # BLOCK
518
519config DNOTIFY
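
As a rough illustration of the help text above (the exact commands are assumptions, not part of this commit), building btrfs as a module from a configured kernel tree looks something like:

	make menuconfig                      # set CONFIG_BTRFS_FS=m (File systems menu)
	make modules && make modules_install
	modprobe btrfs                       # LIBCRC32C is selected and loaded automatically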
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff15..41fcc858c972 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,4 +121,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
121obj-$(CONFIG_HPPFS)		+= hppfs/
122obj-$(CONFIG_DEBUG_FS)		+= debugfs/
123obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/
125obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 000000000000..ca442d313d86
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
1
2 NOTE! This copyright does *not* cover user programs that use kernel
3 services by normal system calls - this is merely considered normal use
4 of the kernel, and does *not* fall under the heading of "derived work".
5 Also note that the GPL below is copyrighted by the Free Software
6 Foundation, but the instance of code that it refers to (the Linux
7 kernel) is copyrighted by me and others who actually wrote it.
8
9 Also note that the only valid version of the GPL as far as the kernel
10 is concerned is _this_ particular version of the license (ie v2, not
11 v2.2 or v3.x or whatever), unless explicitly otherwise stated.
12
13 Linus Torvalds
14
15----------------------------------------
16
17 GNU GENERAL PUBLIC LICENSE
18 Version 2, June 1991
19
20 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 Everyone is permitted to copy and distribute verbatim copies
23 of this license document, but changing it is not allowed.
24
25 Preamble
26
27 The licenses for most software are designed to take away your
28freedom to share and change it. By contrast, the GNU General Public
29License is intended to guarantee your freedom to share and change free
30software--to make sure the software is free for all its users. This
31General Public License applies to most of the Free Software
32Foundation's software and to any other program whose authors commit to
33using it. (Some other Free Software Foundation software is covered by
34the GNU Library General Public License instead.) You can apply it to
35your programs, too.
36
37 When we speak of free software, we are referring to freedom, not
38price. Our General Public Licenses are designed to make sure that you
39have the freedom to distribute copies of free software (and charge for
40this service if you wish), that you receive source code or can get it
41if you want it, that you can change the software or use pieces of it
42in new free programs; and that you know you can do these things.
43
44 To protect your rights, we need to make restrictions that forbid
45anyone to deny you these rights or to ask you to surrender the rights.
46These restrictions translate to certain responsibilities for you if you
47distribute copies of the software, or if you modify it.
48
49 For example, if you distribute copies of such a program, whether
50gratis or for a fee, you must give the recipients all the rights that
51you have. You must make sure that they, too, receive or can get the
52source code. And you must show them these terms so they know their
53rights.
54
55 We protect your rights with two steps: (1) copyright the software, and
56(2) offer you this license which gives you legal permission to copy,
57distribute and/or modify the software.
58
59 Also, for each author's protection and ours, we want to make certain
60that everyone understands that there is no warranty for this free
61software. If the software is modified by someone else and passed on, we
62want its recipients to know that what they have is not the original, so
63that any problems introduced by others will not reflect on the original
64authors' reputations.
65
66 Finally, any free program is threatened constantly by software
67patents. We wish to avoid the danger that redistributors of a free
68program will individually obtain patent licenses, in effect making the
69program proprietary. To prevent this, we have made it clear that any
70patent must be licensed for everyone's free use or not licensed at all.
71
72 The precise terms and conditions for copying, distribution and
73modification follow.
74
75 GNU GENERAL PUBLIC LICENSE
76 TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
77
78 0. This License applies to any program or other work which contains
79a notice placed by the copyright holder saying it may be distributed
80under the terms of this General Public License. The "Program", below,
81refers to any such program or work, and a "work based on the Program"
82means either the Program or any derivative work under copyright law:
83that is to say, a work containing the Program or a portion of it,
84either verbatim or with modifications and/or translated into another
85language. (Hereinafter, translation is included without limitation in
86the term "modification".) Each licensee is addressed as "you".
87
88Activities other than copying, distribution and modification are not
89covered by this License; they are outside its scope. The act of
90running the Program is not restricted, and the output from the Program
91is covered only if its contents constitute a work based on the
92Program (independent of having been made by running the Program).
93Whether that is true depends on what the Program does.
94
95 1. You may copy and distribute verbatim copies of the Program's
96source code as you receive it, in any medium, provided that you
97conspicuously and appropriately publish on each copy an appropriate
98copyright notice and disclaimer of warranty; keep intact all the
99notices that refer to this License and to the absence of any warranty;
100and give any other recipients of the Program a copy of this License
101along with the Program.
102
103You may charge a fee for the physical act of transferring a copy, and
104you may at your option offer warranty protection in exchange for a fee.
105
106 2. You may modify your copy or copies of the Program or any portion
107of it, thus forming a work based on the Program, and copy and
108distribute such modifications or work under the terms of Section 1
109above, provided that you also meet all of these conditions:
110
111 a) You must cause the modified files to carry prominent notices
112 stating that you changed the files and the date of any change.
113
114 b) You must cause any work that you distribute or publish, that in
115 whole or in part contains or is derived from the Program or any
116 part thereof, to be licensed as a whole at no charge to all third
117 parties under the terms of this License.
118
119 c) If the modified program normally reads commands interactively
120 when run, you must cause it, when started running for such
121 interactive use in the most ordinary way, to print or display an
122 announcement including an appropriate copyright notice and a
123 notice that there is no warranty (or else, saying that you provide
124 a warranty) and that users may redistribute the program under
125 these conditions, and telling the user how to view a copy of this
126 License. (Exception: if the Program itself is interactive but
127 does not normally print such an announcement, your work based on
128 the Program is not required to print an announcement.)
129
130These requirements apply to the modified work as a whole. If
131identifiable sections of that work are not derived from the Program,
132and can be reasonably considered independent and separate works in
133themselves, then this License, and its terms, do not apply to those
134sections when you distribute them as separate works. But when you
135distribute the same sections as part of a whole which is a work based
136on the Program, the distribution of the whole must be on the terms of
137this License, whose permissions for other licensees extend to the
138entire whole, and thus to each and every part regardless of who wrote it.
139
140Thus, it is not the intent of this section to claim rights or contest
141your rights to work written entirely by you; rather, the intent is to
142exercise the right to control the distribution of derivative or
143collective works based on the Program.
144
145In addition, mere aggregation of another work not based on the Program
146with the Program (or with a work based on the Program) on a volume of
147a storage or distribution medium does not bring the other work under
148the scope of this License.
149
150 3. You may copy and distribute the Program (or a work based on it,
151under Section 2) in object code or executable form under the terms of
152Sections 1 and 2 above provided that you also do one of the following:
153
154 a) Accompany it with the complete corresponding machine-readable
155 source code, which must be distributed under the terms of Sections
156 1 and 2 above on a medium customarily used for software interchange; or,
157
158 b) Accompany it with a written offer, valid for at least three
159 years, to give any third party, for a charge no more than your
160 cost of physically performing source distribution, a complete
161 machine-readable copy of the corresponding source code, to be
162 distributed under the terms of Sections 1 and 2 above on a medium
163 customarily used for software interchange; or,
164
165 c) Accompany it with the information you received as to the offer
166 to distribute corresponding source code. (This alternative is
167 allowed only for noncommercial distribution and only if you
168 received the program in object code or executable form with such
169 an offer, in accord with Subsection b above.)
170
171The source code for a work means the preferred form of the work for
172making modifications to it. For an executable work, complete source
173code means all the source code for all modules it contains, plus any
174associated interface definition files, plus the scripts used to
175control compilation and installation of the executable. However, as a
176special exception, the source code distributed need not include
177anything that is normally distributed (in either source or binary
178form) with the major components (compiler, kernel, and so on) of the
179operating system on which the executable runs, unless that component
180itself accompanies the executable.
181
182If distribution of executable or object code is made by offering
183access to copy from a designated place, then offering equivalent
184access to copy the source code from the same place counts as
185distribution of the source code, even though third parties are not
186compelled to copy the source along with the object code.
187
188 4. You may not copy, modify, sublicense, or distribute the Program
189except as expressly provided under this License. Any attempt
190otherwise to copy, modify, sublicense or distribute the Program is
191void, and will automatically terminate your rights under this License.
192However, parties who have received copies, or rights, from you under
193this License will not have their licenses terminated so long as such
194parties remain in full compliance.
195
196 5. You are not required to accept this License, since you have not
197signed it. However, nothing else grants you permission to modify or
198distribute the Program or its derivative works. These actions are
199prohibited by law if you do not accept this License. Therefore, by
200modifying or distributing the Program (or any work based on the
201Program), you indicate your acceptance of this License to do so, and
202all its terms and conditions for copying, distributing or modifying
203the Program or works based on it.
204
205 6. Each time you redistribute the Program (or any work based on the
206Program), the recipient automatically receives a license from the
207original licensor to copy, distribute or modify the Program subject to
208these terms and conditions. You may not impose any further
209restrictions on the recipients' exercise of the rights granted herein.
210You are not responsible for enforcing compliance by third parties to
211this License.
212
213 7. If, as a consequence of a court judgment or allegation of patent
214infringement or for any other reason (not limited to patent issues),
215conditions are imposed on you (whether by court order, agreement or
216otherwise) that contradict the conditions of this License, they do not
217excuse you from the conditions of this License. If you cannot
218distribute so as to satisfy simultaneously your obligations under this
219License and any other pertinent obligations, then as a consequence you
220may not distribute the Program at all. For example, if a patent
221license would not permit royalty-free redistribution of the Program by
222all those who receive copies directly or indirectly through you, then
223the only way you could satisfy both it and this License would be to
224refrain entirely from distribution of the Program.
225
226If any portion of this section is held invalid or unenforceable under
227any particular circumstance, the balance of the section is intended to
228apply and the section as a whole is intended to apply in other
229circumstances.
230
231It is not the purpose of this section to induce you to infringe any
232patents or other property right claims or to contest validity of any
233such claims; this section has the sole purpose of protecting the
234integrity of the free software distribution system, which is
235implemented by public license practices. Many people have made
236generous contributions to the wide range of software distributed
237through that system in reliance on consistent application of that
238system; it is up to the author/donor to decide if he or she is willing
239to distribute software through any other system and a licensee cannot
240impose that choice.
241
242This section is intended to make thoroughly clear what is believed to
243be a consequence of the rest of this License.
244
245 8. If the distribution and/or use of the Program is restricted in
246certain countries either by patents or by copyrighted interfaces, the
247original copyright holder who places the Program under this License
248may add an explicit geographical distribution limitation excluding
249those countries, so that distribution is permitted only in or among
250countries not thus excluded. In such case, this License incorporates
251the limitation as if written in the body of this License.
252
253 9. The Free Software Foundation may publish revised and/or new versions
254of the General Public License from time to time. Such new versions will
255be similar in spirit to the present version, but may differ in detail to
256address new problems or concerns.
257
258Each version is given a distinguishing version number. If the Program
259specifies a version number of this License which applies to it and "any
260later version", you have the option of following the terms and conditions
261either of that version or of any later version published by the Free
262Software Foundation. If the Program does not specify a version number of
263this License, you may choose any version ever published by the Free Software
264Foundation.
265
266 10. If you wish to incorporate parts of the Program into other free
267programs whose distribution conditions are different, write to the author
268to ask for permission. For software which is copyrighted by the Free
269Software Foundation, write to the Free Software Foundation; we sometimes
270make exceptions for this. Our decision will be guided by the two goals
271of preserving the free status of all derivatives of our free software and
272of promoting the sharing and reuse of software generally.
273
274 NO WARRANTY
275
276 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
277FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
278OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
279PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
280OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
281MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
282TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
283PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
284REPAIR OR CORRECTION.
285
286 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
287WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
288REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
289INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
290OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
291TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
292YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
293PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
294POSSIBILITY OF SUCH DAMAGES.
295
296 END OF TERMS AND CONDITIONS
297
298 How to Apply These Terms to Your New Programs
299
300 If you develop a new program, and you want it to be of the greatest
301possible use to the public, the best way to achieve this is to make it
302free software which everyone can redistribute and change under these terms.
303
304 To do so, attach the following notices to the program. It is safest
305to attach them to the start of each source file to most effectively
306convey the exclusion of warranty; and each file should have at least
307the "copyright" line and a pointer to where the full notice is found.
308
309 <one line to give the program's name and a brief idea of what it does.>
310 Copyright (C) <year> <name of author>
311
312 This program is free software; you can redistribute it and/or modify
313 it under the terms of the GNU General Public License as published by
314 the Free Software Foundation; either version 2 of the License, or
315 (at your option) any later version.
316
317 This program is distributed in the hope that it will be useful,
318 but WITHOUT ANY WARRANTY; without even the implied warranty of
319 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
320 GNU General Public License for more details.
321
322 You should have received a copy of the GNU General Public License
323 along with this program; if not, write to the Free Software
324 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
325
326
327Also add information on how to contact you by electronic and paper mail.
328
329If the program is interactive, make it output a short notice like this
330when it starts in an interactive mode:
331
332 Gnomovision version 69, Copyright (C) year name of author
333 Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
334 This is free software, and you are welcome to redistribute it
335 under certain conditions; type `show c' for details.
336
337The hypothetical commands `show w' and `show c' should show the appropriate
338parts of the General Public License. Of course, the commands you use may
339be called something other than `show w' and `show c'; they could even be
340mouse-clicks or menu items--whatever suits your program.
341
342You should also get your employer (if you work as a programmer) or your
343school, if any, to sign a "copyright disclaimer" for the program, if
344necessary. Here is a sample; alter the names:
345
346 Yoyodyne, Inc., hereby disclaims all copyright interest in the program
347 `Gnomovision' (which makes passes at compilers) written by James Hacker.
348
349 <signature of Ty Coon>, 1 April 1989
350 Ty Coon, President of Vice
351
352This General Public License does not permit incorporating your program into
353proprietary programs. If your program is a subroutine library, you may
354consider it more useful to permit linking proprietary applications with the
355library. If this is what you want to do, use the GNU Library General
356Public License instead of this License.
diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
new file mode 100644
index 000000000000..16b45a56878d
--- /dev/null
+++ b/fs/btrfs/INSTALL
@@ -0,0 +1,48 @@
1Install Instructions
2
3Btrfs puts snapshots and subvolumes into the root directory of the FS. This
4directory can only be changed by btrfsctl right now, and normal filesystem
5operations do not work on it. The default subvolume is called 'default',
6and you can create files and directories in mount_point/default.
7
8Btrfs uses libcrc32c in the kernel for file and metadata checksums. You need
9to compile the kernel with:
10
11CONFIG_LIBCRC32C=m
12
13libcrc32c can be static as well. Once your kernel is set up, typing make in the
14btrfs module sources will build against the running kernel. When the build is
15complete:
16
17modprobe libcrc32c
18insmod btrfs.ko
19
20The Btrfs utility programs require libuuid to build. This can be found
21in the e2fsprogs sources, and is usually available as libuuid or
22e2fsprogs-devel from various distros.
23
24Building the utilities is just make ; make install. The programs go
25into /usr/local/bin. The commands available are:
26
27mkfs.btrfs: create a filesystem
28
29btrfsctl: control program to create snapshots and subvolumes:
30
31 mount /dev/sda2 /mnt
32 btrfsctl -s new_subvol_name /mnt
33 btrfsctl -s snapshot_of_default /mnt/default
34 btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
35 btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
36 ls /mnt
37 default snapshot_of_a_snapshot snapshot_of_new_subvol
38 new_subvol_name snapshot_of_default
39
40 Snapshots and subvolumes cannot be deleted right now, but you can
41 rm -rf all the files and directories inside them.
42
43btrfsck: do a limited check of the FS extent trees.
44
45debug-tree: print all of the FS metadata in text form. Example:
46
47 debug-tree /dev/sda2 >& big_output_file
48
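A minimal end-to-end mkfs.btrfs session (illustrative only; the device name is reused from the snapshot example above):

	mkfs.btrfs /dev/sda2
	mount /dev/sda2 /mnt
	ls /mnt
	default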
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..7125716e142b
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,24 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o
11else
12
13# Normal Makefile
14
15KERNELDIR := /lib/modules/`uname -r`/build
16all:
17 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
18
19modules_install:
20 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
21clean:
22 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
23
24endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..867eaf1f8efb
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,352 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129		mode = inode->i_mode;
		if (acl) { /* a NULL acl just removes the xattr below */
130			ret = posix_acl_equiv_mode(acl, &mode);
131			if (ret < 0)
132				return ret;
133			ret = 0;
134			inode->i_mode = mode;
		}
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 if (value)
165 kfree(value);
166
167 if (!ret)
168 btrfs_update_cached_acl(inode, p_acl, acl);
169
170 return ret;
171}
172
173static int btrfs_xattr_set_acl(struct inode *inode, int type,
174 const void *value, size_t size)
175{
176 int ret = 0;
177 struct posix_acl *acl = NULL;
178
179 if (value) {
180 acl = posix_acl_from_xattr(value, size);
181 if (acl == NULL) {
182 value = NULL;
183 size = 0;
184 } else if (IS_ERR(acl)) {
185 return PTR_ERR(acl);
186 }
187 }
188
189 ret = btrfs_set_acl(inode, acl, type);
190
191 posix_acl_release(acl);
192
193 return ret;
194}
195
196
197static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
198 void *value, size_t size)
199{
200 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
201}
202
203static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
204 const void *value, size_t size, int flags)
205{
206 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
207}
208
209static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
210 void *value, size_t size)
211{
212 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
213}
214
215static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
216 const void *value, size_t size, int flags)
217{
218 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
219}
220
221int btrfs_check_acl(struct inode *inode, int mask)
222{
223 struct posix_acl *acl;
224 int error = -EAGAIN;
225
226 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
227
228 if (IS_ERR(acl))
229 return PTR_ERR(acl);
230 if (acl) {
231 error = posix_acl_permission(inode, acl, mask);
232 posix_acl_release(acl);
233 }
234
235 return error;
236}
237
238/*
239 * btrfs_init_acl is already generally called under fs_mutex, so the locking
240 * here has been written to work with that. If the locking changes, the
241 * acl locking needs to be re-evaluated.
242 */
243int btrfs_init_acl(struct inode *inode, struct inode *dir)
244{
245 struct posix_acl *acl = NULL;
246 int ret = 0;
247
248 /* this happens with subvols */
249 if (!dir)
250 return 0;
251
252 if (!S_ISLNK(inode->i_mode)) {
253 if (IS_POSIXACL(dir)) {
254 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
255 if (IS_ERR(acl))
256 return PTR_ERR(acl);
257 }
258
259 if (!acl)
260 inode->i_mode &= ~current->fs->umask;
261 }
262
263 if (IS_POSIXACL(dir) && acl) {
264 struct posix_acl *clone;
265 mode_t mode;
266
267 if (S_ISDIR(inode->i_mode)) {
268 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
269 if (ret)
270 goto failed;
271 }
272 clone = posix_acl_clone(acl, GFP_NOFS);
273 ret = -ENOMEM;
274 if (!clone)
275 goto failed;
276
277 mode = inode->i_mode;
278 ret = posix_acl_create_masq(clone, &mode);
279 if (ret >= 0) {
280 inode->i_mode = mode;
281 if (ret > 0) {
282 /* we need an acl */
283 ret = btrfs_set_acl(inode, clone,
284 ACL_TYPE_ACCESS);
285 }
286 }
287 }
288failed:
289 posix_acl_release(acl);
290
291 return ret;
292}
293
294int btrfs_acl_chmod(struct inode *inode)
295{
296 struct posix_acl *acl, *clone;
297 int ret = 0;
298
299 if (S_ISLNK(inode->i_mode))
300 return -EOPNOTSUPP;
301
302 if (!IS_POSIXACL(inode))
303 return 0;
304
305 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
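	/* note: PTR_ERR(NULL) == 0, so an inode with no acl correctly returns 0 here */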
306 if (IS_ERR(acl) || !acl)
307 return PTR_ERR(acl);
308
309 clone = posix_acl_clone(acl, GFP_KERNEL);
310 posix_acl_release(acl);
311 if (!clone)
312 return -ENOMEM;
313
314 ret = posix_acl_chmod_masq(clone, inode->i_mode);
315 if (!ret)
316 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
317
318 posix_acl_release(clone);
319
320 return ret;
321}
322
323struct xattr_handler btrfs_xattr_acl_default_handler = {
324 .prefix = POSIX_ACL_XATTR_DEFAULT,
325 .get = btrfs_xattr_acl_default_get,
326 .set = btrfs_xattr_acl_default_set,
327};
328
329struct xattr_handler btrfs_xattr_acl_access_handler = {
330 .prefix = POSIX_ACL_XATTR_ACCESS,
331 .get = btrfs_xattr_acl_access_get,
332 .set = btrfs_xattr_acl_access_set,
333};
334
335#else /* CONFIG_FS_POSIX_ACL */
336
337int btrfs_acl_chmod(struct inode *inode)
338{
339 return 0;
340}
341
342int btrfs_init_acl(struct inode *inode, struct inode *dir)
343{
344 return 0;
345}
346
347int btrfs_check_acl(struct inode *inode, int mask)
348{
349 return 0;
350}
351
352#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..d82efd722a48
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,357 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/freezer.h>
24#include "async-thread.h"
25
26/*
27 * container for the kthread task pointer and the list of pending work.
28 * One of these is allocated per thread.
29 */
30struct btrfs_worker_thread {
31 /* pool we belong to */
32 struct btrfs_workers *workers;
33
34 /* list of struct btrfs_work that are waiting for service */
35 struct list_head pending;
36
37 /* list of worker threads from struct btrfs_workers */
38 struct list_head worker_list;
39
40 /* kthread */
41 struct task_struct *task;
42
43 /* number of things on the pending list */
44 atomic_t num_pending;
45
46 unsigned long sequence;
47
48 /* protects the pending list. */
49 spinlock_t lock;
50
51 /* set to non-zero when this thread is already awake and kicking */
52 int working;
53
54 /* are we currently idle */
55 int idle;
56};
57
58/*
59 * helper function to move a thread onto the idle list after it
60 * has finished some requests.
61 */
62static void check_idle_worker(struct btrfs_worker_thread *worker)
63{
64 if (!worker->idle && atomic_read(&worker->num_pending) <
65 worker->workers->idle_thresh / 2) {
66 unsigned long flags;
67 spin_lock_irqsave(&worker->workers->lock, flags);
68 worker->idle = 1;
69 list_move(&worker->worker_list, &worker->workers->idle_list);
70 spin_unlock_irqrestore(&worker->workers->lock, flags);
71 }
72}
73
74/*
75 * helper function to move a thread off the idle list after new
76 * pending work is added.
77 */
78static void check_busy_worker(struct btrfs_worker_thread *worker)
79{
80 if (worker->idle && atomic_read(&worker->num_pending) >=
81 worker->workers->idle_thresh) {
82 unsigned long flags;
83 spin_lock_irqsave(&worker->workers->lock, flags);
84 worker->idle = 0;
85 list_move_tail(&worker->worker_list,
86 &worker->workers->worker_list);
87 spin_unlock_irqrestore(&worker->workers->lock, flags);
88 }
89}
90
91/*
92 * main loop for servicing work items
93 */
94static int worker_loop(void *arg)
95{
96 struct btrfs_worker_thread *worker = arg;
97 struct list_head *cur;
98 struct btrfs_work *work;
99 do {
100 spin_lock_irq(&worker->lock);
101 while(!list_empty(&worker->pending)) {
102 cur = worker->pending.next;
103 work = list_entry(cur, struct btrfs_work, list);
104 list_del(&work->list);
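			/* bit 0 marks the work as queued; clear it so it can be requeued */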
105 clear_bit(0, &work->flags);
106
107 work->worker = worker;
108 spin_unlock_irq(&worker->lock);
109
110 work->func(work);
111
112 atomic_dec(&worker->num_pending);
113 spin_lock_irq(&worker->lock);
114 check_idle_worker(worker);
115 }
116 worker->working = 0;
117 if (freezing(current)) {
118 refrigerator();
119 } else {
120 set_current_state(TASK_INTERRUPTIBLE);
121 spin_unlock_irq(&worker->lock);
122 schedule();
123 __set_current_state(TASK_RUNNING);
124 }
125 } while (!kthread_should_stop());
126 return 0;
127}
128
129/*
130 * this will wait for all the worker threads to shutdown
131 */
132int btrfs_stop_workers(struct btrfs_workers *workers)
133{
134 struct list_head *cur;
135 struct btrfs_worker_thread *worker;
136
137 list_splice_init(&workers->idle_list, &workers->worker_list);
138 while(!list_empty(&workers->worker_list)) {
139 cur = workers->worker_list.next;
140 worker = list_entry(cur, struct btrfs_worker_thread,
141 worker_list);
142 kthread_stop(worker->task);
143 list_del(&worker->worker_list);
144 kfree(worker);
145 }
146 return 0;
147}
148
149/*
150 * simple init on struct btrfs_workers
151 */
152void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
153{
154 workers->num_workers = 0;
155 INIT_LIST_HEAD(&workers->worker_list);
156 INIT_LIST_HEAD(&workers->idle_list);
157 spin_lock_init(&workers->lock);
158 workers->max_workers = max;
159 workers->idle_thresh = 32;
160 workers->name = name;
161}
162
163/*
164 * starts new worker threads. This does not enforce the max worker
165 * count in case you need to temporarily go past it.
166 */
167int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
168{
169 struct btrfs_worker_thread *worker;
170 int ret = 0;
171 int i;
172
173 for (i = 0; i < num_workers; i++) {
174 worker = kzalloc(sizeof(*worker), GFP_NOFS);
175 if (!worker) {
176 ret = -ENOMEM;
177 goto fail;
178 }
179
180 INIT_LIST_HEAD(&worker->pending);
181 INIT_LIST_HEAD(&worker->worker_list);
182 spin_lock_init(&worker->lock);
183 atomic_set(&worker->num_pending, 0);
184 worker->task = kthread_run(worker_loop, worker,
185 "btrfs-%s-%d", workers->name,
186 workers->num_workers + i);
187 worker->workers = workers;
188 if (IS_ERR(worker->task)) {
189 kfree(worker);
190 ret = PTR_ERR(worker->task);
191 goto fail;
192 }
193
194 spin_lock_irq(&workers->lock);
195 list_add_tail(&worker->worker_list, &workers->idle_list);
196 worker->idle = 1;
197 workers->num_workers++;
198 spin_unlock_irq(&workers->lock);
199 }
200 return 0;
201fail:
202 btrfs_stop_workers(workers);
203 return ret;
204}
205
206/*
207 * run through the list and find a worker thread that doesn't have a lot
208 * to do right now. This can return null if we aren't yet at the thread
209 * count limit and all of the threads are busy.
210 */
211static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
212{
213 struct btrfs_worker_thread *worker;
214 struct list_head *next;
215 int enforce_min = workers->num_workers < workers->max_workers;
216
217 /*
218 * if we find an idle thread, don't move it to the end of the
219 * idle list. This improves the chance that the next submission
220 * will reuse the same thread, and maybe catch it while it is still
221 * working
222 */
223 if (!list_empty(&workers->idle_list)) {
224 next = workers->idle_list.next;
225 worker = list_entry(next, struct btrfs_worker_thread,
226 worker_list);
227 return worker;
228 }
229 if (enforce_min || list_empty(&workers->worker_list))
230 return NULL;
231
232 /*
233 * if we pick a busy task, move the task to the end of the list.
234 * hopefully this will keep things somewhat evenly balanced.
235 * Do the move in batches based on the sequence number. This groups
236 * requests submitted at roughly the same time onto the same worker.
237 */
238 next = workers->worker_list.next;
239 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
240 atomic_inc(&worker->num_pending);
241 worker->sequence++;
242
243 if (worker->sequence % workers->idle_thresh == 0)
244 list_move_tail(next, &workers->worker_list);
245 return worker;
246}
247
248/*
249 * selects a worker thread to take the next job. This will either find
250 * an idle worker, start a new worker up to the max count, or just return
251 * one of the existing busy workers.
252 */
253static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
254{
255 struct btrfs_worker_thread *worker;
256 unsigned long flags;
257
258again:
259 spin_lock_irqsave(&workers->lock, flags);
260 worker = next_worker(workers);
261 spin_unlock_irqrestore(&workers->lock, flags);
262
263 if (!worker) {
264 spin_lock_irqsave(&workers->lock, flags);
265 if (workers->num_workers >= workers->max_workers) {
266 struct list_head *fallback = NULL;
267 /*
268 * we have failed to find any workers, just
269			 * fall back to the first one we can find
270 */
271 if (!list_empty(&workers->worker_list))
272 fallback = workers->worker_list.next;
273 if (!list_empty(&workers->idle_list))
274 fallback = workers->idle_list.next;
275 BUG_ON(!fallback);
276 worker = list_entry(fallback,
277 struct btrfs_worker_thread, worker_list);
278 spin_unlock_irqrestore(&workers->lock, flags);
279 } else {
280 spin_unlock_irqrestore(&workers->lock, flags);
281 /* we're below the limit, start another worker */
282 btrfs_start_workers(workers, 1);
283 goto again;
284 }
285 }
286 return worker;
287}
288
289/*
290 * btrfs_requeue_work just puts the work item back on the tail of the list
291 * it was taken from. It is intended for use with long running work functions
292 * that make some progress and want to give the cpu up for others.
293 */
294int btrfs_requeue_work(struct btrfs_work *work)
295{
296 struct btrfs_worker_thread *worker = work->worker;
297 unsigned long flags;
298
299 if (test_and_set_bit(0, &work->flags))
300 goto out;
301
302 spin_lock_irqsave(&worker->lock, flags);
303 atomic_inc(&worker->num_pending);
304 list_add_tail(&work->list, &worker->pending);
305
306 /* by definition we're busy, take ourselves off the idle
307 * list
308 */
309 if (worker->idle) {
310		spin_lock(&worker->workers->lock); /* irqs already off; reusing flags would clobber the outer save */
311 worker->idle = 0;
312 list_move_tail(&worker->worker_list,
313 &worker->workers->worker_list);
314		spin_unlock(&worker->workers->lock);
315 }
316
317 spin_unlock_irqrestore(&worker->lock, flags);
318
319out:
320 return 0;
321}
322
323/*
324 * places a struct btrfs_work into the pending queue of one of the kthreads
325 */
326int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
327{
328 struct btrfs_worker_thread *worker;
329 unsigned long flags;
330 int wake = 0;
331
332 /* don't requeue something already on a list */
333 if (test_and_set_bit(0, &work->flags))
334 goto out;
335
336 worker = find_worker(workers);
337
338 spin_lock_irqsave(&worker->lock, flags);
339 atomic_inc(&worker->num_pending);
340 check_busy_worker(worker);
341 list_add_tail(&work->list, &worker->pending);
342
343 /*
344 * avoid calling into wake_up_process if this thread has already
345 * been kicked
346 */
347 if (!worker->working)
348 wake = 1;
349 worker->working = 1;
350
351 spin_unlock_irqrestore(&worker->lock, flags);
352
353 if (wake)
354 wake_up_process(worker->task);
355out:
356 return 0;
357}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..4ec9a2ee0f9d
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,85 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * only func should be set to the function you want called
41 * your work struct is passed as the only arg
42 */
43 void (*func)(struct btrfs_work *work);
44
45 /*
46 * flags should be set to zero. It is used to make sure the
47 * struct is only inserted once into the list.
48 */
49 unsigned long flags;
50
51 /* don't touch these */
52 struct btrfs_worker_thread *worker;
53 struct list_head list;
54};
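/*
 * Illustrative sketch (assumed names, not part of this commit): embed the
 * btrfs_work in a private struct and use container_of() in the work
 * function to get back to it, as the comment above describes:
 *
 *	struct my_async_op {
 *		u64 bytenr;
 *		struct btrfs_work work;
 *	};
 *
 *	static void my_async_op_fn(struct btrfs_work *work)
 *	{
 *		struct my_async_op *op;
 *		op = container_of(work, struct my_async_op, work);
 *		... process op->bytenr here ...
 *	}
 *
 * Queueing then sets op->work.func = my_async_op_fn and op->work.flags = 0,
 * and hands &op->work to btrfs_queue_worker() on the chosen pool.
 */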
55
56struct btrfs_workers {
57 /* current number of running workers */
58 int num_workers;
59
60	/* max number of workers allowed. set by btrfs_init_workers */
61 int max_workers;
62
63 /* once a worker has this many requests or fewer, it is idle */
64 int idle_thresh;
65
66	/* list with all the worker threads. The workers on the idle list
67 * may be actively servicing jobs, but they haven't yet hit the
68 * idle thresh limit above.
69 */
70 struct list_head worker_list;
71 struct list_head idle_list;
72
73 /* lock for finding the next worker thread to queue on */
74 spinlock_t lock;
75
76	/* extra name for this worker, used to build the kthread's name */
77 char *name;
78};
79
80int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
81int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
82int btrfs_stop_workers(struct btrfs_workers *workers);
83void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
84int btrfs_requeue_work(struct btrfs_work *work);
85#endif
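
A hedged lifecycle sketch for the pool API above (the pool name and counts are assumptions, not from this commit):

	struct btrfs_workers pool;

	btrfs_init_workers(&pool, "demo", 4);	/* "demo" shows up in the kthread names */
	btrfs_start_workers(&pool, 1);		/* find_worker() starts more on demand */
	/* ... submit btrfs_work items with btrfs_queue_worker(&pool, &work) ... */
	btrfs_stop_workers(&pool);		/* stops and frees every worker thread */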
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..0b2e623cf421
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,133 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* the block group preferred for allocations. This pointer is buggy
32 * and needs to be replaced with a bytenr instead
33 */
34 struct btrfs_block_group_cache *block_group;
35
36 /* key used to find this inode on disk. This is used by the code
37 * to read in roots of subvolumes
38 */
39 struct btrfs_key location;
40
41 /* the extent_tree has caches of all the extent mappings to disk */
42 struct extent_map_tree extent_tree;
43
44 /* the io_tree does range state (DIRTY, LOCKED etc) */
45 struct extent_io_tree io_tree;
46
47 /* special utility tree used to record which mirrors have already been
48 * tried when checksums fail for a given block
49 */
50 struct extent_io_tree io_failure_tree;
51
52 /* held while inserting checksums to avoid races */
53 struct mutex csum_mutex;
54
55	/* held while inserting or deleting extents from files */
56 struct mutex extent_mutex;
57
58 /* held while logging the inode in tree-log.c */
59 struct mutex log_mutex;
60
61 /* used to order data wrt metadata */
62 struct btrfs_ordered_inode_tree ordered_tree;
63
64 /* standard acl pointers */
65 struct posix_acl *i_acl;
66 struct posix_acl *i_default_acl;
67
68 /* for keeping track of orphaned inodes */
69 struct list_head i_orphan;
70
71 /* list of all the delalloc inodes in the FS. There are times we need
72 * to write all the delalloc pages to disk, and this list is used
73 * to walk them all.
74 */
75 struct list_head delalloc_inodes;
76
77	/* full 64 bit generation number; the VFS struct inode doesn't have a
78	 * big enough field for this.
79 */
80 u64 generation;
81
82 /*
83 * transid of the trans_handle that last modified this inode
84 */
85 u64 last_trans;
86 /*
87 * transid that last logged this inode
88 */
89 u64 logged_trans;
90
91 /*
92 * trans that last made a change that should be fully fsync'd. This
93 * gets reset to zero each time the inode is logged
94 */
95 u64 log_dirty_trans;
96
97 /* total number of bytes pending delalloc, used by stat to calc the
98 * real block usage of the file
99 */
100 u64 delalloc_bytes;
101
102 /*
103 * the size of the file stored in the metadata on disk. data=ordered
104 * means the in-memory i_size might be larger than the size on disk
105 * because not all the blocks are written yet.
106 */
107 u64 disk_i_size;
108
109 /* flags field from the on disk inode */
110 u32 flags;
111
112 /*
113 * if this is a directory then index_cnt is the counter for the index
114 * number for new files that are created
115 */
116 u64 index_cnt;
117
118 struct inode vfs_inode;
119};
120
121static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
122{
123 return container_of(inode, struct btrfs_inode, vfs_inode);
124}
125
126static inline void btrfs_i_size_write(struct inode *inode, u64 size)
127{
128 inode->i_size = size;
129 BTRFS_I(inode)->disk_i_size = size;
130}
131
132
133#endif
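
BTRFS_I() above is the usual container_of() accessor; a hedged sketch of a reader (variable names assumed):

	struct btrfs_inode *bi = BTRFS_I(inode);	/* from any VFS inode on btrfs */
	u64 on_disk = bi->disk_i_size;	/* may trail i_size while ordered data is in flight */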
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..cd6598b169df
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,24 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
8static inline struct dentry *d_obtain_alias(struct inode *inode)
9{
10 struct dentry *d;
11
12 if (!inode)
13 return NULL;
14 if (IS_ERR(inode))
15 return ERR_CAST(inode);
16
17 d = d_alloc_anon(inode);
18 if (!d)
19 iput(inode);
20 return d;
21}
22#endif
23
24#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..1eaf11d334fd
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,120 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <asm/byteorder.h>
22#include <linux/crc32c.h>
23#include <linux/version.h>
24
25/* #define CONFIG_BTRFS_HW_SUM 1 */
26
27#ifdef CONFIG_BTRFS_HW_SUM
28#ifdef CONFIG_X86
29/*
30 * Using the hardware-provided CRC32 instruction to accelerate CRC32C computation.
31 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
32 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
33 * http://www.intel.com/products/processor/manuals/
34 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
35 * Volume 2A: Instruction Set Reference, A-M
36 */
37
38#include <asm/cpufeature.h>
39#include <asm/processor.h>
40
41#define X86_FEATURE_XMM4_2 (4*32+20) /* Streaming SIMD Extensions-4.2 */
42#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
43
44#ifdef CONFIG_X86_64
45#define REX_PRE "0x48, "
46#define SCALE_F 8
47#else
48#define REX_PRE
49#define SCALE_F 4
50#endif
51
52static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
53 size_t length)
54{
55 while (length--) {
56 __asm__ __volatile__(
57 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
58 :"=S"(crc)
59 :"0"(crc), "c"(*data)
60 );
61 data++;
62 }
63
64 return crc;
65}
66
67static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
68 size_t len)
69{
70 unsigned int iquotient = len / SCALE_F;
71 unsigned int iremainder = len % SCALE_F;
72#ifdef CONFIG_X86_64
73 u64 *ptmp = (u64 *)p;
74#else
75 u32 *ptmp = (u32 *)p;
76#endif
77
78 while (iquotient--) {
79 __asm__ __volatile__(
80 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
81 :"=S"(crc)
82 :"0"(crc), "c"(*ptmp)
83 );
84 ptmp++;
85 }
86
87 if (iremainder)
88 crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
89 iremainder);
90
91 return crc;
92}
93#endif /* CONFIG_X86 */
94
95static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
96 size_t len)
97{
98#ifdef CONFIG_BTRFS_HW_SUM
99 if (cpu_has_xmm4_2)
100 return btrfs_crc32c_le_hw(crc, address, len);
101#endif
102 return crc32c_le(crc, address, len);
103}
104
105#else
106
107#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
108
109#endif /* CONFIG_BTRFS_HW_SUM */
110
111/*
112 * The implementation of crc32c_le() changed in linux-2.6.23;
113 * as of v0.13, btrfs-progs is using the latest version.
114 * We must work around older implementations of crc32c_le()
115 * found on older kernel versions.
116 */
117#define btrfs_crc32c(seed, data, length) \
118 __btrfs_crc32c(seed, (unsigned char const *)data, length)
119#endif
120
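A hedged usage sketch of the wrapper above (the ~0 starting seed follows the kernel's usual crc32c convention; it is not taken from this commit):

	u32 crc = btrfs_crc32c((u32)~0, buf, len);	/* checksum len bytes at buf */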
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9caeb377de63
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3716 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that have no locks or extent buffers held.
69 */
70void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
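
The lock-then-revalidate loop above is a general pattern: pin the current root with a reference, take its lock (which may block), and re-check that it is still the root, retrying if a writer swapped it out in the meantime. A minimal userspace sketch of the same idea, with a hypothetical refcounted node type and pthread locks standing in for the btrfs primitives:

#include <pthread.h>

/* hypothetical node: one mutex and a reference count per node */
struct node {
	pthread_mutex_t lock;
	int refs;
};

struct tree {
	pthread_mutex_t root_lock;	/* plays the role of root->node_lock */
	struct node *root;
};

static struct node *lock_root(struct tree *t)
{
	struct node *n;

	for (;;) {
		/* take a reference so the node can't vanish... */
		pthread_mutex_lock(&t->root_lock);
		n = t->root;
		n->refs++;
		pthread_mutex_unlock(&t->root_lock);

		/* ...then block on its lock; the root may change meanwhile */
		pthread_mutex_lock(&n->lock);

		pthread_mutex_lock(&t->root_lock);
		if (n == t->root) {		/* still the root: done */
			pthread_mutex_unlock(&t->root_lock);
			return n;		/* locked, with a ref held */
		}
		n->refs--;			/* lost the race: retry */
		pthread_mutex_unlock(&t->root_lock);
		pthread_mutex_unlock(&n->lock);
	}
}
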
131
132/* cowonly roots (everything not a reference counted cow subvolume) just get
133 * put onto a simple dirty list. transaction.c walks this list to make sure
134 * they get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
148 * cow_ret, and this function returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 WARN_ON(btrfs_header_generation(buf) > trans->transid);
189 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
190 kfree(new_root);
191
192 if (ret)
193 return ret;
194
195 btrfs_mark_buffer_dirty(cow);
196 *cow_ret = cow;
197 return 0;
198}
199
200/*
201 * does the dirty work in cow of a single block. The parent block
202 * (if supplied) is updated to point to the new cow copy. The new
203 * buffer is marked dirty and returned locked. If you modify the block
204 * it needs to be marked dirty again.
205 *
206 * search_start -- an allocation hint for the new block
207 *
208 * empty_size -- a hint that you plan on doing more cow. This is the size in bytes
209 * the allocator should try to find free next to the block it returns. This is
210 * just a hint and may be ignored by the allocator.
211 *
212 * prealloc_dest -- if you have already reserved a destination for the cow,
213 * this uses that block instead of allocating a new one. btrfs_alloc_reserved_extent
214 * is used to finish the allocation.
215 */
216int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
217 struct btrfs_root *root,
218 struct extent_buffer *buf,
219 struct extent_buffer *parent, int parent_slot,
220 struct extent_buffer **cow_ret,
221 u64 search_start, u64 empty_size,
222 u64 prealloc_dest)
223{
224 u64 parent_start;
225 struct extent_buffer *cow;
226 u32 nritems;
227 int ret = 0;
228 int level;
229 int unlock_orig = 0;
230
231 if (*cow_ret == buf)
232 unlock_orig = 1;
233
234 WARN_ON(!btrfs_tree_locked(buf));
235
236 if (parent)
237 parent_start = parent->start;
238 else
239 parent_start = 0;
240
241 WARN_ON(root->ref_cows && trans->transid !=
242 root->fs_info->running_transaction->transid);
243 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
244
245 level = btrfs_header_level(buf);
246 nritems = btrfs_header_nritems(buf);
247
248 if (prealloc_dest) {
249 struct btrfs_key ins;
250
251 ins.objectid = prealloc_dest;
252 ins.offset = buf->len;
253 ins.type = BTRFS_EXTENT_ITEM_KEY;
254
255 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
256 root->root_key.objectid,
257 trans->transid, level, &ins);
258 BUG_ON(ret);
259 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
260 buf->len);
261 } else {
262 cow = btrfs_alloc_free_block(trans, root, buf->len,
263 parent_start,
264 root->root_key.objectid,
265 trans->transid, level,
266 search_start, empty_size);
267 }
268 if (IS_ERR(cow))
269 return PTR_ERR(cow);
270
271 copy_extent_buffer(cow, buf, 0, 0, cow->len);
272 btrfs_set_header_bytenr(cow, cow->start);
273 btrfs_set_header_generation(cow, trans->transid);
274 btrfs_set_header_owner(cow, root->root_key.objectid);
275 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
276
277 WARN_ON(btrfs_header_generation(buf) > trans->transid);
278 if (btrfs_header_generation(buf) != trans->transid) {
279 u32 nr_extents;
280 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
281 if (ret)
282 return ret;
283
284 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
285 WARN_ON(ret);
286 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
287 /*
288 * There are only two places that can drop reference to
289 * tree blocks owned by living reloc trees, one is here,
290 * the other place is btrfs_merge_path. In both places,
291 * we check reference count while tree block is locked.
292 * Furthermore, if reference count is one, it won't get
293 * increased by someone else.
294 */
295 u32 refs;
296 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
297 buf->len, &refs);
298 BUG_ON(ret);
299 if (refs == 1) {
300 ret = btrfs_update_ref(trans, root, buf, cow,
301 0, nritems);
302 clean_tree_block(trans, root, buf);
303 } else {
304 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
305 }
306 BUG_ON(ret);
307 } else {
308 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
309 if (ret)
310 return ret;
311 clean_tree_block(trans, root, buf);
312 }
313
314 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
315 ret = btrfs_add_reloc_mapping(root, buf->start,
316 buf->len, cow->start);
317 BUG_ON(ret);
318 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
319 WARN_ON(ret);
320 }
321
322 if (buf == root->node) {
323 WARN_ON(parent && parent != buf);
324
325 spin_lock(&root->node_lock);
326 root->node = cow;
327 extent_buffer_get(cow);
328 spin_unlock(&root->node_lock);
329
330 if (buf != root->commit_root) {
331 btrfs_free_extent(trans, root, buf->start,
332 buf->len, buf->start,
333 root->root_key.objectid,
334 btrfs_header_generation(buf),
335 level, 1);
336 }
337 free_extent_buffer(buf);
338 add_root_to_dirty_list(root);
339 } else {
340 btrfs_set_node_blockptr(parent, parent_slot,
341 cow->start);
342 WARN_ON(trans->transid == 0);
343 btrfs_set_node_ptr_generation(parent, parent_slot,
344 trans->transid);
345 btrfs_mark_buffer_dirty(parent);
346 WARN_ON(btrfs_header_generation(parent) != trans->transid);
347 btrfs_free_extent(trans, root, buf->start, buf->len,
348 parent_start, btrfs_header_owner(parent),
349 btrfs_header_generation(parent), level, 1);
350 }
351 if (unlock_orig)
352 btrfs_tree_unlock(buf);
353 free_extent_buffer(buf);
354 btrfs_mark_buffer_dirty(cow);
355 *cow_ret = cow;
356 return 0;
357}
358
359/*
360 * cows a single block; see __btrfs_cow_block for the real work.
361 * This version of it has extra checks so that a block isn't cow'd more than
362 * once per transaction, as long as it hasn't been written yet
363 */
364int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
365 struct btrfs_root *root, struct extent_buffer *buf,
366 struct extent_buffer *parent, int parent_slot,
367 struct extent_buffer **cow_ret, u64 prealloc_dest)
368{
369 u64 search_start;
370 int ret;
371
372 if (trans->transaction != root->fs_info->running_transaction) {
373 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
374 root->fs_info->running_transaction->transid);
375 WARN_ON(1);
376 }
377 if (trans->transid != root->fs_info->generation) {
378 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
379 root->fs_info->generation);
380 WARN_ON(1);
381 }
382
383 spin_lock(&root->fs_info->hash_lock);
384 if (btrfs_header_generation(buf) == trans->transid &&
385 btrfs_header_owner(buf) == root->root_key.objectid &&
386 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
387 *cow_ret = buf;
388 spin_unlock(&root->fs_info->hash_lock);
389 WARN_ON(prealloc_dest);
390 return 0;
391 }
392 spin_unlock(&root->fs_info->hash_lock);
393 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
394 ret = __btrfs_cow_block(trans, root, buf, parent,
395 parent_slot, cow_ret, search_start, 0,
396 prealloc_dest);
397 return ret;
398}
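
The fast path above skips the copy entirely when the block was created in the current transaction, is owned by this root, and has not yet been written out. The test in isolation, with hypothetical field names standing in for the header accessors:

/* hypothetical stand-ins for the extent buffer header fields */
struct blk {
	unsigned long long generation;	/* transid that created this block */
	unsigned long long owner;	/* objectid of the owning root */
	int written;			/* already sent to disk this transaction */
};

static int cow_needed(const struct blk *b, unsigned long long transid,
		      unsigned long long root_objectid)
{
	/* a block COWed earlier in this transaction may be modified in place */
	if (b->generation == transid && b->owner == root_objectid &&
	    !b->written)
		return 0;
	return 1;
}
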
399
400/*
401 * helper function for defrag to decide if two blocks pointed to by a
402 * node are actually close by
403 */
404static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
405{
406 if (blocknr < other && other - (blocknr + blocksize) < 32768)
407 return 1;
408 if (blocknr > other && blocknr - (other + blocksize) < 32768)
409 return 1;
410 return 0;
411}
412
413/*
414 * compare two keys in a memcmp fashion
415 */
416static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
417{
418 struct btrfs_key k1;
419
420 btrfs_disk_key_to_cpu(&k1, disk);
421
422 if (k1.objectid > k2->objectid)
423 return 1;
424 if (k1.objectid < k2->objectid)
425 return -1;
426 if (k1.type > k2->type)
427 return 1;
428 if (k1.type < k2->type)
429 return -1;
430 if (k1.offset > k2->offset)
431 return 1;
432 if (k1.offset < k2->offset)
433 return -1;
434 return 0;
435}
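
This (objectid, type, offset) ordering is what keeps every btrfs tree sorted. For illustration, the same lexicographic compare written as a qsort() comparator over an assumed CPU-order key struct (not the on-disk layout):

#include <stdint.h>
#include <stdlib.h>

struct cpu_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

static int cmp_cpu_keys(const void *a, const void *b)
{
	const struct cpu_key *k1 = a, *k2 = b;

	/* compare field by field, most significant first */
	if (k1->objectid != k2->objectid)
		return k1->objectid < k2->objectid ? -1 : 1;
	if (k1->type != k2->type)
		return k1->type < k2->type ? -1 : 1;
	if (k1->offset != k2->offset)
		return k1->offset < k2->offset ? -1 : 1;
	return 0;
}

/* usage: qsort(keys, n, sizeof(keys[0]), cmp_cpu_keys); */
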
436
437
438/*
439 * this is used by the defrag code to go through all the
440 * leaves pointed to by a node and reallocate them so that
441 * disk order is close to key order
442 */
443int btrfs_realloc_node(struct btrfs_trans_handle *trans,
444 struct btrfs_root *root, struct extent_buffer *parent,
445 int start_slot, int cache_only, u64 *last_ret,
446 struct btrfs_key *progress)
447{
448 struct extent_buffer *cur;
449 u64 blocknr;
450 u64 gen;
451 u64 search_start = *last_ret;
452 u64 last_block = 0;
453 u64 other;
454 u32 parent_nritems;
455 int end_slot;
456 int i;
457 int err = 0;
458 int parent_level;
459 int uptodate;
460 u32 blocksize;
461 int progress_passed = 0;
462 struct btrfs_disk_key disk_key;
463
464 parent_level = btrfs_header_level(parent);
465 if (cache_only && parent_level != 1)
466 return 0;
467
468 if (trans->transaction != root->fs_info->running_transaction) {
469 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
470 root->fs_info->running_transaction->transid);
471 WARN_ON(1);
472 }
473 if (trans->transid != root->fs_info->generation) {
474 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
475 root->fs_info->generation);
476 WARN_ON(1);
477 }
478
479 parent_nritems = btrfs_header_nritems(parent);
480 blocksize = btrfs_level_size(root, parent_level - 1);
481 end_slot = parent_nritems;
482
483 if (parent_nritems == 1)
484 return 0;
485
486 for (i = start_slot; i < end_slot; i++) {
487 int close = 1;
488
489 if (!parent->map_token) {
490 map_extent_buffer(parent,
491 btrfs_node_key_ptr_offset(i),
492 sizeof(struct btrfs_key_ptr),
493 &parent->map_token, &parent->kaddr,
494 &parent->map_start, &parent->map_len,
495 KM_USER1);
496 }
497 btrfs_node_key(parent, &disk_key, i);
498 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
499 continue;
500
501 progress_passed = 1;
502 blocknr = btrfs_node_blockptr(parent, i);
503 gen = btrfs_node_ptr_generation(parent, i);
504 if (last_block == 0)
505 last_block = blocknr;
506
507 if (i > 0) {
508 other = btrfs_node_blockptr(parent, i - 1);
509 close = close_blocks(blocknr, other, blocksize);
510 }
511 if (!close && i < end_slot - 2) {
512 other = btrfs_node_blockptr(parent, i + 1);
513 close = close_blocks(blocknr, other, blocksize);
514 }
515 if (close) {
516 last_block = blocknr;
517 continue;
518 }
519 if (parent->map_token) {
520 unmap_extent_buffer(parent, parent->map_token,
521 KM_USER1);
522 parent->map_token = NULL;
523 }
524
525 cur = btrfs_find_tree_block(root, blocknr, blocksize);
526 if (cur)
527 uptodate = btrfs_buffer_uptodate(cur, gen);
528 else
529 uptodate = 0;
530 if (!cur || !uptodate) {
531 if (cache_only) {
532 free_extent_buffer(cur);
533 continue;
534 }
535 if (!cur) {
536 cur = read_tree_block(root, blocknr,
537 blocksize, gen);
538 } else if (!uptodate) {
539 btrfs_read_buffer(cur, gen);
540 }
541 }
542 if (search_start == 0)
543 search_start = last_block;
544
545 btrfs_tree_lock(cur);
546 err = __btrfs_cow_block(trans, root, cur, parent, i,
547 &cur, search_start,
548 min(16 * blocksize,
549 (end_slot - i) * blocksize), 0);
550 if (err) {
551 btrfs_tree_unlock(cur);
552 free_extent_buffer(cur);
553 break;
554 }
555 search_start = cur->start;
556 last_block = cur->start;
557 *last_ret = search_start;
558 btrfs_tree_unlock(cur);
559 free_extent_buffer(cur);
560 }
561 if (parent->map_token) {
562 unmap_extent_buffer(parent, parent->map_token,
563 KM_USER1);
564 parent->map_token = NULL;
565 }
566 return err;
567}
568
569/*
570 * The leaf data grows from end-to-front in the node.
571 * This returns the offset of the start of the last item,
572 * which is the end of the leaf data stack.
573 */
574static inline unsigned int leaf_data_end(struct btrfs_root *root,
575 struct extent_buffer *leaf)
576{
577 u32 nr = btrfs_header_nritems(leaf);
578 if (nr == 0)
579 return BTRFS_LEAF_DATA_SIZE(root);
580 return btrfs_item_offset_nr(leaf, nr - 1);
581}
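
Item headers grow forward from the front of the leaf while their data grows backward from the end, so the free space is the gap between the two. A sketch of that arithmetic with made-up sizes (the real ones come from BTRFS_LEAF_DATA_SIZE() and sizeof(struct btrfs_item)):

#include <stdint.h>

#define LEAF_DATA_SIZE	4096u	/* stand-in for BTRFS_LEAF_DATA_SIZE(root) */
#define ITEM_HDR_SIZE	25u	/* stand-in for sizeof(struct btrfs_item) */

/* offsets are relative to the start of the leaf's data area */
static uint32_t leaf_free_space_sketch(uint32_t nritems, uint32_t last_off)
{
	/* last_off is what leaf_data_end() returns: LEAF_DATA_SIZE for an
	 * empty leaf, otherwise the offset of the last item's data */
	uint32_t headers_end = nritems * ITEM_HDR_SIZE;

	return last_off - headers_end;	/* the gap in the middle */
}
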
582
583/*
584 * extra debugging checks to make sure all the items in a node are
585 * well formed and in the proper order
586 */
587static int check_node(struct btrfs_root *root, struct btrfs_path *path,
588 int level)
589{
590 struct extent_buffer *parent = NULL;
591 struct extent_buffer *node = path->nodes[level];
592 struct btrfs_disk_key parent_key;
593 struct btrfs_disk_key node_key;
594 int parent_slot;
595 int slot;
596 struct btrfs_key cpukey;
597 u32 nritems = btrfs_header_nritems(node);
598
599 if (path->nodes[level + 1])
600 parent = path->nodes[level + 1];
601
602 slot = path->slots[level];
603 BUG_ON(nritems == 0);
604 if (parent) {
605 parent_slot = path->slots[level + 1];
606 btrfs_node_key(parent, &parent_key, parent_slot);
607 btrfs_node_key(node, &node_key, 0);
608 BUG_ON(memcmp(&parent_key, &node_key,
609 sizeof(struct btrfs_disk_key)));
610 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
611 btrfs_header_bytenr(node));
612 }
613 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
614 if (slot != 0) {
615 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
616 btrfs_node_key(node, &node_key, slot);
617 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
618 }
619 if (slot < nritems - 1) {
620 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
621 btrfs_node_key(node, &node_key, slot);
622 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
623 }
624 return 0;
625}
626
627/*
628 * extra checking to make sure all the items in a leaf are
629 * well formed and in the proper order
630 */
631static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
632 int level)
633{
634 struct extent_buffer *leaf = path->nodes[level];
635 struct extent_buffer *parent = NULL;
636 int parent_slot;
637 struct btrfs_key cpukey;
638 struct btrfs_disk_key parent_key;
639 struct btrfs_disk_key leaf_key;
640 int slot = path->slots[0];
641
642 u32 nritems = btrfs_header_nritems(leaf);
643
644 if (path->nodes[level + 1])
645 parent = path->nodes[level + 1];
646
647 if (nritems == 0)
648 return 0;
649
650 if (parent) {
651 parent_slot = path->slots[level + 1];
652 btrfs_node_key(parent, &parent_key, parent_slot);
653 btrfs_item_key(leaf, &leaf_key, 0);
654
655 BUG_ON(memcmp(&parent_key, &leaf_key,
656 sizeof(struct btrfs_disk_key)));
657 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
658 btrfs_header_bytenr(leaf));
659 }
660#if 0
661 for (i = 0; nritems > 1 && i < nritems - 2; i++) {
662 btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
663 btrfs_item_key(leaf, &leaf_key, i);
664 if (comp_keys(&leaf_key, &cpukey) >= 0) {
665 btrfs_print_leaf(root, leaf);
666 printk("slot %d offset bad key\n", i);
667 BUG_ON(1);
668 }
669 if (btrfs_item_offset_nr(leaf, i) !=
670 btrfs_item_end_nr(leaf, i + 1)) {
671 btrfs_print_leaf(root, leaf);
672 printk("slot %d offset bad\n", i);
673 BUG_ON(1);
674 }
675 if (i == 0) {
676 if (btrfs_item_offset_nr(leaf, i) +
677 btrfs_item_size_nr(leaf, i) !=
678 BTRFS_LEAF_DATA_SIZE(root)) {
679 btrfs_print_leaf(root, leaf);
680 printk("slot %d first offset bad\n", i);
681 BUG_ON(1);
682 }
683 }
684 }
685 if (nritems > 0) {
686 if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
687 btrfs_print_leaf(root, leaf);
688 printk("slot %d bad size \n", nritems - 1);
689 BUG_ON(1);
690 }
691 }
692#endif
693 if (slot != 0 && slot < nritems - 1) {
694 btrfs_item_key(leaf, &leaf_key, slot);
695 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
696 if (comp_keys(&leaf_key, &cpukey) <= 0) {
697 btrfs_print_leaf(root, leaf);
698 printk("slot %d offset bad key\n", slot);
699 BUG_ON(1);
700 }
701 if (btrfs_item_offset_nr(leaf, slot - 1) !=
702 btrfs_item_end_nr(leaf, slot)) {
703 btrfs_print_leaf(root, leaf);
704 printk("slot %d offset bad\n", slot);
705 BUG_ON(1);
706 }
707 }
708 if (slot < nritems - 1) {
709 btrfs_item_key(leaf, &leaf_key, slot);
710 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
711 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
712 if (btrfs_item_offset_nr(leaf, slot) !=
713 btrfs_item_end_nr(leaf, slot + 1)) {
714 btrfs_print_leaf(root, leaf);
715 printk("slot %d offset bad\n", slot);
716 BUG_ON(1);
717 }
718 }
719 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
720 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
721 return 0;
722}
723
724static int noinline check_block(struct btrfs_root *root,
725 struct btrfs_path *path, int level)
726{
727 u64 found_start;
728 return 0; /* the debugging checks below are currently disabled */
729 if (btrfs_header_level(path->nodes[level]) != level)
730 printk("warning: bad level %Lu wanted %d found %d\n",
731 path->nodes[level]->start, level,
732 btrfs_header_level(path->nodes[level]));
733 found_start = btrfs_header_bytenr(path->nodes[level]);
734 if (found_start != path->nodes[level]->start) {
735 printk("warning: bad bytenr %Lu found %Lu\n",
736 path->nodes[level]->start, found_start);
737 }
738#if 0
739 struct extent_buffer *buf = path->nodes[level];
740
741 if (memcmp_extent_buffer(buf, root->fs_info->fsid,
742 (unsigned long)btrfs_header_fsid(buf),
743 BTRFS_FSID_SIZE)) {
744 printk("warning bad block %Lu\n", buf->start);
745 return 1;
746 }
747#endif
748 if (level == 0)
749 return check_leaf(root, path, level);
750 return check_node(root, path, level);
751}
752
753/*
754 * search for key in the extent_buffer. The items start at offset p,
755 * and they are item_size apart. There are 'max' items in p.
756 *
757 * the slot in the array is returned via slot, and it points to
758 * the place where you would insert key if it is not found in
759 * the array.
760 *
761 * slot may point to max if the key is bigger than all of the keys
762 */
763static noinline int generic_bin_search(struct extent_buffer *eb,
764 unsigned long p,
765 int item_size, struct btrfs_key *key,
766 int max, int *slot)
767{
768 int low = 0;
769 int high = max;
770 int mid;
771 int ret;
772 struct btrfs_disk_key *tmp = NULL;
773 struct btrfs_disk_key unaligned;
774 unsigned long offset;
775 char *map_token = NULL;
776 char *kaddr = NULL;
777 unsigned long map_start = 0;
778 unsigned long map_len = 0;
779 int err;
780
781 while (low < high) {
782 mid = (low + high) / 2;
783 offset = p + mid * item_size;
784
785 if (!map_token || offset < map_start ||
786 (offset + sizeof(struct btrfs_disk_key)) >
787 map_start + map_len) {
788 if (map_token) {
789 unmap_extent_buffer(eb, map_token, KM_USER0);
790 map_token = NULL;
791 }
792 err = map_extent_buffer(eb, offset,
793 sizeof(struct btrfs_disk_key),
794 &map_token, &kaddr,
795 &map_start, &map_len, KM_USER0);
796
797 if (!err) {
798 tmp = (struct btrfs_disk_key *)(kaddr + offset -
799 map_start);
800 } else {
801 read_extent_buffer(eb, &unaligned,
802 offset, sizeof(unaligned));
803 tmp = &unaligned;
804 }
805
806 } else {
807 tmp = (struct btrfs_disk_key *)(kaddr + offset -
808 map_start);
809 }
810 ret = comp_keys(tmp, key);
811
812 if (ret < 0)
813 low = mid + 1;
814 else if (ret > 0)
815 high = mid;
816 else {
817 *slot = mid;
818 if (map_token)
819 unmap_extent_buffer(eb, map_token, KM_USER0);
820 return 0;
821 }
822 }
823 *slot = low;
824 if (map_token)
825 unmap_extent_buffer(eb, map_token, KM_USER0);
826 return 1;
827}
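
Stripped of the extent-buffer page-mapping bookkeeping, generic_bin_search() is a textbook lower-bound binary search over fixed-stride records. A self-contained sketch; it returns 0 with *slot at the match, or 1 with *slot at the insertion point (which may equal max):

#include <stddef.h>

static int bin_search_sketch(const unsigned char *base, size_t stride,
			     int max, const void *key,
			     int (*cmp)(const void *rec, const void *key),
			     int *slot)
{
	int low = 0, high = max;

	while (low < high) {
		int mid = (low + high) / 2;
		int ret = cmp(base + (size_t)mid * stride, key);

		if (ret < 0)
			low = mid + 1;		/* record sorts before key */
		else if (ret > 0)
			high = mid;		/* record sorts after key */
		else {
			*slot = mid;		/* exact match */
			return 0;
		}
	}
	*slot = low;				/* insertion point */
	return 1;
}
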
828
829/*
830 * simple bin_search frontend that does the right thing for
831 * leaves vs nodes
832 */
833static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
834 int level, int *slot)
835{
836 if (level == 0) {
837 return generic_bin_search(eb,
838 offsetof(struct btrfs_leaf, items),
839 sizeof(struct btrfs_item),
840 key, btrfs_header_nritems(eb),
841 slot);
842 } else {
843 return generic_bin_search(eb,
844 offsetof(struct btrfs_node, ptrs),
845 sizeof(struct btrfs_key_ptr),
846 key, btrfs_header_nritems(eb),
847 slot);
848 }
850 return -1; /* not reached */
850}
851
852/* given a node and slot number, this reads the block it points to. The
853 * extent buffer is returned with a reference taken (but unlocked).
854 * NULL is returned on error.
855 */
856static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
857 struct extent_buffer *parent, int slot)
858{
859 int level = btrfs_header_level(parent);
860 if (slot < 0)
861 return NULL;
862 if (slot >= btrfs_header_nritems(parent))
863 return NULL;
864
865 BUG_ON(level == 0);
866
867 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
868 btrfs_level_size(root, level - 1),
869 btrfs_node_ptr_generation(parent, slot));
870}
871
872/*
873 * node level balancing, used to make sure nodes are in proper order for
874 * item deletion. We balance from the top down, so we have to make sure
875 * that a deletion won't leave a node completely empty later on.
876 */
877static noinline int balance_level(struct btrfs_trans_handle *trans,
878 struct btrfs_root *root,
879 struct btrfs_path *path, int level)
880{
881 struct extent_buffer *right = NULL;
882 struct extent_buffer *mid;
883 struct extent_buffer *left = NULL;
884 struct extent_buffer *parent = NULL;
885 int ret = 0;
886 int wret;
887 int pslot;
888 int orig_slot = path->slots[level];
889 int err_on_enospc = 0;
890 u64 orig_ptr;
891
892 if (level == 0)
893 return 0;
894
895 mid = path->nodes[level];
896 WARN_ON(!path->locks[level]);
897 WARN_ON(btrfs_header_generation(mid) != trans->transid);
898
899 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
900
901 if (level < BTRFS_MAX_LEVEL - 1)
902 parent = path->nodes[level + 1];
903 pslot = path->slots[level + 1];
904
905 /*
906 * deal with the case where there is only one pointer in the root
907 * by promoting the node below to a root
908 */
909 if (!parent) {
910 struct extent_buffer *child;
911
912 if (btrfs_header_nritems(mid) != 1)
913 return 0;
914
915 /* promote the child to a root */
916 child = read_node_slot(root, mid, 0);
917 BUG_ON(!child);
918 btrfs_tree_lock(child);
919 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
920 BUG_ON(ret);
921
922 spin_lock(&root->node_lock);
923 root->node = child;
924 spin_unlock(&root->node_lock);
925
926 ret = btrfs_update_extent_ref(trans, root, child->start,
927 mid->start, child->start,
928 root->root_key.objectid,
929 trans->transid, level - 1);
930 BUG_ON(ret);
931
932 add_root_to_dirty_list(root);
933 btrfs_tree_unlock(child);
934 path->locks[level] = 0;
935 path->nodes[level] = NULL;
936 clean_tree_block(trans, root, mid);
937 btrfs_tree_unlock(mid);
938 /* once for the path */
939 free_extent_buffer(mid);
940 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
941 mid->start, root->root_key.objectid,
942 btrfs_header_generation(mid),
943 level, 1);
944 /* once for the root ptr */
945 free_extent_buffer(mid);
946 return ret;
947 }
948 if (btrfs_header_nritems(mid) >
949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
950 return 0;
951
952 if (btrfs_header_nritems(mid) < 2)
953 err_on_enospc = 1;
954
955 left = read_node_slot(root, parent, pslot - 1);
956 if (left) {
957 btrfs_tree_lock(left);
958 wret = btrfs_cow_block(trans, root, left,
959 parent, pslot - 1, &left, 0);
960 if (wret) {
961 ret = wret;
962 goto enospc;
963 }
964 }
965 right = read_node_slot(root, parent, pslot + 1);
966 if (right) {
967 btrfs_tree_lock(right);
968 wret = btrfs_cow_block(trans, root, right,
969 parent, pslot + 1, &right, 0);
970 if (wret) {
971 ret = wret;
972 goto enospc;
973 }
974 }
975
976 /* first, try to make some room in the middle buffer */
977 if (left) {
978 orig_slot += btrfs_header_nritems(left);
979 wret = push_node_left(trans, root, left, mid, 1);
980 if (wret < 0)
981 ret = wret;
982 if (btrfs_header_nritems(mid) < 2)
983 err_on_enospc = 1;
984 }
985
986 /*
987 * then try to empty the right most buffer into the middle
988 */
989 if (right) {
990 wret = push_node_left(trans, root, mid, right, 1);
991 if (wret < 0 && wret != -ENOSPC)
992 ret = wret;
993 if (btrfs_header_nritems(right) == 0) {
994 u64 bytenr = right->start;
995 u64 generation = btrfs_header_generation(parent);
996 u32 blocksize = right->len;
997
998 clean_tree_block(trans, root, right);
999 btrfs_tree_unlock(right);
1000 free_extent_buffer(right);
1001 right = NULL;
1002 wret = del_ptr(trans, root, path, level + 1, pslot +
1003 1);
1004 if (wret)
1005 ret = wret;
1006 wret = btrfs_free_extent(trans, root, bytenr,
1007 blocksize, parent->start,
1008 btrfs_header_owner(parent),
1009 generation, level, 1);
1010 if (wret)
1011 ret = wret;
1012 } else {
1013 struct btrfs_disk_key right_key;
1014 btrfs_node_key(right, &right_key, 0);
1015 btrfs_set_node_key(parent, &right_key, pslot + 1);
1016 btrfs_mark_buffer_dirty(parent);
1017 }
1018 }
1019 if (btrfs_header_nritems(mid) == 1) {
1020 /*
1021 * we're not allowed to leave a node with one item in the
1022 * tree during a delete. A deletion from lower in the tree
1023 * could try to delete the only pointer in this node.
1024 * So, pull some keys from the left.
1025 * There has to be a left pointer at this point because
1026 * otherwise we would have pulled some pointers from the
1027 * right.
1028 */
1029 BUG_ON(!left);
1030 wret = balance_node_right(trans, root, mid, left);
1031 if (wret < 0) {
1032 ret = wret;
1033 goto enospc;
1034 }
1035 if (wret == 1) {
1036 wret = push_node_left(trans, root, left, mid, 1);
1037 if (wret < 0)
1038 ret = wret;
1039 }
1040 BUG_ON(wret == 1);
1041 }
1042 if (btrfs_header_nritems(mid) == 0) {
1043 /* we've managed to empty the middle node, drop it */
1044 u64 root_gen = btrfs_header_generation(parent);
1045 u64 bytenr = mid->start;
1046 u32 blocksize = mid->len;
1047
1048 clean_tree_block(trans, root, mid);
1049 btrfs_tree_unlock(mid);
1050 free_extent_buffer(mid);
1051 mid = NULL;
1052 wret = del_ptr(trans, root, path, level + 1, pslot);
1053 if (wret)
1054 ret = wret;
1055 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1056 parent->start,
1057 btrfs_header_owner(parent),
1058 root_gen, level, 1);
1059 if (wret)
1060 ret = wret;
1061 } else {
1062 /* update the parent key to reflect our changes */
1063 struct btrfs_disk_key mid_key;
1064 btrfs_node_key(mid, &mid_key, 0);
1065 btrfs_set_node_key(parent, &mid_key, pslot);
1066 btrfs_mark_buffer_dirty(parent);
1067 }
1068
1069 /* update the path */
1070 if (left) {
1071 if (btrfs_header_nritems(left) > orig_slot) {
1072 extent_buffer_get(left);
1073 /* left was locked after cow */
1074 path->nodes[level] = left;
1075 path->slots[level + 1] -= 1;
1076 path->slots[level] = orig_slot;
1077 if (mid) {
1078 btrfs_tree_unlock(mid);
1079 free_extent_buffer(mid);
1080 }
1081 } else {
1082 orig_slot -= btrfs_header_nritems(left);
1083 path->slots[level] = orig_slot;
1084 }
1085 }
1086 /* double check we haven't messed things up */
1087 check_block(root, path, level);
1088 if (orig_ptr !=
1089 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1090 BUG();
1091enospc:
1092 if (right) {
1093 btrfs_tree_unlock(right);
1094 free_extent_buffer(right);
1095 }
1096 if (left) {
1097 if (path->nodes[level] != left)
1098 btrfs_tree_unlock(left);
1099 free_extent_buffer(left);
1100 }
1101 return ret;
1102}
1103
1104/* Node balancing for insertion. Here we only split or push nodes around
1105 * when they are completely full. This is also done top down, so we
1106 * have to be pessimistic.
1107 */
1108static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 struct btrfs_root *root,
1110 struct btrfs_path *path, int level)
1111{
1112 struct extent_buffer *right = NULL;
1113 struct extent_buffer *mid;
1114 struct extent_buffer *left = NULL;
1115 struct extent_buffer *parent = NULL;
1116 int ret = 0;
1117 int wret;
1118 int pslot;
1119 int orig_slot = path->slots[level];
1120 u64 orig_ptr;
1121
1122 if (level == 0)
1123 return 1;
1124
1125 mid = path->nodes[level];
1126 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1127 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1128
1129 if (level < BTRFS_MAX_LEVEL - 1)
1130 parent = path->nodes[level + 1];
1131 pslot = path->slots[level + 1];
1132
1133 if (!parent)
1134 return 1;
1135
1136 left = read_node_slot(root, parent, pslot - 1);
1137
1138 /* first, try to make some room in the middle buffer */
1139 if (left) {
1140 u32 left_nr;
1141
1142 btrfs_tree_lock(left);
1143 left_nr = btrfs_header_nritems(left);
1144 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1145 wret = 1;
1146 } else {
1147 ret = btrfs_cow_block(trans, root, left, parent,
1148 pslot - 1, &left, 0);
1149 if (ret)
1150 wret = 1;
1151 else {
1152 wret = push_node_left(trans, root,
1153 left, mid, 0);
1154 }
1155 }
1156 if (wret < 0)
1157 ret = wret;
1158 if (wret == 0) {
1159 struct btrfs_disk_key disk_key;
1160 orig_slot += left_nr;
1161 btrfs_node_key(mid, &disk_key, 0);
1162 btrfs_set_node_key(parent, &disk_key, pslot);
1163 btrfs_mark_buffer_dirty(parent);
1164 if (btrfs_header_nritems(left) > orig_slot) {
1165 path->nodes[level] = left;
1166 path->slots[level + 1] -= 1;
1167 path->slots[level] = orig_slot;
1168 btrfs_tree_unlock(mid);
1169 free_extent_buffer(mid);
1170 } else {
1171 orig_slot -=
1172 btrfs_header_nritems(left);
1173 path->slots[level] = orig_slot;
1174 btrfs_tree_unlock(left);
1175 free_extent_buffer(left);
1176 }
1177 return 0;
1178 }
1179 btrfs_tree_unlock(left);
1180 free_extent_buffer(left);
1181 }
1182 right = read_node_slot(root, parent, pslot + 1);
1183
1184 /*
1185 * then try to empty the right most buffer into the middle
1186 */
1187 if (right) {
1188 u32 right_nr;
1189 btrfs_tree_lock(right);
1190 right_nr = btrfs_header_nritems(right);
1191 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1192 wret = 1;
1193 } else {
1194 ret = btrfs_cow_block(trans, root, right,
1195 parent, pslot + 1,
1196 &right, 0);
1197 if (ret)
1198 wret = 1;
1199 else {
1200 wret = balance_node_right(trans, root,
1201 right, mid);
1202 }
1203 }
1204 if (wret < 0)
1205 ret = wret;
1206 if (wret == 0) {
1207 struct btrfs_disk_key disk_key;
1208
1209 btrfs_node_key(right, &disk_key, 0);
1210 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1211 btrfs_mark_buffer_dirty(parent);
1212
1213 if (btrfs_header_nritems(mid) <= orig_slot) {
1214 path->nodes[level] = right;
1215 path->slots[level + 1] += 1;
1216 path->slots[level] = orig_slot -
1217 btrfs_header_nritems(mid);
1218 btrfs_tree_unlock(mid);
1219 free_extent_buffer(mid);
1220 } else {
1221 btrfs_tree_unlock(right);
1222 free_extent_buffer(right);
1223 }
1224 return 0;
1225 }
1226 btrfs_tree_unlock(right);
1227 free_extent_buffer(right);
1228 }
1229 return 1;
1230}
1231
1232/*
1233 * readahead one full node of leaves, finding things that are close
1234 * to the block in 'slot', and triggering readahead on them.
1235 */
1236static noinline void reada_for_search(struct btrfs_root *root,
1237 struct btrfs_path *path,
1238 int level, int slot, u64 objectid)
1239{
1240 struct extent_buffer *node;
1241 struct btrfs_disk_key disk_key;
1242 u32 nritems;
1243 u64 search;
1244 u64 lowest_read;
1245 u64 highest_read;
1246 u64 nread = 0;
1247 int direction = path->reada;
1248 struct extent_buffer *eb;
1249 u32 nr;
1250 u32 blocksize;
1251 u32 nscan = 0;
1252
1253 if (level != 1)
1254 return;
1255
1256 if (!path->nodes[level])
1257 return;
1258
1259 node = path->nodes[level];
1260
1261 search = btrfs_node_blockptr(node, slot);
1262 blocksize = btrfs_level_size(root, level - 1);
1263 eb = btrfs_find_tree_block(root, search, blocksize);
1264 if (eb) {
1265 free_extent_buffer(eb);
1266 return;
1267 }
1268
1269 highest_read = search;
1270 lowest_read = search;
1271
1272 nritems = btrfs_header_nritems(node);
1273 nr = slot;
1274 while (1) {
1275 if (direction < 0) {
1276 if (nr == 0)
1277 break;
1278 nr--;
1279 } else if (direction > 0) {
1280 nr++;
1281 if (nr >= nritems)
1282 break;
1283 }
1284 if (path->reada < 0 && objectid) {
1285 btrfs_node_key(node, &disk_key, nr);
1286 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1287 break;
1288 }
1289 search = btrfs_node_blockptr(node, nr);
1290 if ((search >= lowest_read && search <= highest_read) ||
1291 (search < lowest_read && lowest_read - search <= 32768) ||
1292 (search > highest_read && search - highest_read <= 32768)) {
1293 readahead_tree_block(root, search, blocksize,
1294 btrfs_node_ptr_generation(node, nr));
1295 nread += blocksize;
1296 }
1297 nscan++;
1298 if (path->reada < 2 && (nread > (256 * 1024) || nscan > 32))
1299 break;
1300 if (nread > (1024 * 1024) || nscan > 128)
1301 break;
1302
1303 if (search < lowest_read)
1304 lowest_read = search;
1305 if (search > highest_read)
1306 highest_read = search;
1307 }
1308}
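
The loop walks outward from 'slot' and issues reads for any neighbor that lands inside, or within 32 KiB of, the range already read, stopping once enough bytes or slots have been scanned. The window test in isolation:

#include <stdint.h>

static int in_reada_window(uint64_t search, uint64_t lowest, uint64_t highest)
{
	/* inside the window already issued */
	if (search >= lowest && search <= highest)
		return 1;
	/* within 32 KiB below the window */
	if (search < lowest && lowest - search <= 32768)
		return 1;
	/* within 32 KiB above the window */
	if (search > highest && search - highest <= 32768)
		return 1;
	return 0;
}
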
1309
1310/*
1311 * when we walk down the tree, it is usually safe to unlock the higher layers in
1312 * the tree. The exceptions are when our path goes through slot 0, because operations
1313 * on the tree might require changing key pointers higher up in the tree.
1314 *
1315 * callers might also have set path->keep_locks, which tells this code to
1316 * keep the lock if the path points to the last slot in the block. This is
1317 * part of walking through the tree, and selecting the next slot in the higher
1318 * block.
1319 *
1320 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
1321 * so if lowest_unlock is 1, level 0 won't be unlocked
1322 */
1323static noinline void unlock_up(struct btrfs_path *path, int level,
1324 int lowest_unlock)
1325{
1326 int i;
1327 int skip_level = level;
1328 int no_skips = 0;
1329 struct extent_buffer *t;
1330
1331 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1332 if (!path->nodes[i])
1333 break;
1334 if (!path->locks[i])
1335 break;
1336 if (!no_skips && path->slots[i] == 0) {
1337 skip_level = i + 1;
1338 continue;
1339 }
1340 if (!no_skips && path->keep_locks) {
1341 u32 nritems;
1342 t = path->nodes[i];
1343 nritems = btrfs_header_nritems(t);
1344 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1345 skip_level = i + 1;
1346 continue;
1347 }
1348 }
1349 if (skip_level < i && i >= lowest_unlock)
1350 no_skips = 1;
1351
1352 t = path->nodes[i];
1353 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1354 btrfs_tree_unlock(t);
1355 path->locks[i] = 0;
1356 }
1357 }
1358}
1359
1360/*
1361 * look for key in the tree. path is filled in with nodes along the way
1362 * if key is found, we return zero and you can find the item in the leaf
1363 * level of the path (level 0)
1364 *
1365 * If the key isn't found, the path points to the slot where it should
1366 * be inserted, and 1 is returned. If there are other errors during the
1367 * search a negative error number is returned.
1368 *
1369 * if ins_len > 0, nodes and leaves will be split as we walk down the
1370 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1371 * possible)
1372 */
1373int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1374 *root, struct btrfs_key *key, struct btrfs_path *p, int
1375 ins_len, int cow)
1376{
1377 struct extent_buffer *b;
1378 struct extent_buffer *tmp;
1379 int slot;
1380 int ret;
1381 int level;
1382 int should_reada = p->reada;
1383 int lowest_unlock = 1;
1384 int blocksize;
1385 u8 lowest_level = 0;
1386 u64 blocknr;
1387 u64 gen;
1388 struct btrfs_key prealloc_block;
1389
1390 lowest_level = p->lowest_level;
1391 WARN_ON(lowest_level && ins_len > 0);
1392 WARN_ON(p->nodes[0] != NULL);
1393 WARN_ON(cow && root == root->fs_info->extent_root &&
1394 !mutex_is_locked(&root->fs_info->alloc_mutex));
1395 if (ins_len < 0)
1396 lowest_unlock = 2;
1397
1398 prealloc_block.objectid = 0;
1399
1400again:
1401 if (p->skip_locking)
1402 b = btrfs_root_node(root);
1403 else
1404 b = btrfs_lock_root_node(root);
1405
1406 while (b) {
1407 level = btrfs_header_level(b);
1408
1409 /*
1410 * setup the path here so we can release it under lock
1411 * contention with the cow code
1412 */
1413 p->nodes[level] = b;
1414 if (!p->skip_locking)
1415 p->locks[level] = 1;
1416
1417 if (cow) {
1418 int wret;
1419
1420 /* can we skip cowing this block? */
1421 spin_lock(&root->fs_info->hash_lock);
1422 if (btrfs_header_generation(b) == trans->transid &&
1423 btrfs_header_owner(b) == root->root_key.objectid &&
1424 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1425 spin_unlock(&root->fs_info->hash_lock);
1426 goto cow_done;
1427 }
1428 spin_unlock(&root->fs_info->hash_lock);
1429
1430 /* ok, we have to cow, is our old prealloc the right
1431 * size?
1432 */
1433 if (prealloc_block.objectid &&
1434 prealloc_block.offset != b->len) {
1435 btrfs_free_reserved_extent(root,
1436 prealloc_block.objectid,
1437 prealloc_block.offset);
1438 prealloc_block.objectid = 0;
1439 }
1440
1441 /*
1442 * for higher level blocks, try not to allocate blocks
1443 * with the block and the parent locks held.
1444 */
1445 if (level > 1 && !prealloc_block.objectid &&
1446 btrfs_path_lock_waiting(p, level)) {
1447 u32 size = b->len;
1448 u64 hint = b->start;
1449
1450 btrfs_release_path(root, p);
1451 ret = btrfs_reserve_extent(trans, root,
1452 size, size, 0,
1453 hint, (u64)-1,
1454 &prealloc_block, 0);
1455 BUG_ON(ret);
1456 goto again;
1457 }
1458
1459 wret = btrfs_cow_block(trans, root, b,
1460 p->nodes[level + 1],
1461 p->slots[level + 1],
1462 &b, prealloc_block.objectid);
1463 prealloc_block.objectid = 0;
1464 if (wret) {
1465 free_extent_buffer(b);
1466 ret = wret;
1467 goto done;
1468 }
1469 }
1470cow_done:
1471 BUG_ON(!cow && ins_len);
1472 if (level != btrfs_header_level(b))
1473 WARN_ON(1);
1474 level = btrfs_header_level(b);
1475
1476 p->nodes[level] = b;
1477 if (!p->skip_locking)
1478 p->locks[level] = 1;
1479
1480 ret = check_block(root, p, level);
1481 if (ret) {
1482 ret = -1;
1483 goto done;
1484 }
1485
1486 ret = bin_search(b, key, level, &slot);
1487 if (level != 0) {
1488 if (ret && slot > 0)
1489 slot -= 1;
1490 p->slots[level] = slot;
1491 if (ins_len > 0 && btrfs_header_nritems(b) >=
1492 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1493 int sret = split_node(trans, root, p, level);
1494 BUG_ON(sret > 0);
1495 if (sret) {
1496 ret = sret;
1497 goto done;
1498 }
1499 b = p->nodes[level];
1500 slot = p->slots[level];
1501 } else if (ins_len < 0) {
1502 int sret = balance_level(trans, root, p,
1503 level);
1504 if (sret) {
1505 ret = sret;
1506 goto done;
1507 }
1508 b = p->nodes[level];
1509 if (!b) {
1510 btrfs_release_path(NULL, p);
1511 goto again;
1512 }
1513 slot = p->slots[level];
1514 BUG_ON(btrfs_header_nritems(b) == 1);
1515 }
1516 unlock_up(p, level, lowest_unlock);
1517
1518 /* this is only true while dropping a snapshot */
1519 if (level == lowest_level) {
1520 ret = 0;
1521 goto done;
1522 }
1523
1524 blocknr = btrfs_node_blockptr(b, slot);
1525 gen = btrfs_node_ptr_generation(b, slot);
1526 blocksize = btrfs_level_size(root, level - 1);
1527
1528 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1529 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1530 b = tmp;
1531 } else {
1532 /*
1533 * reduce lock contention at high levels
1534 * of the btree by dropping locks before
1535 * we read.
1536 */
1537 if (level > 1) {
1538 btrfs_release_path(NULL, p);
1539 if (tmp)
1540 free_extent_buffer(tmp);
1541 if (should_reada)
1542 reada_for_search(root, p,
1543 level, slot,
1544 key->objectid);
1545
1546 tmp = read_tree_block(root, blocknr,
1547 blocksize, gen);
1548 if (tmp)
1549 free_extent_buffer(tmp);
1550 goto again;
1551 } else {
1552 if (tmp)
1553 free_extent_buffer(tmp);
1554 if (should_reada)
1555 reada_for_search(root, p,
1556 level, slot,
1557 key->objectid);
1558 b = read_node_slot(root, b, slot);
1559 }
1560 }
1561 if (!p->skip_locking)
1562 btrfs_tree_lock(b);
1563 } else {
1564 p->slots[level] = slot;
1565 if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
1566 sizeof(struct btrfs_item) + ins_len) {
1567 int sret = split_leaf(trans, root, key,
1568 p, ins_len, ret == 0);
1569 BUG_ON(sret > 0);
1570 if (sret) {
1571 ret = sret;
1572 goto done;
1573 }
1574 }
1575 unlock_up(p, level, lowest_unlock);
1576 goto done;
1577 }
1578 }
1579 ret = 1;
1580done:
1581 if (prealloc_block.objectid) {
1582 btrfs_free_reserved_extent(root,
1583 prealloc_block.objectid,
1584 prealloc_block.offset);
1585 }
1586
1587 return ret;
1588}
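
Underneath the COW, locking, splitting and readahead, the descent itself is the classic B-tree search: binary-search each node, and on a miss in an interior node step into the child to the left of the insertion point. A minimal sketch over a hypothetical in-memory node type:

/* toy node: single-field keys, at most 64 slots */
struct mem_node {
	int level;			/* 0 == leaf */
	int nritems;
	unsigned long long keys[64];
	struct mem_node *child[64];	/* only valid when level > 0 */
};

/* lower-bound search within one node */
static int node_bin_search(const struct mem_node *n, unsigned long long key,
			   int *slot)
{
	int low = 0, high = n->nritems;

	while (low < high) {
		int mid = (low + high) / 2;

		if (n->keys[mid] < key)
			low = mid + 1;
		else if (n->keys[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/* returns 0 if found (leaf slot in slots[0]), 1 if not */
static int search_sketch(struct mem_node *root, unsigned long long key,
			 int slots[])
{
	struct mem_node *b = root;

	while (b) {
		int slot, ret = node_bin_search(b, key, &slot);

		if (b->level == 0) {	/* leaf: done either way */
			slots[0] = slot;
			return ret;
		}
		/* interior node: an unmatched key belongs in the child
		 * to the left of the insertion point */
		if (ret && slot > 0)
			slot--;
		slots[b->level] = slot;
		b = b->child[slot];
	}
	return 1;
}
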
1589
1590int btrfs_merge_path(struct btrfs_trans_handle *trans,
1591 struct btrfs_root *root,
1592 struct btrfs_key *node_keys,
1593 u64 *nodes, int lowest_level)
1594{
1595 struct extent_buffer *eb;
1596 struct extent_buffer *parent;
1597 struct btrfs_key key;
1598 u64 bytenr;
1599 u64 generation;
1600 u32 blocksize;
1601 int level;
1602 int slot;
1603 int key_match;
1604 int ret;
1605
1606 eb = btrfs_lock_root_node(root);
1607 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1608 BUG_ON(ret);
1609
1610 parent = eb;
1611 while (1) {
1612 level = btrfs_header_level(parent);
1613 if (level == 0 || level <= lowest_level)
1614 break;
1615
1616 ret = bin_search(parent, &node_keys[lowest_level], level,
1617 &slot);
1618 if (ret && slot > 0)
1619 slot--;
1620
1621 bytenr = btrfs_node_blockptr(parent, slot);
1622 if (nodes[level - 1] == bytenr)
1623 break;
1624
1625 blocksize = btrfs_level_size(root, level - 1);
1626 generation = btrfs_node_ptr_generation(parent, slot);
1627 btrfs_node_key_to_cpu(eb, &key, slot);
1628 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1629
1630 /*
1631 * if the node keys match and the node pointer hasn't been
1632 * modified in the running transaction, we can merge the path.
1633 * for reloc trees, the node pointer check is skipped; this is
1634 * because the reloc trees are fully controlled by the space
1635 * balance code, and no one else can modify them.
1636 */
1637 if (!nodes[level - 1] || !key_match ||
1638 (generation == trans->transid &&
1639 root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
1640next_level:
1641 if (level == 1 || level == lowest_level + 1)
1642 break;
1643
1644 eb = read_tree_block(root, bytenr, blocksize,
1645 generation);
1646 btrfs_tree_lock(eb);
1647
1648 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1649 &eb, 0);
1650 BUG_ON(ret);
1651
1652 btrfs_tree_unlock(parent);
1653 free_extent_buffer(parent);
1654 parent = eb;
1655 continue;
1656 }
1657
1658 if (generation == trans->transid) {
1659 u32 refs;
1660 BUG_ON(btrfs_header_owner(eb) !=
1661 BTRFS_TREE_RELOC_OBJECTID);
1662 /*
1663 * lock the block to keep __btrfs_cow_block from
1664 * changing the reference count.
1665 */
1666 eb = read_tree_block(root, bytenr, blocksize,
1667 generation);
1668 btrfs_tree_lock(eb);
1669
1670 ret = btrfs_lookup_extent_ref(trans, root, bytenr,
1671 blocksize, &refs);
1672 BUG_ON(ret);
1673 /*
1674 * if we replace a block whose reference count is one,
1675 * we have to "drop the subtree", so skip it for
1676 * simplicity.
1677 */
1678 if (refs == 1) {
1679 btrfs_tree_unlock(eb);
1680 free_extent_buffer(eb);
1681 goto next_level;
1682 }
1683 }
1684
1685 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1686 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1687 btrfs_mark_buffer_dirty(parent);
1688
1689 ret = btrfs_inc_extent_ref(trans, root,
1690 nodes[level - 1],
1691 blocksize, parent->start,
1692 btrfs_header_owner(parent),
1693 btrfs_header_generation(parent),
1694 level - 1);
1695 BUG_ON(ret);
1696 ret = btrfs_free_extent(trans, root, bytenr,
1697 blocksize, parent->start,
1698 btrfs_header_owner(parent),
1699 btrfs_header_generation(parent),
1700 level - 1, 1);
1701 BUG_ON(ret);
1702
1703 if (generation == trans->transid) {
1704 btrfs_tree_unlock(eb);
1705 free_extent_buffer(eb);
1706 }
1707 break;
1708 }
1709 btrfs_tree_unlock(parent);
1710 free_extent_buffer(parent);
1711 return 0;
1712}
1713
1714/*
1715 * adjust the pointers going up the tree, starting at level,
1716 * making sure the right key of each node points to 'key'.
1717 * This is used after shifting pointers to the left, so it stops
1718 * fixing up pointers when a given leaf/node is not in slot 0 of the
1719 * higher levels
1720 *
1721 * If this fails to write a tree block, it returns -1, but continues
1722 * fixing up the blocks in ram so the tree is consistent.
1723 */
1724static int fixup_low_keys(struct btrfs_trans_handle *trans,
1725 struct btrfs_root *root, struct btrfs_path *path,
1726 struct btrfs_disk_key *key, int level)
1727{
1728 int i;
1729 int ret = 0;
1730 struct extent_buffer *t;
1731
1732 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1733 int tslot = path->slots[i];
1734 if (!path->nodes[i])
1735 break;
1736 t = path->nodes[i];
1737 btrfs_set_node_key(t, key, tslot);
1738 btrfs_mark_buffer_dirty(path->nodes[i]);
1739 if (tslot != 0)
1740 break;
1741 }
1742 return ret;
1743}
1744
1745/*
1746 * update item key.
1747 *
1748 * This function isn't completely safe. It's the caller's responsibility
1749 * that the new key won't break the order
1750 */
1751int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1752 struct btrfs_root *root, struct btrfs_path *path,
1753 struct btrfs_key *new_key)
1754{
1755 struct btrfs_disk_key disk_key;
1756 struct extent_buffer *eb;
1757 int slot;
1758
1759 eb = path->nodes[0];
1760 slot = path->slots[0];
1761 if (slot > 0) {
1762 btrfs_item_key(eb, &disk_key, slot - 1);
1763 if (comp_keys(&disk_key, new_key) >= 0)
1764 return -1;
1765 }
1766 if (slot < btrfs_header_nritems(eb) - 1) {
1767 btrfs_item_key(eb, &disk_key, slot + 1);
1768 if (comp_keys(&disk_key, new_key) <= 0)
1769 return -1;
1770 }
1771
1772 btrfs_cpu_key_to_disk(&disk_key, new_key);
1773 btrfs_set_item_key(eb, &disk_key, slot);
1774 btrfs_mark_buffer_dirty(eb);
1775 if (slot == 0)
1776 fixup_low_keys(trans, root, path, &disk_key, 1);
1777 return 0;
1778}
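
The function refuses to rewrite a key in place unless it still sorts strictly between its neighbors, so the leaf stays ordered. The same invariant over toy single-field keys:

static int set_key_safe(unsigned long long keys[], int nritems, int slot,
			unsigned long long new_key)
{
	if (slot > 0 && keys[slot - 1] >= new_key)
		return -1;		/* would break order on the left */
	if (slot < nritems - 1 && keys[slot + 1] <= new_key)
		return -1;		/* would break order on the right */
	keys[slot] = new_key;
	return 0;
}
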
1779
1780/*
1781 * try to push data from one node into the next node left in the
1782 * tree.
1783 *
1784 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1785 * error, and > 0 if there was no room in the left hand block.
1786 */
1787static int push_node_left(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root, struct extent_buffer *dst,
1789 struct extent_buffer *src, int empty)
1790{
1791 int push_items = 0;
1792 int src_nritems;
1793 int dst_nritems;
1794 int ret = 0;
1795
1796 src_nritems = btrfs_header_nritems(src);
1797 dst_nritems = btrfs_header_nritems(dst);
1798 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1799 WARN_ON(btrfs_header_generation(src) != trans->transid);
1800 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1801
1802 if (!empty && src_nritems <= 8)
1803 return 1;
1804
1805 if (push_items <= 0) {
1806 return 1;
1807 }
1808
1809 if (empty) {
1810 push_items = min(src_nritems, push_items);
1811 if (push_items < src_nritems) {
1812 /* leave at least 8 pointers in the node if
1813 * we aren't going to empty it
1814 */
1815 if (src_nritems - push_items < 8) {
1816 if (push_items <= 8)
1817 return 1;
1818 push_items -= 8;
1819 }
1820 }
1821 } else
1822 push_items = min(src_nritems - 8, push_items);
1823
1824 copy_extent_buffer(dst, src,
1825 btrfs_node_key_ptr_offset(dst_nritems),
1826 btrfs_node_key_ptr_offset(0),
1827 push_items * sizeof(struct btrfs_key_ptr));
1828
1829 if (push_items < src_nritems) {
1830 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1831 btrfs_node_key_ptr_offset(push_items),
1832 (src_nritems - push_items) *
1833 sizeof(struct btrfs_key_ptr));
1834 }
1835 btrfs_set_header_nritems(src, src_nritems - push_items);
1836 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1837 btrfs_mark_buffer_dirty(src);
1838 btrfs_mark_buffer_dirty(dst);
1839
1840 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1841 BUG_ON(ret);
1842
1843 return ret;
1844}
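
At its core the push is two moves over fixed-size records: append the first push_items records of src to the end of dst, then compact src. In isolation:

#include <stddef.h>
#include <string.h>

static void push_left_sketch(unsigned char *dst, int *dst_n,
			     unsigned char *src, int *src_n,
			     int push_items, size_t rec)
{
	/* append the first push_items records of src to the end of dst */
	memcpy(dst + (size_t)*dst_n * rec, src, (size_t)push_items * rec);
	/* slide what is left in src down to slot 0 */
	memmove(src, src + (size_t)push_items * rec,
		(size_t)(*src_n - push_items) * rec);
	*dst_n += push_items;
	*src_n -= push_items;
}
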
1845
1846/*
1847 * try to push data from one node into the next node right in the
1848 * tree.
1849 *
1850 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1851 * error, and > 0 if there was no room in the right hand block.
1852 *
1853 * this will only push up to 1/2 the contents of the left node over
1854 */
1855static int balance_node_right(struct btrfs_trans_handle *trans,
1856 struct btrfs_root *root,
1857 struct extent_buffer *dst,
1858 struct extent_buffer *src)
1859{
1860 int push_items = 0;
1861 int max_push;
1862 int src_nritems;
1863 int dst_nritems;
1864 int ret = 0;
1865
1866 WARN_ON(btrfs_header_generation(src) != trans->transid);
1867 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1868
1869 src_nritems = btrfs_header_nritems(src);
1870 dst_nritems = btrfs_header_nritems(dst);
1871 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1872 if (push_items <= 0) {
1873 return 1;
1874 }
1875
1876 if (src_nritems < 4) {
1877 return 1;
1878 }
1879
1880 max_push = src_nritems / 2 + 1;
1881 /* don't try to empty the node */
1882 if (max_push >= src_nritems) {
1883 return 1;
1884 }
1885
1886 if (max_push < push_items)
1887 push_items = max_push;
1888
1889 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1890 btrfs_node_key_ptr_offset(0),
1891 (dst_nritems) *
1892 sizeof(struct btrfs_key_ptr));
1893
1894 copy_extent_buffer(dst, src,
1895 btrfs_node_key_ptr_offset(0),
1896 btrfs_node_key_ptr_offset(src_nritems - push_items),
1897 push_items * sizeof(struct btrfs_key_ptr));
1898
1899 btrfs_set_header_nritems(src, src_nritems - push_items);
1900 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1901
1902 btrfs_mark_buffer_dirty(src);
1903 btrfs_mark_buffer_dirty(dst);
1904
1905 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1906 BUG_ON(ret);
1907
1908 return ret;
1909}
1910
1911/*
1912 * helper function to insert a new root level in the tree.
1913 * A new node is allocated, and a single item is inserted to
1914 * point to the existing root
1915 *
1916 * returns zero on success or < 0 on failure.
1917 */
1918static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1919 struct btrfs_root *root,
1920 struct btrfs_path *path, int level)
1921{
1922 u64 lower_gen;
1923 struct extent_buffer *lower;
1924 struct extent_buffer *c;
1925 struct extent_buffer *old;
1926 struct btrfs_disk_key lower_key;
1927 int ret;
1928
1929 BUG_ON(path->nodes[level]);
1930 BUG_ON(path->nodes[level-1] != root->node);
1931
1932 lower = path->nodes[level-1];
1933 if (level == 1)
1934 btrfs_item_key(lower, &lower_key, 0);
1935 else
1936 btrfs_node_key(lower, &lower_key, 0);
1937
1938 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1939 root->root_key.objectid, trans->transid,
1940 level, root->node->start, 0);
1941 if (IS_ERR(c))
1942 return PTR_ERR(c);
1943
1944 memset_extent_buffer(c, 0, 0, root->nodesize);
1945 btrfs_set_header_nritems(c, 1);
1946 btrfs_set_header_level(c, level);
1947 btrfs_set_header_bytenr(c, c->start);
1948 btrfs_set_header_generation(c, trans->transid);
1949 btrfs_set_header_owner(c, root->root_key.objectid);
1950
1951 write_extent_buffer(c, root->fs_info->fsid,
1952 (unsigned long)btrfs_header_fsid(c),
1953 BTRFS_FSID_SIZE);
1954
1955 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1956 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1957 BTRFS_UUID_SIZE);
1958
1959 btrfs_set_node_key(c, &lower_key, 0);
1960 btrfs_set_node_blockptr(c, 0, lower->start);
1961 lower_gen = btrfs_header_generation(lower);
1962 WARN_ON(lower_gen != trans->transid);
1963
1964 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1965
1966 btrfs_mark_buffer_dirty(c);
1967
1968 spin_lock(&root->node_lock);
1969 old = root->node;
1970 root->node = c;
1971 spin_unlock(&root->node_lock);
1972
1973 ret = btrfs_update_extent_ref(trans, root, lower->start,
1974 lower->start, c->start,
1975 root->root_key.objectid,
1976 trans->transid, level - 1);
1977 BUG_ON(ret);
1978
1979 /* the super has an extra ref to root->node */
1980 free_extent_buffer(old);
1981
1982 add_root_to_dirty_list(root);
1983 extent_buffer_get(c);
1984 path->nodes[level] = c;
1985 path->locks[level] = 1;
1986 path->slots[level] = 0;
1987 return 0;
1988}
1989
1990/*
1991 * worker function to insert a single pointer in a node.
1992 * the node should have enough room for the pointer already
1993 *
1994 * slot and level indicate where you want the key to go, and
1995 * blocknr is the block the key points to.
1996 *
1997 * returns zero on success and < 0 on any error
1998 */
1999static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2000 *root, struct btrfs_path *path, struct btrfs_disk_key
2001 *key, u64 bytenr, int slot, int level)
2002{
2003 struct extent_buffer *lower;
2004 int nritems;
2005
2006 BUG_ON(!path->nodes[level]);
2007 lower = path->nodes[level];
2008 nritems = btrfs_header_nritems(lower);
2009 if (slot > nritems)
2010 BUG();
2011 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2012 BUG();
2013 if (slot != nritems) {
2014 memmove_extent_buffer(lower,
2015 btrfs_node_key_ptr_offset(slot + 1),
2016 btrfs_node_key_ptr_offset(slot),
2017 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2018 }
2019 btrfs_set_node_key(lower, key, slot);
2020 btrfs_set_node_blockptr(lower, slot, bytenr);
2021 WARN_ON(trans->transid == 0);
2022 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2023 btrfs_set_header_nritems(lower, nritems + 1);
2024 btrfs_mark_buffer_dirty(lower);
2025 return 0;
2026}
2027
2028/*
2029 * split the node at the specified level in path in two.
2030 * The path is corrected to point to the appropriate node after the split
2031 *
2032 * Before splitting this tries to make some room in the node by pushing
2033 * left and right, if either one works, it returns right away.
2034 *
2035 * returns 0 on success and < 0 on failure
2036 */
2037static noinline int split_node(struct btrfs_trans_handle *trans,
2038 struct btrfs_root *root,
2039 struct btrfs_path *path, int level)
2040{
2041 struct extent_buffer *c;
2042 struct extent_buffer *split;
2043 struct btrfs_disk_key disk_key;
2044 int mid;
2045 int ret;
2046 int wret;
2047 u32 c_nritems;
2048
2049 c = path->nodes[level];
2050 WARN_ON(btrfs_header_generation(c) != trans->transid);
2051 if (c == root->node) {
2052 /* trying to split the root, let's make a new one */
2053 ret = insert_new_root(trans, root, path, level + 1);
2054 if (ret)
2055 return ret;
2056 } else {
2057 ret = push_nodes_for_insert(trans, root, path, level);
2058 c = path->nodes[level];
2059 if (!ret && btrfs_header_nritems(c) <
2060 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2061 return 0;
2062 if (ret < 0)
2063 return ret;
2064 }
2065
2066 c_nritems = btrfs_header_nritems(c);
2067
2068 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2069 path->nodes[level + 1]->start,
2070 root->root_key.objectid,
2071 trans->transid, level, c->start, 0);
2072 if (IS_ERR(split))
2073 return PTR_ERR(split);
2074
2076 btrfs_set_header_level(split, btrfs_header_level(c));
2077 btrfs_set_header_bytenr(split, split->start);
2078 btrfs_set_header_generation(split, trans->transid);
2079 btrfs_set_header_owner(split, root->root_key.objectid);
2080 btrfs_set_header_flags(split, 0);
2081 write_extent_buffer(split, root->fs_info->fsid,
2082 (unsigned long)btrfs_header_fsid(split),
2083 BTRFS_FSID_SIZE);
2084 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2085 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2086 BTRFS_UUID_SIZE);
2087
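	/* the upper half of the pointers will move into the new block */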
2088 mid = (c_nritems + 1) / 2;
2089
2090 copy_extent_buffer(split, c,
2091 btrfs_node_key_ptr_offset(0),
2092 btrfs_node_key_ptr_offset(mid),
2093 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2094 btrfs_set_header_nritems(split, c_nritems - mid);
2095 btrfs_set_header_nritems(c, mid);
2096 ret = 0;
2097
2098 btrfs_mark_buffer_dirty(c);
2099 btrfs_mark_buffer_dirty(split);
2100
2101 btrfs_node_key(split, &disk_key, 0);
2102 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2103 path->slots[level + 1] + 1,
2104 level + 1);
2105 if (wret)
2106 ret = wret;
2107
2108 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2109 BUG_ON(ret);
2110
2111 if (path->slots[level] >= mid) {
2112 path->slots[level] -= mid;
2113 btrfs_tree_unlock(c);
2114 free_extent_buffer(c);
2115 path->nodes[level] = split;
2116 path->slots[level + 1] += 1;
2117 } else {
2118 btrfs_tree_unlock(split);
2119 free_extent_buffer(split);
2120 }
2121 return ret;
2122}
2123
2124/*
2125 * how many bytes are required to store the items in a leaf. start
2126 * and nr indicate which items in the leaf to check. This totals up the
2127 * space used both by the item structs and the item data
2128 */
2129static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2130{
2131 int data_len;
2132 int nritems = btrfs_header_nritems(l);
2133 int end = min(nritems, start + nr) - 1;
2134
2135 if (!nr)
2136 return 0;
2137 data_len = btrfs_item_end_nr(l, start);
2138 data_len = data_len - btrfs_item_offset_nr(l, end);
2139 data_len += sizeof(struct btrfs_item) * nr;
2140 WARN_ON(data_len < 0);
2141 return data_len;
2142}
2143
2144/*
2145 * The space between the end of the leaf items and
2146 * the start of the leaf data. IOW, how much room
2147 * the leaf has left for both items and data
2148 */
2149noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2150 struct extent_buffer *leaf)
2151{
2152 int nritems = btrfs_header_nritems(leaf);
2153 int ret;
2154 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2155 if (ret < 0) {
2156 printk(KERN_ERR "leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
2157 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2158 leaf_space_used(leaf, 0, nritems), nritems);
2159 }
2160 return ret;
2161}
2162
2163/*
2164 * push some data in the path leaf to the right, trying to free up at
2165 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2166 *
2167 * returns 1 if the push failed because the other node didn't have enough
2168 * room, 0 if everything worked out and < 0 if there were major errors.
2169 */
2170static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2171 *root, struct btrfs_path *path, int data_size,
2172 int empty)
2173{
2174 struct extent_buffer *left = path->nodes[0];
2175 struct extent_buffer *right;
2176 struct extent_buffer *upper;
2177 struct btrfs_disk_key disk_key;
2178 int slot;
2179 u32 i;
2180 int free_space;
2181 int push_space = 0;
2182 int push_items = 0;
2183 struct btrfs_item *item;
2184 u32 left_nritems;
2185 u32 nr;
2186 u32 right_nritems;
2187 u32 data_end;
2188 u32 this_item_size;
2189 int ret;
2190
2191 slot = path->slots[1];
2192 if (!path->nodes[1])
2193 return 1;
2195 upper = path->nodes[1];
2196 if (slot >= btrfs_header_nritems(upper) - 1)
2197 return 1;
2198
2199 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2200
2201 right = read_node_slot(root, upper, slot + 1);
2202 btrfs_tree_lock(right);
2203 free_space = btrfs_leaf_free_space(root, right);
2204 if (free_space < data_size + sizeof(struct btrfs_item))
2205 goto out_unlock;
2206
2207 /* cow and double check */
2208 ret = btrfs_cow_block(trans, root, right, upper,
2209 slot + 1, &right, 0);
2210 if (ret)
2211 goto out_unlock;
2212
2213 free_space = btrfs_leaf_free_space(root, right);
2214 if (free_space < data_size + sizeof(struct btrfs_item))
2215 goto out_unlock;
2216
2217 left_nritems = btrfs_header_nritems(left);
2218 if (left_nritems == 0)
2219 goto out_unlock;
2220
2221 if (empty)
2222 nr = 0;
2223 else
2224 nr = 1;
2225
2226 if (path->slots[0] >= left_nritems)
2227 push_space += data_size + sizeof(*item);
2228
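	/* walk backwards from the end of left, totalling up items that will fit in right */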
2229 i = left_nritems - 1;
2230 while (i >= nr) {
2231 item = btrfs_item_nr(left, i);
2232
2233 if (!empty && push_items > 0) {
2234 if (path->slots[0] > i)
2235 break;
2236 if (path->slots[0] == i) {
2237 int space = btrfs_leaf_free_space(root, left);
2238 if (space + push_space * 2 > free_space)
2239 break;
2240 }
2241 }
2242
2243 if (path->slots[0] == i)
2244 push_space += data_size + sizeof(*item);
2245
2246 if (!left->map_token) {
2247 map_extent_buffer(left, (unsigned long)item,
2248 sizeof(struct btrfs_item),
2249 &left->map_token, &left->kaddr,
2250 &left->map_start, &left->map_len,
2251 KM_USER1);
2252 }
2253
2254 this_item_size = btrfs_item_size(left, item);
2255 if (this_item_size + sizeof(*item) + push_space > free_space)
2256 break;
2257
2258 push_items++;
2259 push_space += this_item_size + sizeof(*item);
2260 if (i == 0)
2261 break;
2262 i--;
2263 }
2264 if (left->map_token) {
2265 unmap_extent_buffer(left, left->map_token, KM_USER1);
2266 left->map_token = NULL;
2267 }
2268
2269 if (push_items == 0)
2270 goto out_unlock;
2271
2272 WARN_ON(!empty && push_items == left_nritems);
2274
2275 /* push left to right */
2276 right_nritems = btrfs_header_nritems(right);
2277
2278 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2279 push_space -= leaf_data_end(root, left);
2280
2281 /* make room in the right data area */
2282 data_end = leaf_data_end(root, right);
2283 memmove_extent_buffer(right,
2284 btrfs_leaf_data(right) + data_end - push_space,
2285 btrfs_leaf_data(right) + data_end,
2286 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2287
2288 /* copy from the left data area */
2289 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2290 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2291 btrfs_leaf_data(left) + leaf_data_end(root, left),
2292 push_space);
2293
2294 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2295 btrfs_item_nr_offset(0),
2296 right_nritems * sizeof(struct btrfs_item));
2297
2298 /* copy the items from left to right */
2299 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2300 btrfs_item_nr_offset(left_nritems - push_items),
2301 push_items * sizeof(struct btrfs_item));
2302
2303 /* update the item pointers */
2304 right_nritems += push_items;
2305 btrfs_set_header_nritems(right, right_nritems);
2306 push_space = BTRFS_LEAF_DATA_SIZE(root);
2307 for (i = 0; i < right_nritems; i++) {
2308 item = btrfs_item_nr(right, i);
2309 if (!right->map_token) {
2310 map_extent_buffer(right, (unsigned long)item,
2311 sizeof(struct btrfs_item),
2312 &right->map_token, &right->kaddr,
2313 &right->map_start, &right->map_len,
2314 KM_USER1);
2315 }
2316 push_space -= btrfs_item_size(right, item);
2317 btrfs_set_item_offset(right, item, push_space);
2318 }
2319
2320 if (right->map_token) {
2321 unmap_extent_buffer(right, right->map_token, KM_USER1);
2322 right->map_token = NULL;
2323 }
2324 left_nritems -= push_items;
2325 btrfs_set_header_nritems(left, left_nritems);
2326
2327 if (left_nritems)
2328 btrfs_mark_buffer_dirty(left);
2329 btrfs_mark_buffer_dirty(right);
2330
2331 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2332 BUG_ON(ret);
2333
2334 btrfs_item_key(right, &disk_key, 0);
2335 btrfs_set_node_key(upper, &disk_key, slot + 1);
2336 btrfs_mark_buffer_dirty(upper);
2337
2338 /* then fixup the leaf pointer in the path */
2339 if (path->slots[0] >= left_nritems) {
2340 path->slots[0] -= left_nritems;
2341 if (btrfs_header_nritems(path->nodes[0]) == 0)
2342 clean_tree_block(trans, root, path->nodes[0]);
2343 btrfs_tree_unlock(path->nodes[0]);
2344 free_extent_buffer(path->nodes[0]);
2345 path->nodes[0] = right;
2346 path->slots[1] += 1;
2347 } else {
2348 btrfs_tree_unlock(right);
2349 free_extent_buffer(right);
2350 }
2351 return 0;
2352
2353out_unlock:
2354 btrfs_tree_unlock(right);
2355 free_extent_buffer(right);
2356 return 1;
2357}
2358
2359/*
2360 * push some data in the path leaf to the left, trying to free up at
2361 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2362 */
2363static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2364 *root, struct btrfs_path *path, int data_size,
2365 int empty)
2366{
2367 struct btrfs_disk_key disk_key;
2368 struct extent_buffer *right = path->nodes[0];
2369 struct extent_buffer *left;
2370 int slot;
2371 int i;
2372 int free_space;
2373 int push_space = 0;
2374 int push_items = 0;
2375 struct btrfs_item *item;
2376 u32 old_left_nritems;
2377 u32 right_nritems;
2378 u32 nr;
2379 int ret = 0;
2380 int wret;
2381 u32 this_item_size;
2382 u32 old_left_item_size;
2383
2384 slot = path->slots[1];
2385 if (slot == 0)
2386 return 1;
2387 if (!path->nodes[1])
2388 return 1;
2389
2390 right_nritems = btrfs_header_nritems(right);
2391 if (right_nritems == 0)
2392 return 1;
2394
2395 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2396
2397 left = read_node_slot(root, path->nodes[1], slot - 1);
2398 btrfs_tree_lock(left);
2399 free_space = btrfs_leaf_free_space(root, left);
2400 if (free_space < data_size + sizeof(struct btrfs_item)) {
2401 ret = 1;
2402 goto out;
2403 }
2404
2405 /* cow and double check */
2406 ret = btrfs_cow_block(trans, root, left,
2407 path->nodes[1], slot - 1, &left, 0);
2408 if (ret) {
2409 /* we hit -ENOSPC, but it isn't fatal here */
2410 ret = 1;
2411 goto out;
2412 }
2413
2414 free_space = btrfs_leaf_free_space(root, left);
2415 if (free_space < data_size + sizeof(struct btrfs_item)) {
2416 ret = 1;
2417 goto out;
2418 }
2419
2420 if (empty)
2421 nr = right_nritems;
2422 else
2423 nr = right_nritems - 1;
2424
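	/* walk forward from the front of right, totalling up items that will fit in left */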
2425 for (i = 0; i < nr; i++) {
2426 item = btrfs_item_nr(right, i);
2427 if (!right->map_token) {
2428 map_extent_buffer(right, (unsigned long)item,
2429 sizeof(struct btrfs_item),
2430 &right->map_token, &right->kaddr,
2431 &right->map_start, &right->map_len,
2432 KM_USER1);
2433 }
2434
2435 if (!empty && push_items > 0) {
2436 if (path->slots[0] < i)
2437 break;
2438 if (path->slots[0] == i) {
2439 int space = btrfs_leaf_free_space(root, right);
2440 if (space + push_space * 2 > free_space)
2441 break;
2442 }
2443 }
2444
2445 if (path->slots[0] == i)
2446 push_space += data_size + sizeof(*item);
2447
2448 this_item_size = btrfs_item_size(right, item);
2449 if (this_item_size + sizeof(*item) + push_space > free_space)
2450 break;
2451
2452 push_items++;
2453 push_space += this_item_size + sizeof(*item);
2454 }
2455
2456 if (right->map_token) {
2457 unmap_extent_buffer(right, right->map_token, KM_USER1);
2458 right->map_token = NULL;
2459 }
2460
2461 if (push_items == 0) {
2462 ret = 1;
2463 goto out;
2464 }
2465 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2467
2468 /* push data from right to left */
2469 copy_extent_buffer(left, right,
2470 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2471 btrfs_item_nr_offset(0),
2472 push_items * sizeof(struct btrfs_item));
2473
2474 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2475 btrfs_item_offset_nr(right, push_items - 1);
2476
2477 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2478 leaf_data_end(root, left) - push_space,
2479 btrfs_leaf_data(right) +
2480 btrfs_item_offset_nr(right, push_items - 1),
2481 push_space);
2482 old_left_nritems = btrfs_header_nritems(left);
2483 BUG_ON(old_left_nritems <= 0);
2484
2485 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2486 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2487 u32 ioff;
2488
2489 item = btrfs_item_nr(left, i);
2490 if (!left->map_token) {
2491 map_extent_buffer(left, (unsigned long)item,
2492 sizeof(struct btrfs_item),
2493 &left->map_token, &left->kaddr,
2494 &left->map_start, &left->map_len,
2495 KM_USER1);
2496 }
2497
2498 ioff = btrfs_item_offset(left, item);
2499 btrfs_set_item_offset(left, item,
2500 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2501 }
2502 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2503 if (left->map_token) {
2504 unmap_extent_buffer(left, left->map_token, KM_USER1);
2505 left->map_token = NULL;
2506 }
2507
2508 /* fixup right node */
2509 if (push_items > right_nritems) {
2510 printk(KERN_ERR "push items %d nr %u\n", push_items, right_nritems);
2511 WARN_ON(1);
2512 }
2513
2514 if (push_items < right_nritems) {
2515 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2516 leaf_data_end(root, right);
2517 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2518 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2519 btrfs_leaf_data(right) +
2520 leaf_data_end(root, right), push_space);
2521
2522 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2523 btrfs_item_nr_offset(push_items),
2524 (btrfs_header_nritems(right) - push_items) *
2525 sizeof(struct btrfs_item));
2526 }
2527 right_nritems -= push_items;
2528 btrfs_set_header_nritems(right, right_nritems);
2529 push_space = BTRFS_LEAF_DATA_SIZE(root);
2530 for (i = 0; i < right_nritems; i++) {
2531 item = btrfs_item_nr(right, i);
2532
2533 if (!right->map_token) {
2534 map_extent_buffer(right, (unsigned long)item,
2535 sizeof(struct btrfs_item),
2536 &right->map_token, &right->kaddr,
2537 &right->map_start, &right->map_len,
2538 KM_USER1);
2539 }
2540
2541 push_space = push_space - btrfs_item_size(right, item);
2542 btrfs_set_item_offset(right, item, push_space);
2543 }
2544 if (right->map_token) {
2545 unmap_extent_buffer(right, right->map_token, KM_USER1);
2546 right->map_token = NULL;
2547 }
2548
2549 btrfs_mark_buffer_dirty(left);
2550 if (right_nritems)
2551 btrfs_mark_buffer_dirty(right);
2552
2553 ret = btrfs_update_ref(trans, root, right, left,
2554 old_left_nritems, push_items);
2555 BUG_ON(ret);
2556
2557 btrfs_item_key(right, &disk_key, 0);
2558 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2559 if (wret)
2560 ret = wret;
2561
2562 /* then fixup the leaf pointer in the path */
2563 if (path->slots[0] < push_items) {
2564 path->slots[0] += old_left_nritems;
2565 if (btrfs_header_nritems(path->nodes[0]) == 0)
2566 clean_tree_block(trans, root, path->nodes[0]);
2567 btrfs_tree_unlock(path->nodes[0]);
2568 free_extent_buffer(path->nodes[0]);
2569 path->nodes[0] = left;
2570 path->slots[1] -= 1;
2571 } else {
2572 btrfs_tree_unlock(left);
2573 free_extent_buffer(left);
2574 path->slots[0] -= push_items;
2575 }
2576 BUG_ON(path->slots[0] < 0);
2577 return ret;
2578out:
2579 btrfs_tree_unlock(left);
2580 free_extent_buffer(left);
2581 return ret;
2582}
2583
2584/*
2585 * split the path's leaf in two, making sure there is at least data_size
2586 * available for the resulting leaf level of the path.
2587 *
2588 * returns 0 if all went well and < 0 on failure.
2589 */
2590static noinline int split_leaf(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root,
2592 struct btrfs_key *ins_key,
2593 struct btrfs_path *path, int data_size,
2594 int extend)
2595{
2596 struct extent_buffer *l;
2597 u32 nritems;
2598 int mid;
2599 int slot;
2600 struct extent_buffer *right;
2601 int space_needed = data_size + sizeof(struct btrfs_item);
2602 int data_copy_size;
2603 int rt_data_off;
2604 int i;
2605 int ret = 0;
2606 int wret;
2607 int double_split;
2608 int num_doubles = 0;
2609 struct btrfs_disk_key disk_key;
2610
2611 if (extend)
2612 space_needed = data_size;
2613
2614 /* first try to make some room by pushing left and right */
2615 if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2616 wret = push_leaf_right(trans, root, path, data_size, 0);
2617 if (wret < 0)
2618 return wret;
2620 if (wret) {
2621 wret = push_leaf_left(trans, root, path, data_size, 0);
2622 if (wret < 0)
2623 return wret;
2624 }
2625 l = path->nodes[0];
2626
2627 /* did the pushes work? */
2628 if (btrfs_leaf_free_space(root, l) >= space_needed)
2629 return 0;
2630 }
2631
2632 if (!path->nodes[1]) {
2633 ret = insert_new_root(trans, root, path, 1);
2634 if (ret)
2635 return ret;
2636 }
2637again:
2638 double_split = 0;
2639 l = path->nodes[0];
2640 slot = path->slots[0];
2641 nritems = btrfs_header_nritems(l);
2642 mid = (nritems + 1) / 2;
2643
2644 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2645 path->nodes[1]->start,
2646 root->root_key.objectid,
2647 trans->transid, 0, l->start, 0);
2648 if (IS_ERR(right)) {
2649 BUG_ON(1);
2650 return PTR_ERR(right);
2651 }
2652
2653 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2654 btrfs_set_header_bytenr(right, right->start);
2655 btrfs_set_header_generation(right, trans->transid);
2656 btrfs_set_header_owner(right, root->root_key.objectid);
2657 btrfs_set_header_level(right, 0);
2658 write_extent_buffer(right, root->fs_info->fsid,
2659 (unsigned long)btrfs_header_fsid(right),
2660 BTRFS_FSID_SIZE);
2661
2662 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2663 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2664 BTRFS_UUID_SIZE);
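	/*
	 * figure out which half the target slot lands in.  If the half
	 * that would receive the new item still can't hold it, shift the
	 * split point to the slot itself and possibly split a second time.
	 */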
2665 if (mid <= slot) {
2666 if (nritems == 1 ||
2667 leaf_space_used(l, mid, nritems - mid) + space_needed >
2668 BTRFS_LEAF_DATA_SIZE(root)) {
2669 if (slot >= nritems) {
2670 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2671 btrfs_set_header_nritems(right, 0);
2672 wret = insert_ptr(trans, root, path,
2673 &disk_key, right->start,
2674 path->slots[1] + 1, 1);
2675 if (wret)
2676 ret = wret;
2677
2678 btrfs_tree_unlock(path->nodes[0]);
2679 free_extent_buffer(path->nodes[0]);
2680 path->nodes[0] = right;
2681 path->slots[0] = 0;
2682 path->slots[1] += 1;
2683 btrfs_mark_buffer_dirty(right);
2684 return ret;
2685 }
2686 mid = slot;
2687 if (mid != nritems &&
2688 leaf_space_used(l, mid, nritems - mid) +
2689 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2690 double_split = 1;
2691 }
2692 }
2693 } else {
2694 if (leaf_space_used(l, 0, mid + 1) + space_needed >
2695 BTRFS_LEAF_DATA_SIZE(root)) {
2696 if (!extend && slot == 0) {
2697 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2698 btrfs_set_header_nritems(right, 0);
2699 wret = insert_ptr(trans, root, path,
2700 &disk_key,
2701 right->start,
2702 path->slots[1], 1);
2703 if (wret)
2704 ret = wret;
2705 btrfs_tree_unlock(path->nodes[0]);
2706 free_extent_buffer(path->nodes[0]);
2707 path->nodes[0] = right;
2708 path->slots[0] = 0;
2709 if (path->slots[1] == 0) {
2710 wret = fixup_low_keys(trans, root,
2711 path, &disk_key, 1);
2712 if (wret)
2713 ret = wret;
2714 }
2715 btrfs_mark_buffer_dirty(right);
2716 return ret;
2717 } else if (extend && slot == 0) {
2718 mid = 1;
2719 } else {
2720 mid = slot;
2721 if (mid != nritems &&
2722 leaf_space_used(l, mid, nritems - mid) +
2723 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2724 double_split = 1;
2725 }
2726 }
2727 }
2728 }
2729 nritems = nritems - mid;
2730 btrfs_set_header_nritems(right, nritems);
2731 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2732
2733 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2734 btrfs_item_nr_offset(mid),
2735 nritems * sizeof(struct btrfs_item));
2736
2737 copy_extent_buffer(right, l,
2738 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2739 data_copy_size, btrfs_leaf_data(l) +
2740 leaf_data_end(root, l), data_copy_size);
2741
2742 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2743 btrfs_item_end_nr(l, mid);
2744
2745 for (i = 0; i < nritems; i++) {
2746 struct btrfs_item *item = btrfs_item_nr(right, i);
2747 u32 ioff;
2748
2749 if (!right->map_token) {
2750 map_extent_buffer(right, (unsigned long)item,
2751 sizeof(struct btrfs_item),
2752 &right->map_token, &right->kaddr,
2753 &right->map_start, &right->map_len,
2754 KM_USER1);
2755 }
2756
2757 ioff = btrfs_item_offset(right, item);
2758 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2759 }
2760
2761 if (right->map_token) {
2762 unmap_extent_buffer(right, right->map_token, KM_USER1);
2763 right->map_token = NULL;
2764 }
2765
2766 btrfs_set_header_nritems(l, mid);
2767 ret = 0;
2768 btrfs_item_key(right, &disk_key, 0);
2769 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2770 path->slots[1] + 1, 1);
2771 if (wret)
2772 ret = wret;
2773
2774 btrfs_mark_buffer_dirty(right);
2775 btrfs_mark_buffer_dirty(l);
2776 BUG_ON(path->slots[0] != slot);
2777
2778 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2779 BUG_ON(ret);
2780
2781 if (mid <= slot) {
2782 btrfs_tree_unlock(path->nodes[0]);
2783 free_extent_buffer(path->nodes[0]);
2784 path->nodes[0] = right;
2785 path->slots[0] -= mid;
2786 path->slots[1] += 1;
2787 } else {
2788 btrfs_tree_unlock(right);
2789 free_extent_buffer(right);
2790 }
2791
2792 BUG_ON(path->slots[0] < 0);
2793
2794 if (double_split) {
2795 BUG_ON(num_doubles != 0);
2796 num_doubles++;
2797 goto again;
2798 }
2799 return ret;
2800}
2801
2802/*
2803 * make the item pointed to by the path smaller. new_size indicates
2804 * how small to make it, and from_end tells us if we just chop bytes
2805 * off the end of the item or if we shift the item to chop bytes off
2806 * the front.
2807 */
2808int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2809 struct btrfs_root *root,
2810 struct btrfs_path *path,
2811 u32 new_size, int from_end)
2812{
2813 int ret = 0;
2814 int slot;
2815 int slot_orig;
2816 struct extent_buffer *leaf;
2817 struct btrfs_item *item;
2818 u32 nritems;
2819 unsigned int data_end;
2820 unsigned int old_data_start;
2821 unsigned int old_size;
2822 unsigned int size_diff;
2823 int i;
2824
2825 slot_orig = path->slots[0];
2826 leaf = path->nodes[0];
2827 slot = path->slots[0];
2828
2829 old_size = btrfs_item_size_nr(leaf, slot);
2830 if (old_size == new_size)
2831 return 0;
2832
2833 nritems = btrfs_header_nritems(leaf);
2834 data_end = leaf_data_end(root, leaf);
2835
2836 old_data_start = btrfs_item_offset_nr(leaf, slot);
2837
2838 size_diff = old_size - new_size;
2839
2840 BUG_ON(slot < 0);
2841 BUG_ON(slot >= nritems);
2842
2843 /*
2844 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2845 */
2846 /* first correct the data pointers */
2847 for (i = slot; i < nritems; i++) {
2848 u32 ioff;
2849 item = btrfs_item_nr(leaf, i);
2850
2851 if (!leaf->map_token) {
2852 map_extent_buffer(leaf, (unsigned long)item,
2853 sizeof(struct btrfs_item),
2854 &leaf->map_token, &leaf->kaddr,
2855 &leaf->map_start, &leaf->map_len,
2856 KM_USER1);
2857 }
2858
2859 ioff = btrfs_item_offset(leaf, item);
2860 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2861 }
2862
2863 if (leaf->map_token) {
2864 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2865 leaf->map_token = NULL;
2866 }
2867
2868 /* shift the data */
2869 if (from_end) {
2870 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2871 data_end + size_diff, btrfs_leaf_data(leaf) +
2872 data_end, old_data_start + new_size - data_end);
2873 } else {
2874 struct btrfs_disk_key disk_key;
2875 u64 offset;
2876
2877 btrfs_item_key(leaf, &disk_key, slot);
2878
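		/*
		 * an inline extent keeps its header at the front of the
		 * item, so the header has to move along with the data
		 * when bytes are chopped off the front
		 */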
2879 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2880 unsigned long ptr;
2881 struct btrfs_file_extent_item *fi;
2882
2883 fi = btrfs_item_ptr(leaf, slot,
2884 struct btrfs_file_extent_item);
2885 fi = (struct btrfs_file_extent_item *)(
2886 (unsigned long)fi - size_diff);
2887
2888 if (btrfs_file_extent_type(leaf, fi) ==
2889 BTRFS_FILE_EXTENT_INLINE) {
2890 ptr = btrfs_item_ptr_offset(leaf, slot);
2891 memmove_extent_buffer(leaf, ptr,
2892 (unsigned long)fi,
2893 offsetof(struct btrfs_file_extent_item,
2894 disk_bytenr));
2895 }
2896 }
2897
2898 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2899 data_end + size_diff, btrfs_leaf_data(leaf) +
2900 data_end, old_data_start - data_end);
2901
2902 offset = btrfs_disk_key_offset(&disk_key);
2903 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2904 btrfs_set_item_key(leaf, &disk_key, slot);
2905 if (slot == 0)
2906 fixup_low_keys(trans, root, path, &disk_key, 1);
2907 }
2908
2909 item = btrfs_item_nr(leaf, slot);
2910 btrfs_set_item_size(leaf, item, new_size);
2911 btrfs_mark_buffer_dirty(leaf);
2912
2913 ret = 0;
2914 if (btrfs_leaf_free_space(root, leaf) < 0) {
2915 btrfs_print_leaf(root, leaf);
2916 BUG();
2917 }
2918 return ret;
2919}
2920
2921/*
2922 * make the item pointed to by the path bigger, data_size is the new size.
2923 */
2924int btrfs_extend_item(struct btrfs_trans_handle *trans,
2925 struct btrfs_root *root, struct btrfs_path *path,
2926 u32 data_size)
2927{
2928 int ret = 0;
2929 int slot;
2930 int slot_orig;
2931 struct extent_buffer *leaf;
2932 struct btrfs_item *item;
2933 u32 nritems;
2934 unsigned int data_end;
2935 unsigned int old_data;
2936 unsigned int old_size;
2937 int i;
2938
2939 slot_orig = path->slots[0];
2940 leaf = path->nodes[0];
2941
2942 nritems = btrfs_header_nritems(leaf);
2943 data_end = leaf_data_end(root, leaf);
2944
2945 if (btrfs_leaf_free_space(root, leaf) < data_size) {
2946 btrfs_print_leaf(root, leaf);
2947 BUG();
2948 }
2949 slot = path->slots[0];
2950 old_data = btrfs_item_end_nr(leaf, slot);
2951
2952 BUG_ON(slot < 0);
2953 if (slot >= nritems) {
2954 btrfs_print_leaf(root, leaf);
2955 printk(KERN_ERR "slot %d too large, nritems %d\n", slot, nritems);
2956 BUG_ON(1);
2957 }
2958
2959 /*
2960 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2961 */
2962 /* first correct the data pointers */
2963 for (i = slot; i < nritems; i++) {
2964 u32 ioff;
2965 item = btrfs_item_nr(leaf, i);
2966
2967 if (!leaf->map_token) {
2968 map_extent_buffer(leaf, (unsigned long)item,
2969 sizeof(struct btrfs_item),
2970 &leaf->map_token, &leaf->kaddr,
2971 &leaf->map_start, &leaf->map_len,
2972 KM_USER1);
2973 }
2974 ioff = btrfs_item_offset(leaf, item);
2975 btrfs_set_item_offset(leaf, item, ioff - data_size);
2976 }
2977
2978 if (leaf->map_token) {
2979 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2980 leaf->map_token = NULL;
2981 }
2982
2983 /* shift the data */
2984 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2985 data_end - data_size, btrfs_leaf_data(leaf) +
2986 data_end, old_data - data_end);
2987
2988 data_end = old_data;
2989 old_size = btrfs_item_size_nr(leaf, slot);
2990 item = btrfs_item_nr(leaf, slot);
2991 btrfs_set_item_size(leaf, item, old_size + data_size);
2992 btrfs_mark_buffer_dirty(leaf);
2993
2994 ret = 0;
2995 if (btrfs_leaf_free_space(root, leaf) < 0) {
2996 btrfs_print_leaf(root, leaf);
2997 BUG();
2998 }
2999 return ret;
3000}
3001
3002/*
3003 * Given a key and some data, insert items into the tree.
3004 * This does all the path init required, making room in the tree if needed.
3005 */
3006int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3007 struct btrfs_root *root,
3008 struct btrfs_path *path,
3009 struct btrfs_key *cpu_key, u32 *data_size,
3010 int nr)
3011{
3012 struct extent_buffer *leaf;
3013 struct btrfs_item *item;
3014 int ret = 0;
3015 int slot;
3016 int slot_orig;
3017 int i;
3018 u32 nritems;
3019 u32 total_size = 0;
3020 u32 total_data = 0;
3021 unsigned int data_end;
3022 struct btrfs_disk_key disk_key;
3023
3024 for (i = 0; i < nr; i++) {
3025 total_data += data_size[i];
3026 }
3027
3028 total_size = total_data + (nr * sizeof(struct btrfs_item));
3029 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3030 if (ret == 0)
3031 return -EEXIST;
3032 if (ret < 0)
3033 goto out;
3034
3035 slot_orig = path->slots[0];
3036 leaf = path->nodes[0];
3037
3038 nritems = btrfs_header_nritems(leaf);
3039 data_end = leaf_data_end(root, leaf);
3040
3041 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3042 btrfs_print_leaf(root, leaf);
3043 printk(KERN_ERR "not enough free space, need %u have %d\n",
3044 total_size, btrfs_leaf_free_space(root, leaf));
3045 BUG();
3046 }
3047
3048 slot = path->slots[0];
3049 BUG_ON(slot < 0);
3050
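	/* inserting in the middle of the leaf: shift items and data over to make room */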
3051 if (slot != nritems) {
3052 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3053
3054 if (old_data < data_end) {
3055 btrfs_print_leaf(root, leaf);
3056 printk(KERN_ERR "slot %d old_data %d data_end %d\n",
3057 slot, old_data, data_end);
3058 BUG_ON(1);
3059 }
3060 /*
3061 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3062 */
3063 /* first correct the data pointers */
3064 WARN_ON(leaf->map_token);
3065 for (i = slot; i < nritems; i++) {
3066 u32 ioff;
3067
3068 item = btrfs_item_nr(leaf, i);
3069 if (!leaf->map_token) {
3070 map_extent_buffer(leaf, (unsigned long)item,
3071 sizeof(struct btrfs_item),
3072 &leaf->map_token, &leaf->kaddr,
3073 &leaf->map_start, &leaf->map_len,
3074 KM_USER1);
3075 }
3076
3077 ioff = btrfs_item_offset(leaf, item);
3078 btrfs_set_item_offset(leaf, item, ioff - total_data);
3079 }
3080 if (leaf->map_token) {
3081 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3082 leaf->map_token = NULL;
3083 }
3084
3085 /* shift the items */
3086 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3087 btrfs_item_nr_offset(slot),
3088 (nritems - slot) * sizeof(struct btrfs_item));
3089
3090 /* shift the data */
3091 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3092 data_end - total_data, btrfs_leaf_data(leaf) +
3093 data_end, old_data - data_end);
3094 data_end = old_data;
3095 }
3096
3097 /* setup the item for the new data */
3098 for (i = 0; i < nr; i++) {
3099 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3100 btrfs_set_item_key(leaf, &disk_key, slot + i);
3101 item = btrfs_item_nr(leaf, slot + i);
3102 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3103 data_end -= data_size[i];
3104 btrfs_set_item_size(leaf, item, data_size[i]);
3105 }
3106 btrfs_set_header_nritems(leaf, nritems + nr);
3107 btrfs_mark_buffer_dirty(leaf);
3108
3109 ret = 0;
3110 if (slot == 0) {
3111 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3112 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3113 }
3114
3115 if (btrfs_leaf_free_space(root, leaf) < 0) {
3116 btrfs_print_leaf(root, leaf);
3117 BUG();
3118 }
3119out:
3120 return ret;
3121}
3122
3123/*
3124 * Given a key and some data, insert an item into the tree.
3125 * This does all the path init required, making room in the tree if needed.
3126 */
3127int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3128 *root, struct btrfs_key *cpu_key, void *data, u32
3129 data_size)
3130{
3131 int ret = 0;
3132 struct btrfs_path *path;
3133 struct extent_buffer *leaf;
3134 unsigned long ptr;
3135
3136 path = btrfs_alloc_path();
3137 BUG_ON(!path);
3138 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3139 if (!ret) {
3140 leaf = path->nodes[0];
3141 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3142 write_extent_buffer(leaf, data, ptr, data_size);
3143 btrfs_mark_buffer_dirty(leaf);
3144 }
3145 btrfs_free_path(path);
3146 return ret;
3147}
3148
3149/*
3150 * delete the pointer from a given node.
3151 *
3152 * the tree should have been previously balanced so the deletion does not
3153 * empty a node.
3154 */
3155static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3156 struct btrfs_path *path, int level, int slot)
3157{
3158 struct extent_buffer *parent = path->nodes[level];
3159 u32 nritems;
3160 int ret = 0;
3161 int wret;
3162
3163 nritems = btrfs_header_nritems(parent);
3164 if (slot != nritems - 1) {
3165 memmove_extent_buffer(parent,
3166 btrfs_node_key_ptr_offset(slot),
3167 btrfs_node_key_ptr_offset(slot + 1),
3168 sizeof(struct btrfs_key_ptr) *
3169 (nritems - slot - 1));
3170 }
3171 nritems--;
3172 btrfs_set_header_nritems(parent, nritems);
3173 if (nritems == 0 && parent == root->node) {
3174 BUG_ON(btrfs_header_level(root->node) != 1);
3175 /* just turn the root into a leaf and break */
3176 btrfs_set_header_level(root->node, 0);
3177 } else if (slot == 0) {
3178 struct btrfs_disk_key disk_key;
3179
3180 btrfs_node_key(parent, &disk_key, 0);
3181 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3182 if (wret)
3183 ret = wret;
3184 }
3185 btrfs_mark_buffer_dirty(parent);
3186 return ret;
3187}
3188
3189/*
3190 * a helper function to delete the leaf pointed to by path->slots[1] and
3191 * path->nodes[1]. bytenr is the leaf's block pointer, but since the callers
3192 * already know it, it is faster to have them pass it down than to
3193 * read it out of the node again.
3194 *
3195 * This deletes the pointer in path->nodes[1] and frees the leaf
3196 * block extent. zero is returned if it all worked out, < 0 otherwise.
3197 *
3198 * The path must have already been setup for deleting the leaf, including
3199 * all the proper balancing. path->nodes[1] must be locked.
3200 */
3201noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *root,
3203 struct btrfs_path *path, u64 bytenr)
3204{
3205 int ret;
3206 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3207
3208 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3209 if (ret)
3210 return ret;
3211
3212 ret = btrfs_free_extent(trans, root, bytenr,
3213 btrfs_level_size(root, 0),
3214 path->nodes[1]->start,
3215 btrfs_header_owner(path->nodes[1]),
3216 root_gen, 0, 1);
3217 return ret;
3218}

3219/*
3220 * delete the item at the leaf level in path. If that empties
3221 * the leaf, remove it from the tree
3222 */
3223int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3224 struct btrfs_path *path, int slot, int nr)
3225{
3226 struct extent_buffer *leaf;
3227 struct btrfs_item *item;
3228 int last_off;
3229 int dsize = 0;
3230 int ret = 0;
3231 int wret;
3232 int i;
3233 u32 nritems;
3234
3235 leaf = path->nodes[0];
3236 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3237
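	/* total up the data bytes used by the items being deleted */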
3238 for (i = 0; i < nr; i++)
3239 dsize += btrfs_item_size_nr(leaf, slot + i);
3240
3241 nritems = btrfs_header_nritems(leaf);
3242
3243 if (slot + nr != nritems) {
3244 int data_end = leaf_data_end(root, leaf);
3245
3246 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3247 data_end + dsize,
3248 btrfs_leaf_data(leaf) + data_end,
3249 last_off - data_end);
3250
3251 for (i = slot + nr; i < nritems; i++) {
3252 u32 ioff;
3253
3254 item = btrfs_item_nr(leaf, i);
3255 if (!leaf->map_token) {
3256 map_extent_buffer(leaf, (unsigned long)item,
3257 sizeof(struct btrfs_item),
3258 &leaf->map_token, &leaf->kaddr,
3259 &leaf->map_start, &leaf->map_len,
3260 KM_USER1);
3261 }
3262 ioff = btrfs_item_offset(leaf, item);
3263 btrfs_set_item_offset(leaf, item, ioff + dsize);
3264 }
3265
3266 if (leaf->map_token) {
3267 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3268 leaf->map_token = NULL;
3269 }
3270
3271 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3272 btrfs_item_nr_offset(slot + nr),
3273 sizeof(struct btrfs_item) *
3274 (nritems - slot - nr));
3275 }
3276 btrfs_set_header_nritems(leaf, nritems - nr);
3277 nritems -= nr;
3278
3279 /* delete the leaf if we've emptied it */
3280 if (nritems == 0) {
3281 if (leaf == root->node) {
3282 btrfs_set_header_level(leaf, 0);
3283 } else {
3284 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3285 BUG_ON(ret);
3286 }
3287 } else {
3288 int used = leaf_space_used(leaf, 0, nritems);
3289 if (slot == 0) {
3290 struct btrfs_disk_key disk_key;
3291
3292 btrfs_item_key(leaf, &disk_key, 0);
3293 wret = fixup_low_keys(trans, root, path,
3294 &disk_key, 1);
3295 if (wret)
3296 ret = wret;
3297 }
3298
3299 /* delete the leaf if it is mostly empty */
3300 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3301 /* push_leaf_left fixes the path.
3302 * make sure the path still points to our leaf
3303 * for possible call to del_ptr below
3304 */
3305 slot = path->slots[1];
3306 extent_buffer_get(leaf);
3307
3308 wret = push_leaf_left(trans, root, path, 1, 1);
3309 if (wret < 0 && wret != -ENOSPC)
3310 ret = wret;
3311
3312 if (path->nodes[0] == leaf &&
3313 btrfs_header_nritems(leaf)) {
3314 wret = push_leaf_right(trans, root, path, 1, 1);
3315 if (wret < 0 && wret != -ENOSPC)
3316 ret = wret;
3317 }
3318
3319 if (btrfs_header_nritems(leaf) == 0) {
3320 path->slots[1] = slot;
3321 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3322 BUG_ON(ret);
3323 free_extent_buffer(leaf);
3324 } else {
3325 /* if we're still in the path, make sure
3326 * we're dirty. Otherwise, one of the
3327 * push_leaf functions must have already
3328 * dirtied this buffer
3329 */
3330 if (path->nodes[0] == leaf)
3331 btrfs_mark_buffer_dirty(leaf);
3332 free_extent_buffer(leaf);
3333 }
3334 } else {
3335 btrfs_mark_buffer_dirty(leaf);
3336 }
3337 }
3338 return ret;
3339}
3340
3341/*
3342 * search the tree again to find a leaf with lesser keys
3343 * returns 0 if it found something or 1 if there are no lesser leaves.
3344 * returns < 0 on io errors.
3345 *
3346 * This may release the path, and so you may lose any locks held at the
3347 * time you call it.
3348 */
3349int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3350{
3351 struct btrfs_key key;
3352 struct btrfs_disk_key found_key;
3353 int ret;
3354
3355 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3356
3357 if (key.offset > 0)
3358 key.offset--;
3359 else if (key.type > 0)
3360 key.type--;
3361 else if (key.objectid > 0)
3362 key.objectid--;
3363 else
3364 return 1;
3365
3366 btrfs_release_path(root, path);
3367 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3368 if (ret < 0)
3369 return ret;
3370 btrfs_item_key(path->nodes[0], &found_key, 0);
3371 ret = comp_keys(&found_key, &key);
3372 if (ret < 0)
3373 return 0;
3374 return 1;
3375}
3376
3377/*
3378 * A helper function to walk down the tree starting at min_key, and looking
3379 * for nodes or leaves that are either in cache or have a minimum
3380 * transaction id. This is used by the btree defrag code and by tree logging.
3381 *
3382 * This does not cow, but it does stuff the starting key it finds back
3383 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3384 * key and get a writable path.
3385 *
3386 * This does lock as it descends, and path->keep_locks should be set
3387 * to 1 by the caller.
3388 *
3389 * This honors path->lowest_level to prevent descent past a given level
3390 * of the tree.
3391 *
3392 * min_trans indicates the oldest transaction that you are interested
3393 * in walking through. Any nodes or leaves older than min_trans are
3394 * skipped over (without reading them).
3395 *
3396 * returns zero if something useful was found, < 0 on error and 1 if there
3397 * was nothing in the tree that matched the search criteria.
3398 */
3399int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3400 struct btrfs_key *max_key,
3401 struct btrfs_path *path, int cache_only,
3402 u64 min_trans)
3403{
3404 struct extent_buffer *cur;
3405 struct btrfs_key found_key;
3406 int slot;
3407 int sret;
3408 u32 nritems;
3409 int level;
3410 int ret = 1;
3411
3412again:
3413 cur = btrfs_lock_root_node(root);
3414 level = btrfs_header_level(cur);
3415 WARN_ON(path->nodes[level]);
3416 path->nodes[level] = cur;
3417 path->locks[level] = 1;
3418
3419 if (btrfs_header_generation(cur) < min_trans) {
3420 ret = 1;
3421 goto out;
3422 }
3423 while (1) {
3424 nritems = btrfs_header_nritems(cur);
3425 level = btrfs_header_level(cur);
3426 sret = bin_search(cur, min_key, level, &slot);
3427
3428 /* at the lowest level, we're done, setup the path and exit */
3429 if (level == path->lowest_level) {
3430 if (slot >= nritems)
3431 goto find_next_key;
3432 ret = 0;
3433 path->slots[level] = slot;
3434 btrfs_item_key_to_cpu(cur, &found_key, slot);
3435 goto out;
3436 }
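		/*
		 * bin_search points at the first key greater than min_key,
		 * back up to the pointer that covers min_key
		 */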
3437 if (sret && slot > 0)
3438 slot--;
3439 /*
3440 * check this node pointer against the cache_only and
3441 * min_trans parameters. If it isn't in cache or is too
3442 * old, skip to the next one.
3443 */
3444 while (slot < nritems) {
3445 u64 blockptr;
3446 u64 gen;
3447 struct extent_buffer *tmp;
3448 struct btrfs_disk_key disk_key;
3449
3450 blockptr = btrfs_node_blockptr(cur, slot);
3451 gen = btrfs_node_ptr_generation(cur, slot);
3452 if (gen < min_trans) {
3453 slot++;
3454 continue;
3455 }
3456 if (!cache_only)
3457 break;
3458
3459 if (max_key) {
3460 btrfs_node_key(cur, &disk_key, slot);
3461 if (comp_keys(&disk_key, max_key) >= 0) {
3462 ret = 1;
3463 goto out;
3464 }
3465 }
3466
3467 tmp = btrfs_find_tree_block(root, blockptr,
3468 btrfs_level_size(root, level - 1));
3469
3470 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3471 free_extent_buffer(tmp);
3472 break;
3473 }
3474 if (tmp)
3475 free_extent_buffer(tmp);
3476 slot++;
3477 }
3478find_next_key:
3479 /*
3480 * we didn't find a candidate key in this node, walk forward
3481 * and find another one
3482 */
3483 if (slot >= nritems) {
3484 path->slots[level] = slot;
3485 sret = btrfs_find_next_key(root, path, min_key, level,
3486 cache_only, min_trans);
3487 if (sret == 0) {
3488 btrfs_release_path(root, path);
3489 goto again;
3490 } else {
3491 goto out;
3492 }
3493 }
3494 /* save our key for returning back */
3495 btrfs_node_key_to_cpu(cur, &found_key, slot);
3496 path->slots[level] = slot;
3497 if (level == path->lowest_level) {
3498 ret = 0;
3499 unlock_up(path, level, 1);
3500 goto out;
3501 }
3502 cur = read_node_slot(root, cur, slot);
3503
3504 btrfs_tree_lock(cur);
3505 path->locks[level - 1] = 1;
3506 path->nodes[level - 1] = cur;
3507 unlock_up(path, level, 1);
3508 }
3509out:
3510 if (ret == 0)
3511 memcpy(min_key, &found_key, sizeof(found_key));
3512 return ret;
3513}
3514
3515/*
3516 * this is similar to btrfs_next_leaf, but does not try to preserve
3517 * and fixup the path. It looks for and returns the next key in the
3518 * tree based on the current path and the cache_only and min_trans
3519 * parameters.
3520 *
3521 * 0 is returned if another key is found, < 0 if there are any errors
3522 * and 1 is returned if there are no higher keys in the tree
3523 *
3524 * path->keep_locks should be set to 1 on the search made before
3525 * calling this function.
3526 */
3527int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3528 struct btrfs_key *key, int lowest_level,
3529 int cache_only, u64 min_trans)
3530{
3531 int level = lowest_level;
3532 int slot;
3533 struct extent_buffer *c;
3534
3535 while (level < BTRFS_MAX_LEVEL) {
3536 if (!path->nodes[level])
3537 return 1;
3538
3539 slot = path->slots[level] + 1;
3540 c = path->nodes[level];
3541next:
3542 if (slot >= btrfs_header_nritems(c)) {
3543 level++;
3544 if (level == BTRFS_MAX_LEVEL)
3545 return 1;
3547 continue;
3548 }
3549 if (level == 0)
3550 btrfs_item_key_to_cpu(c, key, slot);
3551 else {
3552 u64 blockptr = btrfs_node_blockptr(c, slot);
3553 u64 gen = btrfs_node_ptr_generation(c, slot);
3554
3555 if (cache_only) {
3556 struct extent_buffer *cur;
3557 cur = btrfs_find_tree_block(root, blockptr,
3558 btrfs_level_size(root, level - 1));
3559 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3560 slot++;
3561 if (cur)
3562 free_extent_buffer(cur);
3563 goto next;
3564 }
3565 free_extent_buffer(cur);
3566 }
3567 if (gen < min_trans) {
3568 slot++;
3569 goto next;
3570 }
3571 btrfs_node_key_to_cpu(c, key, slot);
3572 }
3573 return 0;
3574 }
3575 return 1;
3576}
3577
3578/*
3579 * search the tree again to find a leaf with greater keys
3580 * returns 0 if it found something or 1 if there are no greater leaves.
3581 * returns < 0 on io errors.
3582 */
3583int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3584{
3585 int slot;
3586 int level = 1;
3587 struct extent_buffer *c;
3588 struct extent_buffer *next = NULL;
3589 struct btrfs_key key;
3590 u32 nritems;
3591 int ret;
3592
3593 nritems = btrfs_header_nritems(path->nodes[0]);
3594 if (nritems == 0)
3595 return 1;
3597
3598 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3599
3600 btrfs_release_path(root, path);
3601 path->keep_locks = 1;
3602 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3603 path->keep_locks = 0;
3604
3605 if (ret < 0)
3606 return ret;
3607
3608 nritems = btrfs_header_nritems(path->nodes[0]);
3609 /*
3610 * by releasing the path above we dropped all our locks. A balance
3611 * could have added more items next to the key that used to be
3612 * at the very end of the block. So, check again here and
3613 * advance the path if there are now more items available.
3614 */
3615 if (nritems > 0 && path->slots[0] < nritems - 1) {
3616 path->slots[0]++;
3617 goto done;
3618 }
3619
3620 while (level < BTRFS_MAX_LEVEL) {
3621 if (!path->nodes[level])
3622 return 1;
3623
3624 slot = path->slots[level] + 1;
3625 c = path->nodes[level];
3626 if (slot >= btrfs_header_nritems(c)) {
3627 level++;
3628 if (level == BTRFS_MAX_LEVEL)
3629 return 1;
3631 continue;
3632 }
3633
3634 if (next) {
3635 btrfs_tree_unlock(next);
3636 free_extent_buffer(next);
3637 }
3638
3639 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3640 path->reada)
3641 reada_for_search(root, path, level, slot, 0);
3642
3643 next = read_node_slot(root, c, slot);
3644 if (!path->skip_locking) {
3645 WARN_ON(!btrfs_tree_locked(c));
3646 btrfs_tree_lock(next);
3647 }
3648 break;
3649 }
3650 path->slots[level] = slot;
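	/* walk back down the left-most spine of the next subtree to level 0 */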
3651 while (1) {
3652 level--;
3653 c = path->nodes[level];
3654 if (path->locks[level])
3655 btrfs_tree_unlock(c);
3656 free_extent_buffer(c);
3657 path->nodes[level] = next;
3658 path->slots[level] = 0;
3659 if (!path->skip_locking)
3660 path->locks[level] = 1;
3661 if (!level)
3662 break;
3663 if (level == 1 && path->locks[1] && path->reada)
3664 reada_for_search(root, path, level, slot, 0);
3665 next = read_node_slot(root, next, 0);
3666 if (!path->skip_locking) {
3667 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3668 btrfs_tree_lock(next);
3669 }
3670 }
3671done:
3672 unlock_up(path, 0, 1);
3673 return 0;
3674}
3675
3676/*
3677 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3678 * searching until it gets past min_objectid or finds an item of 'type'
3679 *
3680 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3681 */
3682int btrfs_previous_item(struct btrfs_root *root,
3683 struct btrfs_path *path, u64 min_objectid,
3684 int type)
3685{
3686 struct btrfs_key found_key;
3687 struct extent_buffer *leaf;
3688 u32 nritems;
3689 int ret;
3690
3691 while (1) {
3692 if (path->slots[0] == 0) {
3693 ret = btrfs_prev_leaf(root, path);
3694 if (ret != 0)
3695 return ret;
3696 } else {
3697 path->slots[0]--;
3698 }
3699 leaf = path->nodes[0];
3700 nritems = btrfs_header_nritems(leaf);
3701 if (nritems == 0)
3702 return 1;
3703 if (path->slots[0] == nritems)
3704 path->slots[0]--;
3705
3706 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3707 if (found_key.type == type)
3708 return 0;
3709 if (found_key.objectid < min_objectid)
3710 break;
3711 if (found_key.objectid == min_objectid &&
3712 found_key.type < type)
3713 break;
3714 }
3715 return 1;
3716}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..8559f39fd47f
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,1891 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BBRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* orphan objectid for tracking unlinked/truncated files */
77#define BTRFS_ORPHAN_OBJECTID -5ULL
78
79/* does write ahead logging to speed up fsyncs */
80#define BTRFS_TREE_LOG_OBJECTID -6ULL
81#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
82
83/* for space balancing */
84#define BTRFS_TREE_RELOC_OBJECTID -8ULL
85#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
86
87/* dummy objectid represents multiple objectids */
88#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
89
90/*
91 * All files have objectids in this range.
92 */
93#define BTRFS_FIRST_FREE_OBJECTID 256ULL
94#define BTRFS_LAST_FREE_OBJECTID -256ULL
95#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
96
97
98/*
99 * the device items go into the chunk tree. The key is in the form
100 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
101 */
102#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
103
104/*
105 * we can actually store much bigger names, but let's not confuse the rest
106 * of linux
107 */
108#define BTRFS_NAME_LEN 255
109
110/* 32 bytes in various csum fields */
111#define BTRFS_CSUM_SIZE 32
112/* four bytes for CRC32 */
113#define BTRFS_CRC32_SIZE 4
114#define BTRFS_EMPTY_DIR_SIZE 0
115
116#define BTRFS_FT_UNKNOWN 0
117#define BTRFS_FT_REG_FILE 1
118#define BTRFS_FT_DIR 2
119#define BTRFS_FT_CHRDEV 3
120#define BTRFS_FT_BLKDEV 4
121#define BTRFS_FT_FIFO 5
122#define BTRFS_FT_SOCK 6
123#define BTRFS_FT_SYMLINK 7
124#define BTRFS_FT_XATTR 8
125#define BTRFS_FT_MAX 9
126
127/*
128 * the key defines the order in the tree, and so it also defines (optimal)
129 * block layout. objectid corresponds to the inode number. The type
130 * tells us things about the object, and is a kind of stream selector.
131 * So for a given inode, keys with a type of 1 might refer to the inode
132 * data, a type of 2 may point to file data in the btree, and type == 3
133 * may point to extents.
134 *
135 * offset is the starting byte offset for this key in the stream.
136 *
137 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
138 * in cpu native order. Otherwise they are identical and their sizes
139 * should be the same (ie both packed)
140 */
141struct btrfs_disk_key {
142 __le64 objectid;
143 u8 type;
144 __le64 offset;
145} __attribute__ ((__packed__));
146
147struct btrfs_key {
148 u64 objectid;
149 u8 type;
150 u64 offset;
151} __attribute__ ((__packed__));
152
153struct btrfs_mapping_tree {
154 struct extent_map_tree map_tree;
155};
156
157#define BTRFS_UUID_SIZE 16
158struct btrfs_dev_item {
159 /* the internal btrfs device id */
160 __le64 devid;
161
162 /* size of the device */
163 __le64 total_bytes;
164
165 /* bytes used */
166 __le64 bytes_used;
167
168 /* optimal io alignment for this device */
169 __le32 io_align;
170
171 /* optimal io width for this device */
172 __le32 io_width;
173
174 /* minimal io size for this device */
175 __le32 sector_size;
176
177 /* type and info about this device */
178 __le64 type;
179
180 /* grouping information for allocation decisions */
181 __le32 dev_group;
182
183 /* seek speed 0-100 where 100 is fastest */
184 u8 seek_speed;
185
186 /* bandwidth 0-100 where 100 is fastest */
187 u8 bandwidth;
188
189 /* btrfs generated uuid for this device */
190 u8 uuid[BTRFS_UUID_SIZE];
191} __attribute__ ((__packed__));
192
193struct btrfs_stripe {
194 __le64 devid;
195 __le64 offset;
196 u8 dev_uuid[BTRFS_UUID_SIZE];
197} __attribute__ ((__packed__));
198
199struct btrfs_chunk {
200 /* size of this chunk in bytes */
201 __le64 length;
202
203 /* objectid of the root referencing this chunk */
204 __le64 owner;
205
206 __le64 stripe_len;
207 __le64 type;
208
209 /* optimal io alignment for this chunk */
210 __le32 io_align;
211
212 /* optimal io width for this chunk */
213 __le32 io_width;
214
215 /* minimal io size for this chunk */
216 __le32 sector_size;
217
218 /* 2^16 stripes is quite a lot; a second limit is the size of a single
219 * item in the btree
220 */
221 __le16 num_stripes;
222
223 /* sub stripes only matter for raid10 */
224 __le16 sub_stripes;
225 struct btrfs_stripe stripe;
226 /* additional stripes go here */
227} __attribute__ ((__packed__));
228
229static inline unsigned long btrfs_chunk_item_size(int num_stripes)
230{
231 BUG_ON(num_stripes == 0);
232 return sizeof(struct btrfs_chunk) +
233 sizeof(struct btrfs_stripe) * (num_stripes - 1);
234}
235
236#define BTRFS_FSID_SIZE 16
237#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
238
239/*
240 * every tree block (leaf or node) starts with this header.
241 */
242struct btrfs_header {
243 /* these first four must match the super block */
244 u8 csum[BTRFS_CSUM_SIZE];
245 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
246 __le64 bytenr; /* which block this node is supposed to live in */
247 __le64 flags;
248
249 /* allowed to be different from the super from here on down */
250 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
251 __le64 generation;
252 __le64 owner;
253 __le32 nritems;
254 u8 level;
255} __attribute__ ((__packed__));
256
257#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
258 sizeof(struct btrfs_header)) / \
259 sizeof(struct btrfs_key_ptr))
260#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
261#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
262#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
263 sizeof(struct btrfs_item) - \
264 sizeof(struct btrfs_file_extent_item))
265
266
267/*
268 * this is a very generous portion of the super block, giving us
269 * room to translate 14 chunks with 3 stripes each.
270 */
271#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
272#define BTRFS_LABEL_SIZE 256
273
274/*
275 * the super block basically lists the main trees of the FS
276 * it currently lacks any block count, etc.
277 */
278struct btrfs_super_block {
279 u8 csum[BTRFS_CSUM_SIZE];
280 /* the first 4 fields must match struct btrfs_header */
281 u8 fsid[16]; /* FS specific uuid */
282 __le64 bytenr; /* this block number */
283 __le64 flags;
284
285	/* allowed to be different from the btrfs_header from here on down */
286 __le64 magic;
287 __le64 generation;
288 __le64 root;
289 __le64 chunk_root;
290 __le64 log_root;
291 __le64 total_bytes;
292 __le64 bytes_used;
293 __le64 root_dir_objectid;
294 __le64 num_devices;
295 __le32 sectorsize;
296 __le32 nodesize;
297 __le32 leafsize;
298 __le32 stripesize;
299 __le32 sys_chunk_array_size;
300 u8 root_level;
301 u8 chunk_root_level;
302 u8 log_root_level;
303 struct btrfs_dev_item dev_item;
304 char label[BTRFS_LABEL_SIZE];
305 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
306} __attribute__ ((__packed__));
307
308/*
309 * A leaf is full of items. offset and size tell us where to find
310 * the item in the leaf (relative to the start of the data area)
311 */
312struct btrfs_item {
313 struct btrfs_disk_key key;
314 __le32 offset;
315 __le32 size;
316} __attribute__ ((__packed__));
317
318/*
319 * leaves have an item area and a data area:
320 * [item0, item1....itemN] [free space] [dataN...data1, data0]
321 *
322 * The data is separate from the items to get the keys closer together
323 * during searches.
324 */
325struct btrfs_leaf {
326 struct btrfs_header header;
327 struct btrfs_item items[];
328} __attribute__ ((__packed__));
329
330/*
331 * all non-leaf blocks are nodes, they hold only keys and pointers to
332 * other blocks
333 */
334struct btrfs_key_ptr {
335 struct btrfs_disk_key key;
336 __le64 blockptr;
337 __le64 generation;
338} __attribute__ ((__packed__));
339
340struct btrfs_node {
341 struct btrfs_header header;
342 struct btrfs_key_ptr ptrs[];
343} __attribute__ ((__packed__));
344
345/*
346 * btrfs_paths remember the path taken from the root down to the leaf.
347 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
348 * to any other levels that are present.
349 *
350 * The slots array records the index of the item or block pointer
351 * used while walking the tree.
352 */
353struct btrfs_path {
354 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
355 int slots[BTRFS_MAX_LEVEL];
356 /* if there is real range locking, this locks field will change */
357 int locks[BTRFS_MAX_LEVEL];
358 int reada;
359 /* keep some upper locks as we walk down */
360 int keep_locks;
361 int skip_locking;
362 int lowest_level;
363};
364
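The canonical use of a path looks roughly like the sketch below (illustrative only; example_lookup is hypothetical and error handling is trimmed). The prototypes used are declared later in this header.

/* sketch: a read-only lookup. trans == NULL and cow == 0 mean no
 * blocks are COWed while searching.
 */
static int example_lookup(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* exact match: path->nodes[0] is the leaf and
		 * path->slots[0] the item index within it */
	}

	btrfs_free_path(path);	/* releases locks and frees the path */
	return ret;
}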
365/*
366 * items in the extent btree are used to record the objectid of the
367 * owner of the block and the number of references
368 */
369struct btrfs_extent_item {
370 __le32 refs;
371} __attribute__ ((__packed__));
372
373struct btrfs_extent_ref {
374 __le64 root;
375 __le64 generation;
376 __le64 objectid;
377 __le32 num_refs;
378} __attribute__ ((__packed__));
379
380/* dev extents record allocated space on individual devices. The owner
381 * field points back to the chunk allocation mapping tree that allocated
382 * the extent. The chunk tree uuid field is a way to double check the owner
383 */
384struct btrfs_dev_extent {
385 __le64 chunk_tree;
386 __le64 chunk_objectid;
387 __le64 chunk_offset;
388 __le64 length;
389 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
390} __attribute__ ((__packed__));
391
392struct btrfs_inode_ref {
393 __le64 index;
394 __le16 name_len;
395 /* name goes here */
396} __attribute__ ((__packed__));
397
398struct btrfs_timespec {
399 __le64 sec;
400 __le32 nsec;
401} __attribute__ ((__packed__));
402
403/*
404 * there is no padding here on purpose. If you want to extend the inode,
405 * make a new item type
406 */
407struct btrfs_inode_item {
408 /* nfs style generation number */
409 __le64 generation;
410 /* transid that last touched this inode */
411 __le64 transid;
412 __le64 size;
413 __le64 nbytes;
414 __le64 block_group;
415 __le32 nlink;
416 __le32 uid;
417 __le32 gid;
418 __le32 mode;
419 __le64 rdev;
420 __le16 flags;
421 __le16 compat_flags;
422 struct btrfs_timespec atime;
423 struct btrfs_timespec ctime;
424 struct btrfs_timespec mtime;
425 struct btrfs_timespec otime;
426} __attribute__ ((__packed__));
427
428struct btrfs_dir_log_item {
429 __le64 end;
430} __attribute__ ((__packed__));
431
432struct btrfs_dir_item {
433 struct btrfs_disk_key location;
434 __le64 transid;
435 __le16 data_len;
436 __le16 name_len;
437 u8 type;
438} __attribute__ ((__packed__));
439
440struct btrfs_root_item {
441 struct btrfs_inode_item inode;
442 __le64 root_dirid;
443 __le64 bytenr;
444 __le64 byte_limit;
445 __le64 bytes_used;
446 __le32 flags;
447 __le32 refs;
448 struct btrfs_disk_key drop_progress;
449 u8 drop_level;
450 u8 level;
451} __attribute__ ((__packed__));
452
453#define BTRFS_FILE_EXTENT_REG 0
454#define BTRFS_FILE_EXTENT_INLINE 1
455
456struct btrfs_file_extent_item {
457 __le64 generation;
458 u8 type;
459 /*
460 * disk space consumed by the extent, checksum blocks are included
461 * in these numbers
462 */
463 __le64 disk_bytenr;
464 __le64 disk_num_bytes;
465 /*
466 * the logical offset in file blocks (no csums)
467 * this extent record is for. This allows a file extent to point
468 * into the middle of an existing extent on disk, sharing it
469 * between two snapshots (useful if some bytes in the middle of the
470 * extent have changed)
471 */
472 __le64 offset;
473 /*
474 * the logical number of file blocks (no csums included)
475 */
476 __le64 num_bytes;
477} __attribute__ ((__packed__));
478
479struct btrfs_csum_item {
480 u8 csum;
481} __attribute__ ((__packed__));
482
483/* different types of block groups (and chunks) */
484#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
485#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
486#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
487#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
488#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
489#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
490#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
491
492struct btrfs_block_group_item {
493 __le64 used;
494 __le64 chunk_objectid;
495 __le64 flags;
496} __attribute__ ((__packed__));
497
498struct btrfs_space_info {
499 u64 flags;
500 u64 total_bytes;
501 u64 bytes_used;
502 u64 bytes_pinned;
503 u64 bytes_reserved;
504 int full;
505 int force_alloc;
506 struct list_head list;
507
508 /* for block groups in our same type */
509 struct list_head block_groups;
510 spinlock_t lock;
511};
512
513struct btrfs_free_space {
514 struct rb_node bytes_index;
515 struct rb_node offset_index;
516 u64 offset;
517 u64 bytes;
518};
519
520struct btrfs_block_group_cache {
521 struct btrfs_key key;
522 struct btrfs_block_group_item item;
523 spinlock_t lock;
524 u64 pinned;
525 u64 reserved;
526 u64 flags;
527 int cached;
528 int ro;
529 int dirty;
530
531 struct btrfs_space_info *space_info;
532
533 /* free space cache stuff */
534 struct rb_root free_space_bytes;
535 struct rb_root free_space_offset;
536
537 /* block group cache stuff */
538 struct rb_node cache_node;
539
540 /* for block groups in the same raid type */
541 struct list_head list;
542};
543
544struct btrfs_leaf_ref_tree {
545 struct rb_root root;
546 struct list_head list;
547 spinlock_t lock;
548};
549
550struct btrfs_device;
551struct btrfs_fs_devices;
552struct btrfs_fs_info {
553 u8 fsid[BTRFS_FSID_SIZE];
554 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
555 struct btrfs_root *extent_root;
556 struct btrfs_root *tree_root;
557 struct btrfs_root *chunk_root;
558 struct btrfs_root *dev_root;
559
560 /* the log root tree is a directory of all the other log roots */
561 struct btrfs_root *log_root_tree;
562 struct radix_tree_root fs_roots_radix;
563
564 /* block group cache stuff */
565 spinlock_t block_group_cache_lock;
566 struct rb_root block_group_cache_tree;
567
568 struct extent_io_tree pinned_extents;
569 struct extent_io_tree pending_del;
570 struct extent_io_tree extent_ins;
571
572 /* logical->physical extent mapping */
573 struct btrfs_mapping_tree mapping_tree;
574
575 u64 generation;
576 u64 last_trans_committed;
577 u64 last_trans_new_blockgroup;
578 u64 open_ioctl_trans;
579 unsigned long mount_opt;
580 u64 max_extent;
581 u64 max_inline;
582 u64 alloc_start;
583 struct btrfs_transaction *running_transaction;
584 wait_queue_head_t transaction_throttle;
585 wait_queue_head_t transaction_wait;
586 wait_queue_head_t async_submit_wait;
587
588 wait_queue_head_t tree_log_wait;
589
590 struct btrfs_super_block super_copy;
591 struct btrfs_super_block super_for_commit;
592 struct block_device *__bdev;
593 struct super_block *sb;
594 struct inode *btree_inode;
595 struct backing_dev_info bdi;
596 spinlock_t hash_lock;
597 struct mutex trans_mutex;
598 struct mutex tree_log_mutex;
599 struct mutex transaction_kthread_mutex;
600 struct mutex cleaner_mutex;
601 struct mutex alloc_mutex;
602 struct mutex chunk_mutex;
603 struct mutex drop_mutex;
604 struct mutex volume_mutex;
605 struct mutex tree_reloc_mutex;
606 struct list_head trans_list;
607 struct list_head hashers;
608 struct list_head dead_roots;
609
610 atomic_t nr_async_submits;
611 atomic_t async_submit_draining;
612 atomic_t nr_async_bios;
613 atomic_t tree_log_writers;
614 atomic_t tree_log_commit;
615 unsigned long tree_log_batch;
616 u64 tree_log_transid;
617
618 /*
619 * this is used by the balancing code to wait for all the pending
620 * ordered extents
621 */
622 spinlock_t ordered_extent_lock;
623 struct list_head ordered_extents;
624 struct list_head delalloc_inodes;
625
626 /*
627 * there is a pool of worker threads for checksumming during writes
628 * and a pool for checksumming after reads. This is because readers
629 * can run with FS locks held, and the writers may be waiting for
630 * those locks. We don't want ordering in the pending list to cause
631 * deadlocks, and so the two are serviced separately.
632 *
633 * A third pool does submit_bio to avoid deadlocking with the other
634 * two
635 */
636 struct btrfs_workers workers;
637 struct btrfs_workers endio_workers;
638 struct btrfs_workers endio_write_workers;
639 struct btrfs_workers submit_workers;
640 /*
641 * fixup workers take dirty pages that didn't properly go through
642	 * the cow mechanism and make them safe to write. This happens
643	 * on the sys_munmap call path
644 */
645 struct btrfs_workers fixup_workers;
646 struct task_struct *transaction_kthread;
647 struct task_struct *cleaner_kthread;
648 int thread_pool_size;
649
650	/* tree relocation related fields */
651 struct extent_io_tree reloc_mapping_tree;
652 struct list_head dead_reloc_roots;
653 struct btrfs_leaf_ref_tree reloc_ref_tree;
654 struct btrfs_leaf_ref_tree shared_ref_tree;
655
656 struct kobject super_kobj;
657 struct completion kobj_unregister;
658 int do_barriers;
659 int closing;
660 int log_root_recovering;
661 atomic_t throttles;
662 atomic_t throttle_gen;
663
664 u64 total_pinned;
665 struct list_head dirty_cowonly_roots;
666
667 struct btrfs_fs_devices *fs_devices;
668 struct list_head space_info;
669 spinlock_t delalloc_lock;
670 spinlock_t new_trans_lock;
671 u64 delalloc_bytes;
672 u64 last_alloc;
673 u64 last_data_alloc;
674
675 spinlock_t ref_cache_lock;
676 u64 total_ref_cache_size;
677
678 u64 avail_data_alloc_bits;
679 u64 avail_metadata_alloc_bits;
680 u64 avail_system_alloc_bits;
681 u64 data_alloc_profile;
682 u64 metadata_alloc_profile;
683 u64 system_alloc_profile;
684
685 void *bdev_holder;
686};
687
688/*
689 * in ram representation of the tree. extent_root is used for all
690 * allocations and holds the root of the extent tree itself.
691 */
692struct btrfs_dirty_root;
693struct btrfs_root {
694 struct extent_buffer *node;
695
696 /* the node lock is held while changing the node pointer */
697 spinlock_t node_lock;
698
699 struct extent_buffer *commit_root;
700 struct btrfs_leaf_ref_tree *ref_tree;
701 struct btrfs_leaf_ref_tree ref_tree_struct;
702 struct btrfs_dirty_root *dirty_root;
703 struct btrfs_root *log_root;
704 struct btrfs_root *reloc_root;
705
706 struct btrfs_root_item root_item;
707 struct btrfs_key root_key;
708 struct btrfs_fs_info *fs_info;
709 struct inode *inode;
710 struct extent_io_tree dirty_log_pages;
711
712 struct kobject root_kobj;
713 struct completion kobj_unregister;
714 struct mutex objectid_mutex;
715 struct mutex log_mutex;
716
717 u64 objectid;
718 u64 last_trans;
719
720 /* data allocations are done in sectorsize units */
721 u32 sectorsize;
722
723 /* node allocations are done in nodesize units */
724 u32 nodesize;
725
726 /* leaf allocations are done in leafsize units */
727 u32 leafsize;
728
729 u32 stripesize;
730
731 u32 type;
732 u64 highest_inode;
733 u64 last_inode_alloc;
734 int ref_cows;
735 int track_dirty;
736 u64 defrag_trans_start;
737 struct btrfs_key defrag_progress;
738 struct btrfs_key defrag_max;
739 int defrag_running;
740 int defrag_level;
741 char *name;
742 int in_sysfs;
743
744 /* the dirty list is only used by non-reference counted roots */
745 struct list_head dirty_list;
746
747 spinlock_t list_lock;
748 struct list_head dead_list;
749 struct list_head orphan_list;
750};
751
752/*
753 *
754 * inode items have the data typically returned from stat and store other
755 * info about object characteristics. There is one for every file and dir in
756 * the FS
757 */
758#define BTRFS_INODE_ITEM_KEY 1
759#define BTRFS_INODE_REF_KEY 2
760#define BTRFS_XATTR_ITEM_KEY 8
761#define BTRFS_ORPHAN_ITEM_KEY 9
762/* reserve 2-15 close to the inode for later flexibility */
763
764/*
765 * dir items are the name -> inode pointers in a directory. There is one
766 * for every name in a directory.
767 */
768#define BTRFS_DIR_LOG_ITEM_KEY 14
769#define BTRFS_DIR_LOG_INDEX_KEY 15
770#define BTRFS_DIR_ITEM_KEY 16
771#define BTRFS_DIR_INDEX_KEY 17
772/*
773 * extent data is for file data
774 */
775#define BTRFS_EXTENT_DATA_KEY 18
776/*
777 * csum items have the checksums for data in the extents
778 */
779#define BTRFS_CSUM_ITEM_KEY 19
780
781
782/* reserve 21-31 for other file/dir stuff */
783
784/*
785 * root items point to tree roots. They are typically in the root
786 * tree used by the super block to find all the other trees
787 */
788#define BTRFS_ROOT_ITEM_KEY 32
789/*
790 * extent items are in the extent map tree. These record which blocks
791 * are used, and how many references there are to each block
792 */
793#define BTRFS_EXTENT_ITEM_KEY 33
794#define BTRFS_EXTENT_REF_KEY 34
795
796/*
797 * block groups give us hints into the extent allocation trees:
798 * which blocks are free, etc.
799 */
800#define BTRFS_BLOCK_GROUP_ITEM_KEY 50
801
802#define BTRFS_DEV_EXTENT_KEY 75
803#define BTRFS_DEV_ITEM_KEY 76
804#define BTRFS_CHUNK_ITEM_KEY 77
805
806/*
807 * string items are for debugging. They just store a short string of
808 * data in the FS
809 */
810#define BTRFS_STRING_ITEM_KEY 253
811
812#define BTRFS_MOUNT_NODATASUM (1 << 0)
813#define BTRFS_MOUNT_NODATACOW (1 << 1)
814#define BTRFS_MOUNT_NOBARRIER (1 << 2)
815#define BTRFS_MOUNT_SSD (1 << 3)
816#define BTRFS_MOUNT_DEGRADED (1 << 4)
817
818#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
819#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
820#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
821 BTRFS_MOUNT_##opt)
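A small sketch of how the token-pasting option macros compose (hypothetical caller, assuming kernel context):

/* sketch: the suffix passed in is pasted onto BTRFS_MOUNT_, so it
 * must be one of the flag names defined above.
 */
static int example_nodatacow(struct btrfs_root *root)
{
	btrfs_set_opt(root->fs_info->mount_opt, NODATACOW);
	return btrfs_test_opt(root, NODATACOW);	/* now non-zero */
}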
822/*
823 * Inode flags
824 */
825#define BTRFS_INODE_NODATASUM (1 << 0)
826#define BTRFS_INODE_NODATACOW (1 << 1)
827#define BTRFS_INODE_READONLY (1 << 2)
828#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
829 ~BTRFS_INODE_##flag)
830#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
831 BTRFS_INODE_##flag)
832#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
833 BTRFS_INODE_##flag)
834/* some macros to generate set/get funcs for the struct fields. This
835 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
836 * one for u8:
837 */
838#define le8_to_cpu(v) (v)
839#define cpu_to_le8(v) (v)
840#define __le8 u8
841
842#define read_eb_member(eb, ptr, type, member, result) ( \
843 read_extent_buffer(eb, (char *)(result), \
844 ((unsigned long)(ptr)) + \
845 offsetof(type, member), \
846 sizeof(((type *)0)->member)))
847
848#define write_eb_member(eb, ptr, type, member, result) ( \
849 write_extent_buffer(eb, (char *)(result), \
850 ((unsigned long)(ptr)) + \
851 offsetof(type, member), \
852 sizeof(((type *)0)->member)))
853
854#ifndef BTRFS_SETGET_FUNCS
855#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
856u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
857void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
858#endif
859
860#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
861static inline u##bits btrfs_##name(struct extent_buffer *eb) \
862{ \
863 type *p = kmap_atomic(eb->first_page, KM_USER0); \
864 u##bits res = le##bits##_to_cpu(p->member); \
865 kunmap_atomic(p, KM_USER0); \
866 return res; \
867} \
868static inline void btrfs_set_##name(struct extent_buffer *eb, \
869 u##bits val) \
870{ \
871 type *p = kmap_atomic(eb->first_page, KM_USER0); \
872 p->member = cpu_to_le##bits(val); \
873 kunmap_atomic(p, KM_USER0); \
874}
875
876#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
877static inline u##bits btrfs_##name(type *s) \
878{ \
879 return le##bits##_to_cpu(s->member); \
880} \
881static inline void btrfs_set_##name(type *s, u##bits val) \
882{ \
883 s->member = cpu_to_le##bits(val); \
884}
885
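For reference, the stack variant expands to plain endian-converting accessors. Roughly, for one invocation made further down (a sketch of the expansion, not additional code in the patch):

/* sketch: approximate expansion of
 * BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item,
 *			    root_dirid, 64)
 */
static inline u64 btrfs_root_dirid(struct btrfs_root_item *s)
{
	return le64_to_cpu(s->root_dirid);
}
static inline void btrfs_set_root_dirid(struct btrfs_root_item *s, u64 val)
{
	s->root_dirid = cpu_to_le64(val);
}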
886BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
887BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
888BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
889BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
890BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
891BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
892BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
893BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
894BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
895BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
896
897BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
898BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
899 total_bytes, 64);
900BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
901 bytes_used, 64);
902BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
903 io_align, 32);
904BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
905 io_width, 32);
906BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
907 sector_size, 32);
908BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
909BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
910 dev_group, 32);
911BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
912 seek_speed, 8);
913BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
914 bandwidth, 8);
915
916static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
917{
918 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
919}
920
921BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
922BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
923BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
924BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
925BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
926BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
927BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
928BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
929BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
930BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
931BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
932
933static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
934{
935 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
936}
937
938BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
939BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
940BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
941 stripe_len, 64);
942BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
943 io_align, 32);
944BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
945 io_width, 32);
946BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
947 sector_size, 32);
948BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
949BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
950 num_stripes, 16);
951BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
952 sub_stripes, 16);
953BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
954BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
955
956static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
957 int nr)
958{
959 unsigned long offset = (unsigned long)c;
960 offset += offsetof(struct btrfs_chunk, stripe);
961 offset += nr * sizeof(struct btrfs_stripe);
962 return (struct btrfs_stripe *)offset;
963}
964
965static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
966{
967 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
968}
969
970static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
971 struct btrfs_chunk *c, int nr)
972{
973 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
974}
975
976static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
977 struct btrfs_chunk *c, int nr,
978 u64 val)
979{
980 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
981}
982
983static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
984 struct btrfs_chunk *c, int nr)
985{
986 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
987}
988
989static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
990 struct btrfs_chunk *c, int nr,
991 u64 val)
992{
993 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
994}
995
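Walking the variable-length stripe array of a chunk item then looks like this sketch (illustrative; example_print_stripes is hypothetical, and chunk must point at the item's location inside eb, see btrfs_item_ptr below):

/* sketch: dump every stripe of a chunk item stored in an extent
 * buffer, using the _nr helpers above.
 */
static void example_print_stripes(struct extent_buffer *eb,
				  struct btrfs_chunk *chunk)
{
	int i;
	int nr = btrfs_chunk_num_stripes(eb, chunk);

	for (i = 0; i < nr; i++)
		printk(KERN_INFO "stripe %d: devid %llu offset %llu\n", i,
		       (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
		       (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
}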
996/* struct btrfs_block_group_item */
997BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
998 used, 64);
999BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1000 used, 64);
1001BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1002 struct btrfs_block_group_item, chunk_objectid, 64);
1003
1004BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1005 struct btrfs_block_group_item, chunk_objectid, 64);
1006BTRFS_SETGET_FUNCS(disk_block_group_flags,
1007 struct btrfs_block_group_item, flags, 64);
1008BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1009 struct btrfs_block_group_item, flags, 64);
1010
1011/* struct btrfs_inode_ref */
1012BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1013BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1014
1015/* struct btrfs_inode_item */
1016BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1017BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1018BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1019BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1020BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1021BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1022BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1023BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1024BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1025BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1026BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
1027BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
1028 compat_flags, 16);
1029
1030static inline struct btrfs_timespec *
1031btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1032{
1033 unsigned long ptr = (unsigned long)inode_item;
1034 ptr += offsetof(struct btrfs_inode_item, atime);
1035 return (struct btrfs_timespec *)ptr;
1036}
1037
1038static inline struct btrfs_timespec *
1039btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1040{
1041 unsigned long ptr = (unsigned long)inode_item;
1042 ptr += offsetof(struct btrfs_inode_item, mtime);
1043 return (struct btrfs_timespec *)ptr;
1044}
1045
1046static inline struct btrfs_timespec *
1047btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1048{
1049 unsigned long ptr = (unsigned long)inode_item;
1050 ptr += offsetof(struct btrfs_inode_item, ctime);
1051 return (struct btrfs_timespec *)ptr;
1052}
1053
1054static inline struct btrfs_timespec *
1055btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1056{
1057 unsigned long ptr = (unsigned long)inode_item;
1058 ptr += offsetof(struct btrfs_inode_item, otime);
1059 return (struct btrfs_timespec *)ptr;
1060}
1061
1062BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1063BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1064
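Combining the offset helpers above with the timespec accessors gives the usual read pattern (a sketch; example_inode_atime_sec is hypothetical):

/* sketch: read the access time (seconds) of an inode item that lives
 * inside a leaf's extent buffer.
 */
static u64 example_inode_atime_sec(struct extent_buffer *eb,
				   struct btrfs_inode_item *ii)
{
	return btrfs_timespec_sec(eb, btrfs_inode_atime(ii));
}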
1065/* struct btrfs_dev_extent */
1066BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1067 chunk_tree, 64);
1068BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1069 chunk_objectid, 64);
1070BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1071 chunk_offset, 64);
1072BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1073
1074static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1075{
1076 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1077 return (u8 *)((unsigned long)dev + ptr);
1078}
1079
1080/* struct btrfs_extent_ref */
1081BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1082BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1083BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1084BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1085
1086BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1087BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1088 generation, 64);
1089BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1090 objectid, 64);
1091BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1092 num_refs, 32);
1093
1094/* struct btrfs_extent_item */
1095BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1096BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1097 refs, 32);
1098
1099/* struct btrfs_node */
1100BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1101BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1102
1103static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1104{
1105 unsigned long ptr;
1106 ptr = offsetof(struct btrfs_node, ptrs) +
1107 sizeof(struct btrfs_key_ptr) * nr;
1108 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1109}
1110
1111static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1112 int nr, u64 val)
1113{
1114 unsigned long ptr;
1115 ptr = offsetof(struct btrfs_node, ptrs) +
1116 sizeof(struct btrfs_key_ptr) * nr;
1117 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1118}
1119
1120static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1121{
1122 unsigned long ptr;
1123 ptr = offsetof(struct btrfs_node, ptrs) +
1124 sizeof(struct btrfs_key_ptr) * nr;
1125 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1126}
1127
1128static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1129 int nr, u64 val)
1130{
1131 unsigned long ptr;
1132 ptr = offsetof(struct btrfs_node, ptrs) +
1133 sizeof(struct btrfs_key_ptr) * nr;
1134 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1135}
1136
1137static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1138{
1139 return offsetof(struct btrfs_node, ptrs) +
1140 sizeof(struct btrfs_key_ptr) * nr;
1141}
1142
1143void btrfs_node_key(struct extent_buffer *eb,
1144 struct btrfs_disk_key *disk_key, int nr);
1145
1146static inline void btrfs_set_node_key(struct extent_buffer *eb,
1147 struct btrfs_disk_key *disk_key, int nr)
1148{
1149 unsigned long ptr;
1150 ptr = btrfs_node_key_ptr_offset(nr);
1151 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1152 struct btrfs_key_ptr, key, disk_key);
1153}
1154
1155/* struct btrfs_item */
1156BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1157BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1158
1159static inline unsigned long btrfs_item_nr_offset(int nr)
1160{
1161 return offsetof(struct btrfs_leaf, items) +
1162 sizeof(struct btrfs_item) * nr;
1163}
1164
1165static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1166 int nr)
1167{
1168 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1169}
1170
1171static inline u32 btrfs_item_end(struct extent_buffer *eb,
1172 struct btrfs_item *item)
1173{
1174 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1175}
1176
1177static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1178{
1179 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1180}
1181
1182static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1183{
1184 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1185}
1186
1187static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1188{
1189 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1190}
1191
1192static inline void btrfs_item_key(struct extent_buffer *eb,
1193 struct btrfs_disk_key *disk_key, int nr)
1194{
1195 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1196 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1197}
1198
1199static inline void btrfs_set_item_key(struct extent_buffer *eb,
1200 struct btrfs_disk_key *disk_key, int nr)
1201{
1202 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1203 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1204}
1205
1206BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1207
1208/* struct btrfs_dir_item */
1209BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1210BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1211BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1212BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1213
1214static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1215 struct btrfs_dir_item *item,
1216 struct btrfs_disk_key *key)
1217{
1218 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1219}
1220
1221static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1222 struct btrfs_dir_item *item,
1223 struct btrfs_disk_key *key)
1224{
1225 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1226}
1227
1228/* struct btrfs_disk_key */
1229BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1230 objectid, 64);
1231BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1232BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1233
1234static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1235 struct btrfs_disk_key *disk)
1236{
1237 cpu->offset = le64_to_cpu(disk->offset);
1238 cpu->type = disk->type;
1239 cpu->objectid = le64_to_cpu(disk->objectid);
1240}
1241
1242static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1243 struct btrfs_key *cpu)
1244{
1245 disk->offset = cpu_to_le64(cpu->offset);
1246 disk->type = cpu->type;
1247 disk->objectid = cpu_to_le64(cpu->objectid);
1248}
1249
1250static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1251 struct btrfs_key *key, int nr)
1252{
1253 struct btrfs_disk_key disk_key;
1254 btrfs_node_key(eb, &disk_key, nr);
1255 btrfs_disk_key_to_cpu(key, &disk_key);
1256}
1257
1258static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1259 struct btrfs_key *key, int nr)
1260{
1261 struct btrfs_disk_key disk_key;
1262 btrfs_item_key(eb, &disk_key, nr);
1263 btrfs_disk_key_to_cpu(key, &disk_key);
1264}
1265
1266static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1267 struct btrfs_dir_item *item,
1268 struct btrfs_key *key)
1269{
1270 struct btrfs_disk_key disk_key;
1271 btrfs_dir_item_key(eb, item, &disk_key);
1272 btrfs_disk_key_to_cpu(key, &disk_key);
1273}
1274
1275
1276static inline u8 btrfs_key_type(struct btrfs_key *key)
1277{
1278 return key->type;
1279}
1280
1281static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1282{
1283 key->type = val;
1284}
1285
1286/* struct btrfs_header */
1287BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1288BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1289 generation, 64);
1290BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1291BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1292BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1293BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1294
1295static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1296{
1297 return (btrfs_header_flags(eb) & flag) == flag;
1298}
1299
1300static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1301{
1302 u64 flags = btrfs_header_flags(eb);
1303 btrfs_set_header_flags(eb, flags | flag);
1304 return (flags & flag) == flag;
1305}
1306
1307static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1308{
1309 u64 flags = btrfs_header_flags(eb);
1310 btrfs_set_header_flags(eb, flags & ~flag);
1311 return (flags & flag) == flag;
1312}
1313
1314static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1315{
1316 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1317 return (u8 *)ptr;
1318}
1319
1320static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1321{
1322 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1323 return (u8 *)ptr;
1324}
1325
1326static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1327{
1328 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1329 return (u8 *)ptr;
1330}
1331
1332static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1333{
1334 unsigned long ptr = offsetof(struct btrfs_header, csum);
1335 return (u8 *)ptr;
1336}
1337
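Note that the "pointers" returned by the helpers above are really offsets into the extent buffer; they are consumed by the eb copy helpers rather than dereferenced. A sketch (example_read_fsid is hypothetical):

/* sketch: copy the fsid of a tree block out of its extent buffer. */
static void example_read_fsid(struct extent_buffer *eb, u8 *fsid_out)
{
	read_extent_buffer(eb, fsid_out,
			   (unsigned long)btrfs_header_fsid(eb),
			   BTRFS_FSID_SIZE);
}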
1338static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1339{
1340 return NULL;
1341}
1342
1343static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1344{
1345 return NULL;
1346}
1347
1348static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1349{
1350 return NULL;
1351}
1352
1353static inline int btrfs_is_leaf(struct extent_buffer *eb)
1354{
1355 return (btrfs_header_level(eb) == 0);
1356}
1357
1358/* struct btrfs_root_item */
1359BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1360BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1361BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1362
1363BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1364BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1365BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1366BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1367BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
1368BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1369BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1370
1371/* struct btrfs_super_block */
1372BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1373BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1374BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1375 generation, 64);
1376BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1377BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1378 struct btrfs_super_block, sys_chunk_array_size, 32);
1379BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1380 root_level, 8);
1381BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1382 chunk_root, 64);
1383BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1384 chunk_root_level, 8);
1385BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1386 log_root, 64);
1387BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1388 log_root_level, 8);
1389BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1390 total_bytes, 64);
1391BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1392 bytes_used, 64);
1393BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1394 sectorsize, 32);
1395BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1396 nodesize, 32);
1397BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1398 leafsize, 32);
1399BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1400 stripesize, 32);
1401BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1402 root_dir_objectid, 64);
1403BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1404 num_devices, 64);
1405
1406static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1407{
1408 return offsetof(struct btrfs_leaf, items);
1409}
1410
1411/* struct btrfs_file_extent_item */
1412BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1413
1414static inline unsigned long btrfs_file_extent_inline_start(struct
1415 btrfs_file_extent_item *e)
1416{
1417 unsigned long offset = (unsigned long)e;
1418 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1419 return offset;
1420}
1421
1422static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1423{
1424 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1425}
1426
1427static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1428 struct btrfs_item *e)
1429{
1430 unsigned long offset;
1431 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1432 return btrfs_item_size(eb, e) - offset;
1433}
1434
1435BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1436 disk_bytenr, 64);
1437BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1438 generation, 64);
1439BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1440 disk_num_bytes, 64);
1441BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1442 offset, 64);
1443BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1444 num_bytes, 64);
1445
1446static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1447{
1448 return sb->s_fs_info;
1449}
1450
1451static inline int btrfs_set_root_name(struct btrfs_root *root,
1452 const char *name, int len)
1453{
1454 /* if we already have a name just free it */
1455 if (root->name)
1456 kfree(root->name);
1457
1458 root->name = kmalloc(len+1, GFP_KERNEL);
1459 if (!root->name)
1460 return -ENOMEM;
1461
1462 memcpy(root->name, name, len);
1463 root->name[len] ='\0';
1464
1465 return 0;
1466}
1467
1468static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
1469 if (level == 0)
1470 return root->leafsize;
1471 return root->nodesize;
1472}
1473
1474/* helper function to cast into the data area of the leaf. */
1475#define btrfs_item_ptr(leaf, slot, type) \
1476 ((type *)(btrfs_leaf_data(leaf) + \
1477 btrfs_item_offset_nr(leaf, slot)))
1478
1479#define btrfs_item_ptr_offset(leaf, slot) \
1480 ((unsigned long)(btrfs_leaf_data(leaf) + \
1481 btrfs_item_offset_nr(leaf, slot)))
1482
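Putting the pieces together, an item's payload is reached through btrfs_item_ptr and then read with the generated accessors, since the leaf lives inside an extent buffer. A sketch (example_read_inode_size is hypothetical and assumes a successful prior search):

/* sketch: read the size field of the inode item in the current slot. */
static u64 example_read_inode_size(struct btrfs_path *path)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_inode_item *ii;

	ii = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	return btrfs_inode_size(leaf, ii);
}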
1483static inline struct dentry *fdentry(struct file *file)
1484{
1485 return file->f_path.dentry;
1486}
1487
1488/* extent-tree.c */
1489int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1490int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root, u64 bytenr,
1492 u64 num_bytes, u32 *refs);
1493int btrfs_update_pinned_extents(struct btrfs_root *root,
1494 u64 bytenr, u64 num, int pin);
1495int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1496 struct btrfs_root *root, struct extent_buffer *leaf);
1497int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
1498 struct btrfs_root *root,
1499 struct btrfs_key *key, u64 bytenr);
1500int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1501 struct btrfs_root *root);
1502int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1503struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
1504 btrfs_fs_info *info,
1505 u64 bytenr);
1506struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
1507 struct btrfs_block_group_cache
1508 *hint, u64 search_start,
1509 int data, int owner);
1510struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1511 struct btrfs_root *root,
1512 u32 blocksize, u64 parent,
1513 u64 root_objectid,
1514 u64 ref_generation,
1515 int level,
1516 u64 hint,
1517 u64 empty_size);
1518struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1519 struct btrfs_root *root,
1520 u64 bytenr, u32 blocksize);
1521int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1522 struct btrfs_root *root,
1523 u64 num_bytes, u64 parent, u64 min_bytes,
1524 u64 root_objectid, u64 ref_generation,
1525 u64 owner, u64 empty_size, u64 hint_byte,
1526 u64 search_end, struct btrfs_key *ins, u64 data);
1527int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1528 struct btrfs_root *root, u64 parent,
1529 u64 root_objectid, u64 ref_generation,
1530 u64 owner, struct btrfs_key *ins);
1531int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1532 struct btrfs_root *root, u64 parent,
1533 u64 root_objectid, u64 ref_generation,
1534 u64 owner, struct btrfs_key *ins);
1535int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1536 struct btrfs_root *root,
1537 u64 num_bytes, u64 min_alloc_size,
1538 u64 empty_size, u64 hint_byte,
1539 u64 search_end, struct btrfs_key *ins,
1540 u64 data);
1541int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1543 u32 *nr_extents);
1544int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1545 struct extent_buffer *buf, u32 nr_extents);
1546int btrfs_update_ref(struct btrfs_trans_handle *trans,
1547 struct btrfs_root *root, struct extent_buffer *orig_buf,
1548 struct extent_buffer *buf, int start_slot, int nr);
1549int btrfs_free_extent(struct btrfs_trans_handle *trans,
1550 struct btrfs_root *root,
1551 u64 bytenr, u64 num_bytes, u64 parent,
1552 u64 root_objectid, u64 ref_generation,
1553 u64 owner_objectid, int pin);
1554int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1555int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1556 struct btrfs_root *root,
1557 struct extent_io_tree *unpin);
1558int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1559 struct btrfs_root *root,
1560 u64 bytenr, u64 num_bytes, u64 parent,
1561 u64 root_objectid, u64 ref_generation,
1562 u64 owner_objectid);
1563int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1564 struct btrfs_root *root, u64 bytenr,
1565 u64 orig_parent, u64 parent,
1566 u64 root_objectid, u64 ref_generation,
1567 u64 owner_objectid);
1568int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1569 struct btrfs_root *root);
1570int btrfs_free_block_groups(struct btrfs_fs_info *info);
1571int btrfs_read_block_groups(struct btrfs_root *root);
1572int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1573 struct btrfs_root *root, u64 bytes_used,
1574 u64 type, u64 chunk_objectid, u64 chunk_offset,
1575 u64 size);
1576int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1577 struct btrfs_root *root, u64 group_start);
1578int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1579int btrfs_free_reloc_root(struct btrfs_root *root);
1580int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1581int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
1582 u64 num_bytes, u64 new_bytenr);
1583int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
1584 u64 num_bytes, u64 *new_bytenr);
1585void btrfs_free_reloc_mappings(struct btrfs_root *root);
1586int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1587 struct btrfs_root *root,
1588 struct extent_buffer *buf, u64 orig_start);
1589int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1590int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1591/* ctree.c */
1592int btrfs_previous_item(struct btrfs_root *root,
1593 struct btrfs_path *path, u64 min_objectid,
1594 int type);
1595int btrfs_merge_path(struct btrfs_trans_handle *trans,
1596 struct btrfs_root *root,
1597 struct btrfs_key *node_keys,
1598 u64 *nodes, int lowest_level);
1599int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1600 struct btrfs_root *root, struct btrfs_path *path,
1601 struct btrfs_key *new_key);
1602struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1603struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1604int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1605 struct btrfs_key *key, int lowest_level,
1606 int cache_only, u64 min_trans);
1607int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1608 struct btrfs_key *max_key,
1609 struct btrfs_path *path, int cache_only,
1610 u64 min_trans);
1611int btrfs_cow_block(struct btrfs_trans_handle *trans,
1612 struct btrfs_root *root, struct extent_buffer *buf,
1613 struct extent_buffer *parent, int parent_slot,
1614 struct extent_buffer **cow_ret, u64 prealloc_dest);
1615int btrfs_copy_root(struct btrfs_trans_handle *trans,
1616 struct btrfs_root *root,
1617 struct extent_buffer *buf,
1618 struct extent_buffer **cow_ret, u64 new_root_objectid);
1619int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1620 *root, struct btrfs_path *path, u32 data_size);
1621int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1622 struct btrfs_root *root,
1623 struct btrfs_path *path,
1624 u32 new_size, int from_end);
1625int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1626 *root, struct btrfs_key *key, struct btrfs_path *p, int
1627 ins_len, int cow);
1628int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1629 struct btrfs_root *root, struct extent_buffer *parent,
1630 int start_slot, int cache_only, u64 *last_ret,
1631 struct btrfs_key *progress);
1632void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1633struct btrfs_path *btrfs_alloc_path(void);
1634void btrfs_free_path(struct btrfs_path *p);
1635void btrfs_init_path(struct btrfs_path *p);
1636int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1637 struct btrfs_path *path, int slot, int nr);
1638int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1639 struct btrfs_root *root,
1640 struct btrfs_path *path, u64 bytenr);
1641static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1642 struct btrfs_root *root,
1643 struct btrfs_path *path)
1644{
1645 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1646}
1647
1648int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1649 *root, struct btrfs_key *key, void *data, u32 data_size);
1650int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1651 struct btrfs_root *root,
1652 struct btrfs_path *path,
1653 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1654
1655static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root,
1657 struct btrfs_path *path,
1658 struct btrfs_key *key,
1659 u32 data_size)
1660{
1661 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1662}
1663
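The usual insert pattern reserves space with btrfs_insert_empty_item and then writes the payload through the extent buffer helpers; this is essentially what btrfs_insert_item does internally. A sketch (example_insert is hypothetical; btrfs_mark_buffer_dirty comes from disk-io.h elsewhere in this patch, and error paths are trimmed):

/* sketch: insert one item and copy its payload into the leaf. */
static int example_insert(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_key *key,
			  void *data, u32 data_size)
{
	struct btrfs_path *path;
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_item(trans, root, path, key, data_size);
	if (ret == 0) {
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		write_extent_buffer(path->nodes[0], data, ptr, data_size);
		btrfs_mark_buffer_dirty(path->nodes[0]);
	}

	btrfs_free_path(path);
	return ret;
}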
1664int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1665int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1666int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1667int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1668 *root);
1669/* root-item.c */
1670int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1671 struct btrfs_key *key);
1672int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1673 *root, struct btrfs_key *key, struct btrfs_root_item
1674 *item);
1675int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1676 *root, struct btrfs_key *key, struct btrfs_root_item
1677 *item);
1678int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1679 btrfs_root_item *item, struct btrfs_key *key);
1680int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1681 u64 *found_objectid);
1682int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1683 struct btrfs_root *latest_root);
1684/* dir-item.c */
1685int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
1686 *root, const char *name, int name_len, u64 dir,
1687 struct btrfs_key *location, u8 type, u64 index);
1688struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1689 struct btrfs_root *root,
1690 struct btrfs_path *path, u64 dir,
1691 const char *name, int name_len,
1692 int mod);
1693struct btrfs_dir_item *
1694btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1695 struct btrfs_root *root,
1696 struct btrfs_path *path, u64 dir,
1697 u64 objectid, const char *name, int name_len,
1698 int mod);
1699struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1700 struct btrfs_path *path,
1701 const char *name, int name_len);
1702int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root,
1704 struct btrfs_path *path,
1705 struct btrfs_dir_item *di);
1706int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root, const char *name,
1708 u16 name_len, const void *data, u16 data_len,
1709 u64 dir);
1710struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1711 struct btrfs_root *root,
1712 struct btrfs_path *path, u64 dir,
1713 const char *name, u16 name_len,
1714 int mod);
1715
1716/* orphan.c */
1717int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root, u64 offset);
1719int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1720 struct btrfs_root *root, u64 offset);
1721
1722/* inode-map.c */
1723int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1724 struct btrfs_root *fs_root,
1725 u64 dirid, u64 *objectid);
1726int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1727
1728/* inode-item.c */
1729int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1730 struct btrfs_root *root,
1731 const char *name, int name_len,
1732 u64 inode_objectid, u64 ref_objectid, u64 index);
1733int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1734 struct btrfs_root *root,
1735 const char *name, int name_len,
1736 u64 inode_objectid, u64 ref_objectid, u64 *index);
1737int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1738 struct btrfs_root *root,
1739 struct btrfs_path *path, u64 objectid);
1740int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1741 *root, struct btrfs_path *path,
1742 struct btrfs_key *location, int mod);
1743
1744/* file-item.c */
1745int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1746 struct bio *bio);
1747int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1748 struct btrfs_root *root,
1749 u64 objectid, u64 pos, u64 disk_offset,
1750 u64 disk_num_bytes,
1751 u64 num_bytes, u64 offset);
1752int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1753 struct btrfs_root *root,
1754 struct btrfs_path *path, u64 objectid,
1755 u64 bytenr, int mod);
1756int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1757 struct btrfs_root *root, struct inode *inode,
1758 struct btrfs_ordered_sum *sums);
1759int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1760 struct bio *bio);
1761struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1762 struct btrfs_root *root,
1763 struct btrfs_path *path,
1764 u64 objectid, u64 offset,
1765 int cow);
1766int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1767 struct btrfs_root *root, struct btrfs_path *path,
1768 u64 isize);
1769/* inode.c */
1770
1771/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
1772#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
1773#define ClearPageChecked ClearPageFsMisc
1774#define SetPageChecked SetPageFsMisc
1775#define PageChecked PageFsMisc
1776#endif
1777
1778int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1779 struct btrfs_root *root,
1780 struct inode *dir, struct inode *inode,
1781 const char *name, int name_len);
1782int btrfs_add_link(struct btrfs_trans_handle *trans,
1783 struct inode *parent_inode, struct inode *inode,
1784 const char *name, int name_len, int add_backref, u64 index);
1785int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1786 struct btrfs_root *root,
1787 struct inode *inode, u64 new_size,
1788 u32 min_type);
1789
1790int btrfs_start_delalloc_inodes(struct btrfs_root *root);
1791int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
1792int btrfs_writepages(struct address_space *mapping,
1793 struct writeback_control *wbc);
1794int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
1795 struct btrfs_trans_handle *trans, u64 new_dirid,
1796 struct btrfs_block_group_cache *block_group);
1797
1798void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
1799 int namelen);
1800
1801int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1802 size_t size, struct bio *bio);
1803
1804unsigned long btrfs_force_ra(struct address_space *mapping,
1805 struct file_ra_state *ra, struct file *file,
1806 pgoff_t offset, pgoff_t last_index);
1807int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
1808 int for_del);
1809int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1810int btrfs_readpage(struct file *file, struct page *page);
1811void btrfs_delete_inode(struct inode *inode);
1812void btrfs_put_inode(struct inode *inode);
1813void btrfs_read_locked_inode(struct inode *inode);
1814int btrfs_write_inode(struct inode *inode, int wait);
1815void btrfs_dirty_inode(struct inode *inode);
1816struct inode *btrfs_alloc_inode(struct super_block *sb);
1817void btrfs_destroy_inode(struct inode *inode);
1818int btrfs_init_cachep(void);
1819void btrfs_destroy_cachep(void);
1820long btrfs_ioctl_trans_end(struct file *file);
1821struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
1822 struct btrfs_root *root, int wait);
1823struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
1824 struct btrfs_root *root);
1825struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
1826 struct btrfs_root *root, int *is_new);
1827int btrfs_commit_write(struct file *file, struct page *page,
1828 unsigned from, unsigned to);
1829struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
1830 size_t page_offset, u64 start, u64 end,
1831 int create);
1832int btrfs_update_inode(struct btrfs_trans_handle *trans,
1833 struct btrfs_root *root,
1834 struct inode *inode);
1835int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
1836int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
1837void btrfs_orphan_cleanup(struct btrfs_root *root);
1838
1839/* ioctl.c */
1840long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1841
1842/* file.c */
1843int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
1844int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
1845 int skip_pinned);
1846int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
1847extern struct file_operations btrfs_file_operations;
1848int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1849 struct btrfs_root *root, struct inode *inode,
1850 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
1851int btrfs_release_file(struct inode *inode, struct file *file);
1852
1853/* tree-defrag.c */
1854int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
1855 struct btrfs_root *root, int cache_only);
1856
1857/* sysfs.c */
1858int btrfs_init_sysfs(void);
1859void btrfs_exit_sysfs(void);
1860int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
1861int btrfs_sysfs_add_root(struct btrfs_root *root);
1862void btrfs_sysfs_del_root(struct btrfs_root *root);
1863void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
1864
1865/* xattr.c */
1866ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1867
1868/* super.c */
1869u64 btrfs_parse_size(char *str);
1870int btrfs_parse_options(struct btrfs_root *root, char *options);
1871int btrfs_sync_fs(struct super_block *sb, int wait);
1872
1873/* acl.c */
1874int btrfs_check_acl(struct inode *inode, int mask);
1875int btrfs_init_acl(struct inode *inode, struct inode *dir);
1876int btrfs_acl_chmod(struct inode *inode);
1877
1878/* free-space-cache.c */
1879int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1880 u64 bytenr, u64 size);
1881int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1882 u64 bytenr, u64 size);
1883void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
1884 *block_group);
1885struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
1886 *block_group, u64 offset,
1887 u64 bytes);
1888void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1889 u64 bytes);
1890u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
1891#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..5040b71f1900
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
 30 * The name is not copied into the dir item; you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
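
/*
 * Illustrative sketch (not part of the patch): a userspace model of the
 * overflow layout above.  All entries whose names hash to the same value
 * share one btree item; each insert grows the item and the new entry lands
 * at the tail, which is why the pointer returned above works out to
 * item_end - data_size.  The fixed 256-byte buffer is a hypothetical
 * stand-in for a leaf item.
 */
#include <assert.h>

struct fake_item {
        char data[256];
        unsigned int size;
};

/* grow the item by data_size and return where the new entry lives */
static char *extend_item(struct fake_item *item, unsigned int data_size)
{
        assert(item->size + data_size <= sizeof(item->data));
        item->size += data_size;
        return item->data + item->size - data_size;
}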
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
 84 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
 85     BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
 86 return -ENOSPC;
 87 key.objectid = dir;
 88 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
 89 key.offset = btrfs_name_hash(name, name_len);
 90 path = btrfs_alloc_path();
 91 if (!path)
 92 return -ENOMEM;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
 97 /*
 98  * FIXME: at some point we should handle xattrs that are larger than
 99  * what we can fit in our leaf. We set location to NULL because we
100  * aren't pointing at anything else; that will change if we store the
101  * xattr data in a separate inode.
102  */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
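
/*
 * Illustrative sketch: the -ENOSPC guard above in isolation.  An xattr
 * only fits if name + value + the two per-item headers fit inside one
 * leaf's data area; all four sizes below are parameters, not the real
 * on-disk constants.
 */
static int xattr_fits_in_leaf(unsigned int leaf_data_size,
                              unsigned int item_hdr_size,
                              unsigned int dir_item_hdr_size,
                              unsigned int name_len, unsigned int data_len)
{
        return name_len + data_len + dir_item_hdr_size <=
               leaf_data_size - item_hdr_size;
}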
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
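
/*
 * Illustrative sketch: the two keys every directory name gets above (the
 * tree root skips the second one).  The DIR_ITEM key is hashed so lookups
 * by name are cheap; the DIR_INDEX key carries a monotonic sequence number
 * so readdir returns entries in insertion order.  The struct and the type
 * values below are schematic, not the on-disk definitions.
 */
struct demo_key {
        unsigned long long objectid;    /* the directory's objectid */
        unsigned char type;             /* dir-item vs dir-index */
        unsigned long long offset;      /* name hash, or sequence number */
};

static void fill_dir_keys(struct demo_key *by_name, struct demo_key *by_index,
                          unsigned long long dir, unsigned long long name_hash,
                          unsigned long long index)
{
        by_name->objectid = dir;
        by_name->type = 1;              /* stands in for BTRFS_DIR_ITEM_KEY */
        by_name->offset = name_hash;

        by_index->objectid = dir;
        by_index->type = 2;             /* stands in for BTRFS_DIR_INDEX_KEY */
        by_index->offset = index;
}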
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us whether the caller plans to
209 * delete the item (use mod < 0) or change it in place (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
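
/*
 * Illustrative sketch: how 'mod' maps onto the btrfs_search_slot()
 * arguments in the lookup helpers here.  mod < 0 asks the search to
 * reserve room for a delete (ins_len = -1), and any non-zero mod forces
 * the path to be COWed so the caller may modify what it finds.
 */
static void mod_to_search_args(int mod, int *ins_len, int *cow)
{
        *ins_len = mod < 0 ? -1 : 0;
        *cow = mod != 0;
}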
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us whether the caller plans to
252 * delete the item (use mod < 0) or change it in place (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name
255 * the caller was looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'.
319 * This walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
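
/*
 * Illustrative sketch: a userspace version of the walk above.  Entries are
 * packed back to back, each a small header followed by its name (and xattr
 * data, if any); demo_entry is a hypothetical stand-in for the on-disk
 * btrfs_dir_item layout.
 */
#include <stddef.h>
#include <string.h>

struct demo_entry {
        unsigned short name_len;
        unsigned short data_len;
        /* name bytes, then data bytes, follow immediately */
};

static const char *match_name(const char *buf, unsigned int total_len,
                              const char *name, unsigned int name_len)
{
        unsigned int cur = 0;

        while (cur + sizeof(struct demo_entry) <= total_len) {
                struct demo_entry e;

                /* copy the header out: packed entries may be unaligned */
                memcpy(&e, buf + cur, sizeof(e));
                if (e.name_len == name_len &&
                    memcmp(buf + cur + sizeof(e), name, name_len) == 0)
                        return buf + cur;
                cur += sizeof(e) + e.name_len + e.data_len;
        }
        return NULL;
}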
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* slide the remaining entries down over the deleted one */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return ret;
386}
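
/*
 * Illustrative sketch: the multi-entry delete above in userspace form.
 * When several colliding entries share one item, the bytes after the
 * victim slide down and the item is truncated; only when the victim is
 * the sole entry is the whole item deleted.
 */
#include <string.h>

static unsigned int delete_packed_entry(char *item, unsigned int item_len,
                                        unsigned int entry_off,
                                        unsigned int entry_len)
{
        memmove(item + entry_off, item + entry_off + entry_len,
                item_len - (entry_off + entry_len));
        return item_len - entry_len;    /* the new, truncated item length */
}
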
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..0be044bb6194
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2078 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h> /* for block_sync_page */
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "crc32c.h"
31#include "ctree.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "btrfs_inode.h"
35#include "volumes.h"
36#include "print-tree.h"
37#include "async-thread.h"
38#include "locking.h"
39#include "ref-cache.h"
40#include "tree-log.h"
41
42#if 0
43static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
44{
45 if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
46 printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
47 (unsigned long long)extent_buffer_blocknr(buf),
48 (unsigned long long)btrfs_header_blocknr(buf));
49 return 1;
50 }
51 return 0;
52}
53#endif
54
55static struct extent_io_ops btree_extent_io_ops;
56static void end_workqueue_fn(struct btrfs_work *work);
57
58/*
59 * end_io_wq structs are used to do processing in task context when an IO is
60 * complete. This is used during reads to verify checksums, and it is used
61 * by writes to insert metadata for new file extents after IO is complete.
62 */
63struct end_io_wq {
64 struct bio *bio;
65 bio_end_io_t *end_io;
66 void *private;
67 struct btrfs_fs_info *info;
68 int error;
69 int metadata;
70 struct list_head list;
71 struct btrfs_work work;
72};
73
74/*
75 * async submit bios are used to offload expensive checksumming
76 * onto the worker threads. They checksum file and metadata bios
77 * just before they are sent down the IO stack.
78 */
79struct async_submit_bio {
80 struct inode *inode;
81 struct bio *bio;
82 struct list_head list;
83 extent_submit_bio_hook_t *submit_bio_hook;
84 int rw;
85 int mirror_num;
86 struct btrfs_work work;
87};
88
89/*
 90 * extents on the btree inode are pretty simple: there's one extent
91 * that covers the entire device
92 */
93struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
94 size_t page_offset, u64 start, u64 len,
95 int create)
96{
97 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
98 struct extent_map *em;
99 int ret;
100
101 spin_lock(&em_tree->lock);
102 em = lookup_extent_mapping(em_tree, start, len);
103 if (em) {
104 em->bdev =
105 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
106 spin_unlock(&em_tree->lock);
107 goto out;
108 }
109 spin_unlock(&em_tree->lock);
110
111 em = alloc_extent_map(GFP_NOFS);
112 if (!em) {
113 em = ERR_PTR(-ENOMEM);
114 goto out;
115 }
116 em->start = 0;
117 em->len = (u64)-1;
118 em->block_start = 0;
119 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
120
121 spin_lock(&em_tree->lock);
122 ret = add_extent_mapping(em_tree, em);
123 if (ret == -EEXIST) {
124 u64 failed_start = em->start;
125 u64 failed_len = em->len;
126
127 printk("failed to insert %Lu %Lu -> %Lu into tree\n",
128 em->start, em->len, em->block_start);
129 free_extent_map(em);
130 em = lookup_extent_mapping(em_tree, start, len);
131 if (em) {
132 printk("after failing, found %Lu %Lu %Lu\n",
133 em->start, em->len, em->block_start);
134 ret = 0;
135 } else {
136 em = lookup_extent_mapping(em_tree, failed_start,
137 failed_len);
138 if (em) {
139 printk("double failure lookup gives us "
140 "%Lu %Lu -> %Lu\n", em->start,
141 em->len, em->block_start);
142 free_extent_map(em);
143 }
144 ret = -EIO;
145 }
146 } else if (ret) {
147 free_extent_map(em);
148 em = NULL;
149 }
150 spin_unlock(&em_tree->lock);
151
152 if (ret)
153 em = ERR_PTR(ret);
154out:
155 return em;
156}
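
/*
 * Illustrative sketch: the btree inode's trivial mapping built above.  A
 * lookup miss inserts a single extent running from 0 to (u64)-1, so every
 * metadata page resolves through one pass-through mapping onto the latest
 * device.  demo_extent is a hypothetical stand-in for struct extent_map.
 */
struct demo_extent {
        unsigned long long start;
        unsigned long long len;
        unsigned long long block_start;
};

static void fill_btree_extent(struct demo_extent *em)
{
        em->start = 0;
        em->len = (unsigned long long)-1;  /* covers the whole address space */
        em->block_start = 0;               /* offsets pass straight through */
}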
157
158u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
159{
160 return btrfs_crc32c(seed, data, len);
161}
162
163void btrfs_csum_final(u32 crc, char *result)
164{
165 *(__le32 *)result = ~cpu_to_le32(crc);
166}
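
/*
 * Illustrative sketch: a self-contained CRC32C matching the seed/finalize
 * split above, where the caller seeds with ~0 and btrfs_csum_final()
 * stores the bitwise complement as a little-endian u32.  The bitwise
 * Castagnoli loop is for illustration only; the kernel uses its crc32c
 * library instead.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;
        int i;

        while (len--) {
                crc ^= *p++;
                for (i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return crc;
}

static void crc32c_final(uint32_t crc, char *result)
{
        uint32_t le = ~crc;     /* assume a little-endian host in this sketch */

        memcpy(result, &le, sizeof(le));
}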
167
168/*
169 * compute the csum for a btree block, and either verify it or write it
170 * into the csum field of the block.
171 */
172static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
173 int verify)
174{
175 char result[BTRFS_CRC32_SIZE];
176 unsigned long len;
177 unsigned long cur_len;
178 unsigned long offset = BTRFS_CSUM_SIZE;
179 char *map_token = NULL;
180 char *kaddr;
181 unsigned long map_start;
182 unsigned long map_len;
183 int err;
184 u32 crc = ~(u32)0;
185
186 len = buf->len - offset;
187 while (len > 0) {
188 err = map_private_extent_buffer(buf, offset, 32,
189 &map_token, &kaddr,
190 &map_start, &map_len, KM_USER0);
191 if (err) {
192 printk("failed to map extent buffer! %lu\n",
193 offset);
194 return 1;
195 }
196 cur_len = min(len, map_len - (offset - map_start));
197 crc = btrfs_csum_data(root, kaddr + offset - map_start,
198 crc, cur_len);
199 len -= cur_len;
200 offset += cur_len;
201 unmap_extent_buffer(buf, map_token, KM_USER0);
202 }
203 btrfs_csum_final(crc, result);
204
205 if (verify) {
206 /* FIXME, this is not good */
207 if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
208 u32 val;
209 u32 found = 0;
210 memcpy(&found, result, BTRFS_CRC32_SIZE);
211
212 read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
213 printk("btrfs: %s checksum verify failed on %llu "
214 "wanted %X found %X level %d\n",
215 root->fs_info->sb->s_id,
216 buf->start, val, found, btrfs_header_level(buf));
217 return 1;
218 }
219 } else {
220 write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
221 }
222 return 0;
223}
224
225/*
226 * we can't consider a given block up to date unless the transid of the
227 * block matches the transid in the parent node's pointer. This is how we
228 * detect blocks that either didn't get written at all or got written
229 * in the wrong place.
230 */
231static int verify_parent_transid(struct extent_io_tree *io_tree,
232 struct extent_buffer *eb, u64 parent_transid)
233{
234 int ret;
235
236 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
237 return 0;
238
239 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
240 if (extent_buffer_uptodate(io_tree, eb) &&
241 btrfs_header_generation(eb) == parent_transid) {
242 ret = 0;
243 goto out;
244 }
245 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
246 (unsigned long long)eb->start,
247 (unsigned long long)parent_transid,
248 (unsigned long long)btrfs_header_generation(eb));
249 ret = 1;
250 clear_extent_buffer_uptodate(io_tree, eb);
251out:
252 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
253 GFP_NOFS);
254 return ret;
255}
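
/*
 * Illustrative sketch: the invariant verify_parent_transid() enforces.  A
 * child block is trusted only if the generation stamped in its own header
 * matches the transid the parent recorded when it wrote the pointer; a
 * parent_transid of zero means the caller has no expectation to check.
 */
static int transid_matches(unsigned long long header_generation,
                           unsigned long long parent_transid)
{
        return parent_transid == 0 ||
               header_generation == parent_transid;
}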
256
257/*
258 * helper to read a given tree block, doing retries as required when
259 * the checksums don't match and we have alternate mirrors to try.
260 */
261static int btree_read_extent_buffer_pages(struct btrfs_root *root,
262 struct extent_buffer *eb,
263 u64 start, u64 parent_transid)
264{
265 struct extent_io_tree *io_tree;
266 int ret;
267 int num_copies = 0;
268 int mirror_num = 0;
269
270 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
271 while (1) {
272 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
273 btree_get_extent, mirror_num);
274 if (!ret &&
275 !verify_parent_transid(io_tree, eb, parent_transid))
276 return ret;
277 printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
278 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
279 eb->start, eb->len);
280 if (num_copies == 1)
281 return ret;
282
283 mirror_num++;
284 if (mirror_num > num_copies)
285 return ret;
286 }
287 return -EIO;
288}
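
/*
 * Illustrative sketch: the retry policy above, generalized.  Mirror 0 lets
 * the lower layer choose a copy; after a checksum or transid failure the
 * explicit mirrors are tried in turn until the copies run out.
 * read_and_verify() is a hypothetical callback.
 */
#include <errno.h>

static int read_with_mirror_retries(int num_copies,
                                    int (*read_and_verify)(int mirror_num))
{
        int mirror_num;

        for (mirror_num = 0; mirror_num <= num_copies; mirror_num++) {
                if (read_and_verify(mirror_num) == 0)
                        return 0;
                if (num_copies == 1)
                        break;  /* a single copy leaves nothing to retry */
        }
        return -EIO;
}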
289
290/*
291 * checksum a dirty tree block before IO. This has extra checks to make
292 * sure we only fill in the checksum field in the first page of a multi-page block
293 */
294int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
295{
296 struct extent_io_tree *tree;
297 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
298 u64 found_start;
299 int found_level;
300 unsigned long len;
301 struct extent_buffer *eb;
302 int ret;
303
304 tree = &BTRFS_I(page->mapping->host)->io_tree;
305
306 if (page->private == EXTENT_PAGE_PRIVATE)
307 goto out;
308 if (!page->private)
309 goto out;
310 len = page->private >> 2;
311 if (len == 0) {
312 WARN_ON(1);
313 }
314 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
315 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
316 btrfs_header_generation(eb));
317 BUG_ON(ret);
318 found_start = btrfs_header_bytenr(eb);
319 if (found_start != start) {
320 printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
321 start, found_start, len);
322 WARN_ON(1);
323 goto err;
324 }
325 if (eb->first_page != page) {
326 printk("bad first page %lu %lu\n", eb->first_page->index,
327 page->index);
328 WARN_ON(1);
329 goto err;
330 }
331 if (!PageUptodate(page)) {
332 printk("csum not up to date page %lu\n", page->index);
333 WARN_ON(1);
334 goto err;
335 }
336 found_level = btrfs_header_level(eb);
337
338 csum_tree_block(root, eb, 0);
339err:
340 free_extent_buffer(eb);
341out:
342 return 0;
343}
344
345int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
346 struct extent_state *state)
347{
348 struct extent_io_tree *tree;
349 u64 found_start;
350 int found_level;
351 unsigned long len;
352 struct extent_buffer *eb;
353 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
354 int ret = 0;
355
356 tree = &BTRFS_I(page->mapping->host)->io_tree;
357 if (page->private == EXTENT_PAGE_PRIVATE)
358 goto out;
359 if (!page->private)
360 goto out;
361 len = page->private >> 2;
362 if (len == 0) {
363 WARN_ON(1);
364 }
365 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
366
367 found_start = btrfs_header_bytenr(eb);
368 if (found_start != start) {
369 printk("bad tree block start %llu %llu\n",
370 (unsigned long long)found_start,
371 (unsigned long long)eb->start);
372 ret = -EIO;
373 goto err;
374 }
375 if (eb->first_page != page) {
376 printk("bad first page %lu %lu\n", eb->first_page->index,
377 page->index);
378 WARN_ON(1);
379 ret = -EIO;
380 goto err;
381 }
382 if (memcmp_extent_buffer(eb, root->fs_info->fsid,
383 (unsigned long)btrfs_header_fsid(eb),
384 BTRFS_FSID_SIZE)) {
385 printk("bad fsid on block %Lu\n", eb->start);
386 ret = -EIO;
387 goto err;
388 }
389 found_level = btrfs_header_level(eb);
390
391 ret = csum_tree_block(root, eb, 1);
392 if (ret)
393 ret = -EIO;
394
395 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
396 end = eb->start + end - 1;
397err:
398 free_extent_buffer(eb);
399out:
400 return ret;
401}
402
403static void end_workqueue_bio(struct bio *bio, int err)
404{
405 struct end_io_wq *end_io_wq = bio->bi_private;
406 struct btrfs_fs_info *fs_info;
407
408 fs_info = end_io_wq->info;
409 end_io_wq->error = err;
410 end_io_wq->work.func = end_workqueue_fn;
411 end_io_wq->work.flags = 0;
412 if (bio->bi_rw & (1 << BIO_RW))
413 btrfs_queue_worker(&fs_info->endio_write_workers,
414 &end_io_wq->work);
415 else
416 btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
417}
418
419int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
420 int metadata)
421{
422 struct end_io_wq *end_io_wq;
423 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
424 if (!end_io_wq)
425 return -ENOMEM;
426
427 end_io_wq->private = bio->bi_private;
428 end_io_wq->end_io = bio->bi_end_io;
429 end_io_wq->info = info;
430 end_io_wq->error = 0;
431 end_io_wq->bio = bio;
432 end_io_wq->metadata = metadata;
433
434 bio->bi_private = end_io_wq;
435 bio->bi_end_io = end_workqueue_bio;
436 return 0;
437}
438
439unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
440{
441 unsigned long limit = min_t(unsigned long,
442 info->workers.max_workers,
443 info->fs_devices->open_devices);
444 return 256 * limit;
445}
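
/*
 * Illustrative sketch: the async budget above, spelled out.  The cap on
 * in-flight async bios scales with whichever resource is scarcer, worker
 * threads or open devices, at 256 bios per unit (the patch's constant).
 */
static unsigned long async_submit_budget(unsigned long max_workers,
                                         unsigned long open_devices)
{
        unsigned long units = max_workers < open_devices ?
                              max_workers : open_devices;

        return 256 * units;
}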
446
447int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
448{
449 return atomic_read(&info->nr_async_bios) >
450 btrfs_async_submit_limit(info);
451}
452
453static void run_one_async_submit(struct btrfs_work *work)
454{
455 struct btrfs_fs_info *fs_info;
456 struct async_submit_bio *async;
457 int limit;
458
459 async = container_of(work, struct async_submit_bio, work);
460 fs_info = BTRFS_I(async->inode)->root->fs_info;
461
462 limit = btrfs_async_submit_limit(fs_info);
463 limit = limit * 2 / 3;
464
465 atomic_dec(&fs_info->nr_async_submits);
466
467 if (atomic_read(&fs_info->nr_async_submits) < limit &&
468 waitqueue_active(&fs_info->async_submit_wait))
469 wake_up(&fs_info->async_submit_wait);
470
471 async->submit_bio_hook(async->inode, async->rw, async->bio,
472 async->mirror_num);
473 kfree(async);
474}
475
476int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
477 int rw, struct bio *bio, int mirror_num,
478 extent_submit_bio_hook_t *submit_bio_hook)
479{
480 struct async_submit_bio *async;
481 int limit = btrfs_async_submit_limit(fs_info);
482
483 async = kmalloc(sizeof(*async), GFP_NOFS);
484 if (!async)
485 return -ENOMEM;
486
487 async->inode = inode;
488 async->rw = rw;
489 async->bio = bio;
490 async->mirror_num = mirror_num;
491 async->submit_bio_hook = submit_bio_hook;
492 async->work.func = run_one_async_submit;
493 async->work.flags = 0;
494
495 while (atomic_read(&fs_info->async_submit_draining) &&
496 atomic_read(&fs_info->nr_async_submits)) {
497 wait_event(fs_info->async_submit_wait,
498 (atomic_read(&fs_info->nr_async_submits) == 0));
499 }
500
501 atomic_inc(&fs_info->nr_async_submits);
502 btrfs_queue_worker(&fs_info->workers, &async->work);
503
504 if (atomic_read(&fs_info->nr_async_submits) > limit) {
505 wait_event_timeout(fs_info->async_submit_wait,
506 (atomic_read(&fs_info->nr_async_submits) < limit),
507 HZ/10);
508
509 wait_event_timeout(fs_info->async_submit_wait,
510 (atomic_read(&fs_info->nr_async_bios) < limit),
511 HZ/10);
512 }
513 return 0;
514}
515
516static int btree_csum_one_bio(struct bio *bio)
517{
518 struct bio_vec *bvec = bio->bi_io_vec;
519 int bio_index = 0;
520 struct btrfs_root *root;
521
522 WARN_ON(bio->bi_vcnt <= 0);
523 while (bio_index < bio->bi_vcnt) {
524 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
525 csum_dirty_buffer(root, bvec->bv_page);
526 bio_index++;
527 bvec++;
528 }
529 return 0;
530}
531
532static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
533 int mirror_num)
534{
535 struct btrfs_root *root = BTRFS_I(inode)->root;
536 int ret;
537
538 /*
539 * when we're called for a write, we're already in the async
540 * submission context. Just jump into btrfs_map_bio
541 */
542 if (rw & (1 << BIO_RW)) {
543 btree_csum_one_bio(bio);
544 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
545 mirror_num, 1);
546 }
547
548 /*
549 * called for a read, do the setup so that checksum validation
550 * can happen in the async kernel threads
551 */
552 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
553 BUG_ON(ret);
554
555 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
556}
557
558static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
559 int mirror_num)
560{
561 /*
562 * kthread helpers are used to submit writes so that checksumming
563 * can happen in parallel across all CPUs
564 */
565 if (!(rw & (1 << BIO_RW))) {
566 return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
567 }
568 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
569 inode, rw, bio, mirror_num,
570 __btree_submit_bio_hook);
571}
572
573static int btree_writepage(struct page *page, struct writeback_control *wbc)
574{
575 struct extent_io_tree *tree;
576 tree = &BTRFS_I(page->mapping->host)->io_tree;
577
578 if (current->flags & PF_MEMALLOC) {
579 redirty_page_for_writepage(wbc, page);
580 unlock_page(page);
581 return 0;
582 }
583 return extent_write_full_page(tree, page, btree_get_extent, wbc);
584}
585
586static int btree_writepages(struct address_space *mapping,
587 struct writeback_control *wbc)
588{
589 struct extent_io_tree *tree;
590 tree = &BTRFS_I(mapping->host)->io_tree;
591 if (wbc->sync_mode == WB_SYNC_NONE) {
592 u64 num_dirty;
593 u64 start = 0;
594 unsigned long thresh = 32 * 1024 * 1024;
595
596 if (wbc->for_kupdate)
597 return 0;
598
599 num_dirty = count_range_bits(tree, &start, (u64)-1,
600 thresh, EXTENT_DIRTY);
601 if (num_dirty < thresh) {
602 return 0;
603 }
604 }
605 return extent_writepages(tree, mapping, btree_get_extent, wbc);
606}
607
608int btree_readpage(struct file *file, struct page *page)
609{
610 struct extent_io_tree *tree;
611 tree = &BTRFS_I(page->mapping->host)->io_tree;
612 return extent_read_full_page(tree, page, btree_get_extent);
613}
614
615static int btree_releasepage(struct page *page, gfp_t gfp_flags)
616{
617 struct extent_io_tree *tree;
618 struct extent_map_tree *map;
619 int ret;
620
621 if (PageWriteback(page) || PageDirty(page))
622 return 0;
623
624 tree = &BTRFS_I(page->mapping->host)->io_tree;
625 map = &BTRFS_I(page->mapping->host)->extent_tree;
626
627 ret = try_release_extent_state(map, tree, page, gfp_flags);
628 if (!ret) {
629 return 0;
630 }
631
632 ret = try_release_extent_buffer(tree, page);
633 if (ret == 1) {
634 ClearPagePrivate(page);
635 set_page_private(page, 0);
636 page_cache_release(page);
637 }
638
639 return ret;
640}
641
642static void btree_invalidatepage(struct page *page, unsigned long offset)
643{
644 struct extent_io_tree *tree;
645 tree = &BTRFS_I(page->mapping->host)->io_tree;
646 extent_invalidatepage(tree, page, offset);
647 btree_releasepage(page, GFP_NOFS);
648 if (PagePrivate(page)) {
649 printk("warning page private not zero on page %Lu\n",
650 page_offset(page));
651 ClearPagePrivate(page);
652 set_page_private(page, 0);
653 page_cache_release(page);
654 }
655}
656
657#if 0
658static int btree_writepage(struct page *page, struct writeback_control *wbc)
659{
660 struct buffer_head *bh;
661 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
662 struct buffer_head *head;
663 if (!page_has_buffers(page)) {
664 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
665 (1 << BH_Dirty)|(1 << BH_Uptodate));
666 }
667 head = page_buffers(page);
668 bh = head;
669 do {
670 if (buffer_dirty(bh))
671 csum_tree_block(root, bh, 0);
672 bh = bh->b_this_page;
673 } while (bh != head);
674 return block_write_full_page(page, btree_get_block, wbc);
675}
676#endif
677
678static struct address_space_operations btree_aops = {
679 .readpage = btree_readpage,
680 .writepage = btree_writepage,
681 .writepages = btree_writepages,
682 .releasepage = btree_releasepage,
683 .invalidatepage = btree_invalidatepage,
684 .sync_page = block_sync_page,
685};
686
687int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
688 u64 parent_transid)
689{
690 struct extent_buffer *buf = NULL;
691 struct inode *btree_inode = root->fs_info->btree_inode;
692 int ret = 0;
693
694 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
695 if (!buf)
696 return 0;
697 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
698 buf, 0, 0, btree_get_extent, 0);
699 free_extent_buffer(buf);
700 return ret;
701}
702
703struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
704 u64 bytenr, u32 blocksize)
705{
706 struct inode *btree_inode = root->fs_info->btree_inode;
707 struct extent_buffer *eb;
708 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
709 bytenr, blocksize, GFP_NOFS);
710 return eb;
711}
712
713struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
714 u64 bytenr, u32 blocksize)
715{
716 struct inode *btree_inode = root->fs_info->btree_inode;
717 struct extent_buffer *eb;
718
719 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
720 bytenr, blocksize, NULL, GFP_NOFS);
721 return eb;
722}
723
724
725int btrfs_write_tree_block(struct extent_buffer *buf)
726{
727 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
728 buf->start + buf->len - 1, WB_SYNC_ALL);
729}
730
731int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
732{
733 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
734 buf->start, buf->start + buf->len - 1);
735}
736
737struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
738 u32 blocksize, u64 parent_transid)
739{
740 struct extent_buffer *buf = NULL;
741 struct inode *btree_inode = root->fs_info->btree_inode;
742 struct extent_io_tree *io_tree;
743 int ret;
744
745 io_tree = &BTRFS_I(btree_inode)->io_tree;
746
747 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
748 if (!buf)
749 return NULL;
750
751 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
752
753 if (ret == 0) {
754 buf->flags |= EXTENT_UPTODATE;
755 } else {
756 WARN_ON(1);
757 }
758 return buf;
759
760}
761
762int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
763 struct extent_buffer *buf)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 if (btrfs_header_generation(buf) ==
767 root->fs_info->running_transaction->transid) {
768 WARN_ON(!btrfs_tree_locked(buf));
769 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
770 buf);
771 }
772 return 0;
773}
774
775static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
776 u32 stripesize, struct btrfs_root *root,
777 struct btrfs_fs_info *fs_info,
778 u64 objectid)
779{
780 root->node = NULL;
781 root->inode = NULL;
782 root->commit_root = NULL;
783 root->ref_tree = NULL;
784 root->sectorsize = sectorsize;
785 root->nodesize = nodesize;
786 root->leafsize = leafsize;
787 root->stripesize = stripesize;
788 root->ref_cows = 0;
789 root->track_dirty = 0;
790
791 root->fs_info = fs_info;
792 root->objectid = objectid;
793 root->last_trans = 0;
794 root->highest_inode = 0;
795 root->last_inode_alloc = 0;
796 root->name = NULL;
797 root->in_sysfs = 0;
798
799 INIT_LIST_HEAD(&root->dirty_list);
800 INIT_LIST_HEAD(&root->orphan_list);
801 INIT_LIST_HEAD(&root->dead_list);
802 spin_lock_init(&root->node_lock);
803 spin_lock_init(&root->list_lock);
804 mutex_init(&root->objectid_mutex);
805 mutex_init(&root->log_mutex);
806 extent_io_tree_init(&root->dirty_log_pages,
807 fs_info->btree_inode->i_mapping, GFP_NOFS);
808
809 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
810 root->ref_tree = &root->ref_tree_struct;
811
812 memset(&root->root_key, 0, sizeof(root->root_key));
813 memset(&root->root_item, 0, sizeof(root->root_item));
814 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
815 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
816 root->defrag_trans_start = fs_info->generation;
817 init_completion(&root->kobj_unregister);
818 root->defrag_running = 0;
819 root->defrag_level = 0;
820 root->root_key.objectid = objectid;
821 return 0;
822}
823
824static int find_and_setup_root(struct btrfs_root *tree_root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid,
827 struct btrfs_root *root)
828{
829 int ret;
830 u32 blocksize;
831
832 __setup_root(tree_root->nodesize, tree_root->leafsize,
833 tree_root->sectorsize, tree_root->stripesize,
834 root, fs_info, objectid);
835 ret = btrfs_find_last_root(tree_root, objectid,
836 &root->root_item, &root->root_key);
837 BUG_ON(ret);
838
839 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
840 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
841 blocksize, 0);
842 BUG_ON(!root->node);
843 return 0;
844}
845
846int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
847 struct btrfs_fs_info *fs_info)
848{
849 struct extent_buffer *eb;
850 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
851 u64 start = 0;
852 u64 end = 0;
853 int ret;
854
855 if (!log_root_tree)
856 return 0;
857
858 while (1) {
859 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
860 0, &start, &end, EXTENT_DIRTY);
861 if (ret)
862 break;
863
864 clear_extent_dirty(&log_root_tree->dirty_log_pages,
865 start, end, GFP_NOFS);
866 }
867 eb = fs_info->log_root_tree->node;
868
869 WARN_ON(btrfs_header_level(eb) != 0);
870 WARN_ON(btrfs_header_nritems(eb) != 0);
871
872 ret = btrfs_free_reserved_extent(fs_info->tree_root,
873 eb->start, eb->len);
874 BUG_ON(ret);
875
876 free_extent_buffer(eb);
877 kfree(fs_info->log_root_tree);
878 fs_info->log_root_tree = NULL;
879 return 0;
880}
881
882int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
883 struct btrfs_fs_info *fs_info)
884{
885 struct btrfs_root *root;
886 struct btrfs_root *tree_root = fs_info->tree_root;
887
888 root = kzalloc(sizeof(*root), GFP_NOFS);
889 if (!root)
890 return -ENOMEM;
891
892 __setup_root(tree_root->nodesize, tree_root->leafsize,
893 tree_root->sectorsize, tree_root->stripesize,
894 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
895
896 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
897 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
898 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
899 root->ref_cows = 0;
900
901 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
902 0, BTRFS_TREE_LOG_OBJECTID,
903 trans->transid, 0, 0, 0);
904
905 btrfs_set_header_nritems(root->node, 0);
906 btrfs_set_header_level(root->node, 0);
907 btrfs_set_header_bytenr(root->node, root->node->start);
908 btrfs_set_header_generation(root->node, trans->transid);
909 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
910
911 write_extent_buffer(root->node, root->fs_info->fsid,
912 (unsigned long)btrfs_header_fsid(root->node),
913 BTRFS_FSID_SIZE);
914 btrfs_mark_buffer_dirty(root->node);
915 btrfs_tree_unlock(root->node);
916 fs_info->log_root_tree = root;
917 return 0;
918}
919
920struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
921 struct btrfs_key *location)
922{
923 struct btrfs_root *root;
924 struct btrfs_fs_info *fs_info = tree_root->fs_info;
925 struct btrfs_path *path;
926 struct extent_buffer *l;
927 u64 highest_inode;
928 u32 blocksize;
929 int ret = 0;
930
931 root = kzalloc(sizeof(*root), GFP_NOFS);
932 if (!root)
933 return ERR_PTR(-ENOMEM);
934 if (location->offset == (u64)-1) {
935 ret = find_and_setup_root(tree_root, fs_info,
936 location->objectid, root);
937 if (ret) {
938 kfree(root);
939 return ERR_PTR(ret);
940 }
941 goto insert;
942 }
943
944 __setup_root(tree_root->nodesize, tree_root->leafsize,
945 tree_root->sectorsize, tree_root->stripesize,
946 root, fs_info, location->objectid);
947
948 path = btrfs_alloc_path();
949 BUG_ON(!path);
950 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
951 if (ret != 0) {
952 if (ret > 0)
953 ret = -ENOENT;
954 goto out;
955 }
956 l = path->nodes[0];
957 read_extent_buffer(l, &root->root_item,
958 btrfs_item_ptr_offset(l, path->slots[0]),
959 sizeof(root->root_item));
960 memcpy(&root->root_key, location, sizeof(*location));
961 ret = 0;
962out:
963 btrfs_release_path(root, path);
964 btrfs_free_path(path);
965 if (ret) {
966 kfree(root);
967 return ERR_PTR(ret);
968 }
969 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
970 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
971 blocksize, 0);
972 BUG_ON(!root->node);
973insert:
974 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
975 root->ref_cows = 1;
976 ret = btrfs_find_highest_inode(root, &highest_inode);
977 if (ret == 0) {
978 root->highest_inode = highest_inode;
979 root->last_inode_alloc = highest_inode;
980 }
981 }
982 return root;
983}
984
985struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
986 u64 root_objectid)
987{
988 struct btrfs_root *root;
989
990 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
991 return fs_info->tree_root;
992 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
993 return fs_info->extent_root;
994
995 root = radix_tree_lookup(&fs_info->fs_roots_radix,
996 (unsigned long)root_objectid);
997 return root;
998}
999
1000struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1001 struct btrfs_key *location)
1002{
1003 struct btrfs_root *root;
1004 int ret;
1005
1006 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1007 return fs_info->tree_root;
1008 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1009 return fs_info->extent_root;
1010 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1011 return fs_info->chunk_root;
1012 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1013 return fs_info->dev_root;
1014
1015 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1016 (unsigned long)location->objectid);
1017 if (root)
1018 return root;
1019
1020 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1021 if (IS_ERR(root))
1022 return root;
1023 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1024 (unsigned long)root->root_key.objectid,
1025 root);
1026 if (ret) {
1027 free_extent_buffer(root->node);
1028 kfree(root);
1029 return ERR_PTR(ret);
1030 }
1031 ret = btrfs_find_dead_roots(fs_info->tree_root,
1032 root->root_key.objectid, root);
1033 BUG_ON(ret);
1034
1035 return root;
1036}
1037
1038struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1039 struct btrfs_key *location,
1040 const char *name, int namelen)
1041{
1042 struct btrfs_root *root;
1043 int ret;
1044
1045 root = btrfs_read_fs_root_no_name(fs_info, location);
1046 if (!root)
1047 return NULL;
1048
1049 if (root->in_sysfs)
1050 return root;
1051
1052 ret = btrfs_set_root_name(root, name, namelen);
1053 if (ret) {
1054 free_extent_buffer(root->node);
1055 kfree(root);
1056 return ERR_PTR(ret);
1057 }
1058
1059 ret = btrfs_sysfs_add_root(root);
1060 if (ret) {
1061 free_extent_buffer(root->node);
1062 kfree(root->name);
1063 kfree(root);
1064 return ERR_PTR(ret);
1065 }
1066 root->in_sysfs = 1;
1067 return root;
1068}
1069#if 0
1070static int add_hasher(struct btrfs_fs_info *info, char *type) {
1071 struct btrfs_hasher *hasher;
1072
1073 hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
1074 if (!hasher)
1075 return -ENOMEM;
1076 hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
1077 if (!hasher->hash_tfm) {
1078 kfree(hasher);
1079 return -EINVAL;
1080 }
1081 spin_lock(&info->hash_lock);
1082 list_add(&hasher->list, &info->hashers);
1083 spin_unlock(&info->hash_lock);
1084 return 0;
1085}
1086#endif
1087
1088static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1089{
1090 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1091 int ret = 0;
1092 struct list_head *cur;
1093 struct btrfs_device *device;
1094 struct backing_dev_info *bdi;
1095
1096 if ((bdi_bits & (1 << BDI_write_congested)) &&
1097 btrfs_congested_async(info, 0))
1098 return 1;
1099
1100 list_for_each(cur, &info->fs_devices->devices) {
1101 device = list_entry(cur, struct btrfs_device, dev_list);
1102 if (!device->bdev)
1103 continue;
1104 bdi = blk_get_backing_dev_info(device->bdev);
1105 if (bdi && bdi_congested(bdi, bdi_bits)) {
1106 ret = 1;
1107 break;
1108 }
1109 }
1110 return ret;
1111}
1112
1113/*
1114 * this unplugs every device on the box, and it is only used when page
1115 * is null
1116 */
1117static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1118{
1119 struct list_head *cur;
1120 struct btrfs_device *device;
1121 struct btrfs_fs_info *info;
1122
1123 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1124 list_for_each(cur, &info->fs_devices->devices) {
1125 device = list_entry(cur, struct btrfs_device, dev_list);
1126 bdi = blk_get_backing_dev_info(device->bdev);
1127 if (bdi->unplug_io_fn) {
1128 bdi->unplug_io_fn(bdi, page);
1129 }
1130 }
1131}
1132
1133void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1134{
1135 struct inode *inode;
1136 struct extent_map_tree *em_tree;
1137 struct extent_map *em;
1138 struct address_space *mapping;
1139 u64 offset;
1140
1141 /* the generic O_DIRECT read code does this */
1142 if (!page) {
1143 __unplug_io_fn(bdi, page);
1144 return;
1145 }
1146
1147 /*
1148 * page->mapping may change at any time. Get a consistent copy
1149 * and use that for everything below
1150 */
1151 smp_mb();
1152 mapping = page->mapping;
1153 if (!mapping)
1154 return;
1155
1156 inode = mapping->host;
1157 offset = page_offset(page);
1158
1159 em_tree = &BTRFS_I(inode)->extent_tree;
1160 spin_lock(&em_tree->lock);
1161 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1162 spin_unlock(&em_tree->lock);
1163 if (!em) {
1164 __unplug_io_fn(bdi, page);
1165 return;
1166 }
1167
1168 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1169 free_extent_map(em);
1170 __unplug_io_fn(bdi, page);
1171 return;
1172 }
1173 offset = offset - em->start;
1174 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1175 em->block_start + offset, page);
1176 free_extent_map(em);
1177}
1178
1179static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1180{
1181 bdi_init(bdi);
1182 bdi->ra_pages = default_backing_dev_info.ra_pages;
1183 bdi->state = 0;
1184 bdi->capabilities = default_backing_dev_info.capabilities;
1185 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1186 bdi->unplug_io_data = info;
1187 bdi->congested_fn = btrfs_congested_fn;
1188 bdi->congested_data = info;
1189 return 0;
1190}
1191
1192static int bio_ready_for_csum(struct bio *bio)
1193{
1194 u64 length = 0;
1195 u64 buf_len = 0;
1196 u64 start = 0;
1197 struct page *page;
1198 struct extent_io_tree *io_tree = NULL;
1199 struct btrfs_fs_info *info = NULL;
1200 struct bio_vec *bvec;
1201 int i;
1202 int ret;
1203
1204 bio_for_each_segment(bvec, bio, i) {
1205 page = bvec->bv_page;
1206 if (page->private == EXTENT_PAGE_PRIVATE) {
1207 length += bvec->bv_len;
1208 continue;
1209 }
1210 if (!page->private) {
1211 length += bvec->bv_len;
1212 continue;
1213 }
1214 length = bvec->bv_len;
1215 buf_len = page->private >> 2;
1216 start = page_offset(page) + bvec->bv_offset;
1217 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1218 info = BTRFS_I(page->mapping->host)->root->fs_info;
1219 }
1220 /* are we fully contained in this bio? */
1221 if (buf_len <= length)
1222 return 1;
1223
1224 ret = extent_range_uptodate(io_tree, start + length,
1225 start + buf_len - 1);
1226 if (ret == 1)
1227 return ret;
1228 return ret;
1229}
1230
1231/*
1232 * called by the kthread helper functions to finally call the bio end_io
1233 * functions. This is where read checksum verification actually happens
1234 */
1235static void end_workqueue_fn(struct btrfs_work *work)
1236{
1237 struct bio *bio;
1238 struct end_io_wq *end_io_wq;
1239 struct btrfs_fs_info *fs_info;
1240 int error;
1241
1242 end_io_wq = container_of(work, struct end_io_wq, work);
1243 bio = end_io_wq->bio;
1244 fs_info = end_io_wq->info;
1245
1246 /* metadata bios are special because the whole tree block must
1247 * be checksummed at once. This makes sure the entire block is in
1248 * RAM and up to date before trying to verify things. For
1249 * blocksize <= pagesize, it is basically a no-op.
1250 */
1251 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
1252 btrfs_queue_worker(&fs_info->endio_workers,
1253 &end_io_wq->work);
1254 return;
1255 }
1256 error = end_io_wq->error;
1257 bio->bi_private = end_io_wq->private;
1258 bio->bi_end_io = end_io_wq->end_io;
1259 kfree(end_io_wq);
1260 bio_endio(bio, error);
1261}
1262
1263static int cleaner_kthread(void *arg)
1264{
1265 struct btrfs_root *root = arg;
1266
1267 do {
1268 smp_mb();
1269 if (root->fs_info->closing)
1270 break;
1271
1272 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1273 mutex_lock(&root->fs_info->cleaner_mutex);
1274 btrfs_clean_old_snapshots(root);
1275 mutex_unlock(&root->fs_info->cleaner_mutex);
1276
1277 if (freezing(current)) {
1278 refrigerator();
1279 } else {
1280 smp_mb();
1281 if (root->fs_info->closing)
1282 break;
1283 set_current_state(TASK_INTERRUPTIBLE);
1284 schedule();
1285 __set_current_state(TASK_RUNNING);
1286 }
1287 } while (!kthread_should_stop());
1288 return 0;
1289}
1290
1291static int transaction_kthread(void *arg)
1292{
1293 struct btrfs_root *root = arg;
1294 struct btrfs_trans_handle *trans;
1295 struct btrfs_transaction *cur;
1296 unsigned long now;
1297 unsigned long delay;
1298 int ret;
1299
1300 do {
1301 smp_mb();
1302 if (root->fs_info->closing)
1303 break;
1304
1305 delay = HZ * 30;
1306 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1307 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1308
1309 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1310 printk("btrfs: total reference cache size %Lu\n",
1311 root->fs_info->total_ref_cache_size);
1312 }
1313
1314 mutex_lock(&root->fs_info->trans_mutex);
1315 cur = root->fs_info->running_transaction;
1316 if (!cur) {
1317 mutex_unlock(&root->fs_info->trans_mutex);
1318 goto sleep;
1319 }
1320
1321 now = get_seconds();
1322 if (now < cur->start_time || now - cur->start_time < 30) {
1323 mutex_unlock(&root->fs_info->trans_mutex);
1324 delay = HZ * 5;
1325 goto sleep;
1326 }
1327 mutex_unlock(&root->fs_info->trans_mutex);
1328 trans = btrfs_start_transaction(root, 1);
1329 ret = btrfs_commit_transaction(trans, root);
1330sleep:
1331 wake_up_process(root->fs_info->cleaner_kthread);
1332 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1333
1334 if (freezing(current)) {
1335 refrigerator();
1336 } else {
1337 if (root->fs_info->closing)
1338 break;
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 schedule_timeout(delay);
1341 __set_current_state(TASK_RUNNING);
1342 }
1343 } while (!kthread_should_stop());
1344 return 0;
1345}
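
/*
 * Illustrative sketch: the commit cadence implemented above.  A running
 * transaction younger than 30 seconds gets a short 5 second re-check;
 * otherwise the kthread commits now and returns to its long 30 second
 * sleep.  Jiffies are modeled as plain seconds here.
 */
static unsigned int pick_commit_delay(unsigned long now,
                                      unsigned long trans_start_time,
                                      int *commit_now)
{
        if (now < trans_start_time || now - trans_start_time < 30) {
                *commit_now = 0;
                return 5;       /* poll the young transaction again soon */
        }
        *commit_now = 1;
        return 30;              /* full interval after committing */
}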
1346
1347struct btrfs_root *open_ctree(struct super_block *sb,
1348 struct btrfs_fs_devices *fs_devices,
1349 char *options)
1350{
1351 u32 sectorsize;
1352 u32 nodesize;
1353 u32 leafsize;
1354 u32 blocksize;
1355 u32 stripesize;
1356 struct buffer_head *bh;
1357 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1358 GFP_NOFS);
1359 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1360 GFP_NOFS);
1361 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1362 GFP_NOFS);
1363 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1364 GFP_NOFS);
1365 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1366 GFP_NOFS);
1367 struct btrfs_root *log_tree_root;
1368
1369 int ret;
1370 int err = -EINVAL;
1371
1372 struct btrfs_super_block *disk_super;
1373
1374 if (!extent_root || !tree_root || !fs_info ||
1375 !chunk_root || !dev_root) {
1376 err = -ENOMEM;
1377 goto fail;
1378 }
1379 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1380 INIT_LIST_HEAD(&fs_info->trans_list);
1381 INIT_LIST_HEAD(&fs_info->dead_roots);
1382 INIT_LIST_HEAD(&fs_info->hashers);
1383 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1384 spin_lock_init(&fs_info->hash_lock);
1385 spin_lock_init(&fs_info->delalloc_lock);
1386 spin_lock_init(&fs_info->new_trans_lock);
1387 spin_lock_init(&fs_info->ref_cache_lock);
1388
1389 init_completion(&fs_info->kobj_unregister);
1390 fs_info->tree_root = tree_root;
1391 fs_info->extent_root = extent_root;
1392 fs_info->chunk_root = chunk_root;
1393 fs_info->dev_root = dev_root;
1394 fs_info->fs_devices = fs_devices;
1395 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1396 INIT_LIST_HEAD(&fs_info->space_info);
1397 btrfs_mapping_init(&fs_info->mapping_tree);
1398 atomic_set(&fs_info->nr_async_submits, 0);
1399 atomic_set(&fs_info->async_submit_draining, 0);
1400 atomic_set(&fs_info->nr_async_bios, 0);
1401 atomic_set(&fs_info->throttles, 0);
1402 atomic_set(&fs_info->throttle_gen, 0);
1403 fs_info->sb = sb;
1404 fs_info->max_extent = (u64)-1;
1405 fs_info->max_inline = 8192 * 1024;
1406 setup_bdi(fs_info, &fs_info->bdi);
1407 fs_info->btree_inode = new_inode(sb);
1408 fs_info->btree_inode->i_ino = 1;
1409 fs_info->btree_inode->i_nlink = 1;
1410 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
1411
1412 INIT_LIST_HEAD(&fs_info->ordered_extents);
1413 spin_lock_init(&fs_info->ordered_extent_lock);
1414
1415 sb->s_blocksize = 4096;
1416 sb->s_blocksize_bits = blksize_bits(4096);
1417
1418 /*
1419 * we set the i_size on the btree inode to the max possible offset.
1420 * the real end of the address space is determined by all of
1421 * the devices in the system
1422 */
1423 fs_info->btree_inode->i_size = OFFSET_MAX;
1424 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1425 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1426
1427 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1428 fs_info->btree_inode->i_mapping,
1429 GFP_NOFS);
1430 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1431 GFP_NOFS);
1432
1433 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1434
1435 spin_lock_init(&fs_info->block_group_cache_lock);
1436 fs_info->block_group_cache_tree.rb_node = NULL;
1437
1438 extent_io_tree_init(&fs_info->pinned_extents,
1439 fs_info->btree_inode->i_mapping, GFP_NOFS);
1440 extent_io_tree_init(&fs_info->pending_del,
1441 fs_info->btree_inode->i_mapping, GFP_NOFS);
1442 extent_io_tree_init(&fs_info->extent_ins,
1443 fs_info->btree_inode->i_mapping, GFP_NOFS);
1444 fs_info->do_barriers = 1;
1445
1446 extent_io_tree_init(&fs_info->reloc_mapping_tree,
1447 fs_info->btree_inode->i_mapping, GFP_NOFS);
1448 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1449 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1450 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1451
1452 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1453 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1454 sizeof(struct btrfs_key));
1455 insert_inode_hash(fs_info->btree_inode);
1456
1457 mutex_init(&fs_info->trans_mutex);
1458 mutex_init(&fs_info->tree_log_mutex);
1459 mutex_init(&fs_info->drop_mutex);
1460 mutex_init(&fs_info->alloc_mutex);
1461 mutex_init(&fs_info->chunk_mutex);
1462 mutex_init(&fs_info->transaction_kthread_mutex);
1463 mutex_init(&fs_info->cleaner_mutex);
1464 mutex_init(&fs_info->volume_mutex);
1465 mutex_init(&fs_info->tree_reloc_mutex);
1466 init_waitqueue_head(&fs_info->transaction_throttle);
1467 init_waitqueue_head(&fs_info->transaction_wait);
1468 init_waitqueue_head(&fs_info->async_submit_wait);
1469 init_waitqueue_head(&fs_info->tree_log_wait);
1470 atomic_set(&fs_info->tree_log_commit, 0);
1471 atomic_set(&fs_info->tree_log_writers, 0);
1472 fs_info->tree_log_transid = 0;
1473
1474#if 0
1475 ret = add_hasher(fs_info, "crc32c");
1476 if (ret) {
1477 printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
1478 err = -ENOMEM;
1479 goto fail_iput;
1480 }
1481#endif
1482 __setup_root(4096, 4096, 4096, 4096, tree_root,
1483 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1484
1485
1486 bh = __bread(fs_devices->latest_bdev,
1487 BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
1488 if (!bh)
1489 goto fail_iput;
1490
1491 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1492 brelse(bh);
1493
1494 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1495
1496 disk_super = &fs_info->super_copy;
1497 if (!btrfs_super_root(disk_super))
1498 goto fail_sb_buffer;
1499
1500 err = btrfs_parse_options(tree_root, options);
1501 if (err)
1502 goto fail_sb_buffer;
1503
1504 /*
1505 * we need to start all the end_io workers up front because the
1506	 * queue work function gets called at interrupt time, and so the
1507	 * pool cannot be grown dynamically at that point.
1508 */
1509 btrfs_init_workers(&fs_info->workers, "worker",
1510 fs_info->thread_pool_size);
1511 btrfs_init_workers(&fs_info->submit_workers, "submit",
1512 min_t(u64, fs_devices->num_devices,
1513 fs_info->thread_pool_size));
1514
1515 /* a higher idle thresh on the submit workers makes it much more
1516	 * likely that bios will be sent down in a sane order to the
1517 * devices
1518 */
1519 fs_info->submit_workers.idle_thresh = 64;
1520
1521 /* fs_info->workers is responsible for checksumming file data
1522 * blocks and metadata. Using a larger idle thresh allows each
1523 * worker thread to operate on things in roughly the order they
1524 * were sent by the writeback daemons, improving overall locality
1525 * of the IO going down the pipe.
1526 */
1527 fs_info->workers.idle_thresh = 128;
1528
1529 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1530 btrfs_init_workers(&fs_info->endio_workers, "endio",
1531 fs_info->thread_pool_size);
1532 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1533 fs_info->thread_pool_size);
1534
1535 /*
1536 * endios are largely parallel and should have a very
1537 * low idle thresh
1538 */
1539 fs_info->endio_workers.idle_thresh = 4;
1540 fs_info->endio_write_workers.idle_thresh = 64;
1541
1542 btrfs_start_workers(&fs_info->workers, 1);
1543 btrfs_start_workers(&fs_info->submit_workers, 1);
1544 btrfs_start_workers(&fs_info->fixup_workers, 1);
1545 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1546 btrfs_start_workers(&fs_info->endio_write_workers,
1547 fs_info->thread_pool_size);
1548
1549 err = -EINVAL;
1550 if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
1551 printk("Btrfs: wanted %llu devices, but found %llu\n",
1552 (unsigned long long)btrfs_super_num_devices(disk_super),
1553 (unsigned long long)fs_devices->open_devices);
1554 if (btrfs_test_opt(tree_root, DEGRADED))
1555 printk("continuing in degraded mode\n");
1556 else {
1557 goto fail_sb_buffer;
1558 }
1559 }
1560
1561 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1562
1563 nodesize = btrfs_super_nodesize(disk_super);
1564 leafsize = btrfs_super_leafsize(disk_super);
1565 sectorsize = btrfs_super_sectorsize(disk_super);
1566 stripesize = btrfs_super_stripesize(disk_super);
1567 tree_root->nodesize = nodesize;
1568 tree_root->leafsize = leafsize;
1569 tree_root->sectorsize = sectorsize;
1570 tree_root->stripesize = stripesize;
1571
1572 sb->s_blocksize = sectorsize;
1573 sb->s_blocksize_bits = blksize_bits(sectorsize);
1574
1575 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1576 sizeof(disk_super->magic))) {
1577 printk("btrfs: valid FS not found on %s\n", sb->s_id);
1578 goto fail_sb_buffer;
1579 }
1580
1581 mutex_lock(&fs_info->chunk_mutex);
1582 ret = btrfs_read_sys_array(tree_root);
1583 mutex_unlock(&fs_info->chunk_mutex);
1584 if (ret) {
1585 printk("btrfs: failed to read the system array on %s\n",
1586 sb->s_id);
1587 goto fail_sys_array;
1588 }
1589
1590 blocksize = btrfs_level_size(tree_root,
1591 btrfs_super_chunk_root_level(disk_super));
1592
1593 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1594 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1595
1596 chunk_root->node = read_tree_block(chunk_root,
1597 btrfs_super_chunk_root(disk_super),
1598 blocksize, 0);
1599 BUG_ON(!chunk_root->node);
1600
1601 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1602 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1603 BTRFS_UUID_SIZE);
1604
1605 mutex_lock(&fs_info->chunk_mutex);
1606 ret = btrfs_read_chunk_tree(chunk_root);
1607 mutex_unlock(&fs_info->chunk_mutex);
1608 BUG_ON(ret);
1609
1610 btrfs_close_extra_devices(fs_devices);
1611
1612 blocksize = btrfs_level_size(tree_root,
1613 btrfs_super_root_level(disk_super));
1614
1615
1616 tree_root->node = read_tree_block(tree_root,
1617 btrfs_super_root(disk_super),
1618 blocksize, 0);
1619 if (!tree_root->node)
1620 goto fail_sb_buffer;
1621
1622
1623 ret = find_and_setup_root(tree_root, fs_info,
1624 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1625 if (ret)
1626 goto fail_tree_root;
1627 extent_root->track_dirty = 1;
1628
1629 ret = find_and_setup_root(tree_root, fs_info,
1630 BTRFS_DEV_TREE_OBJECTID, dev_root);
1631 dev_root->track_dirty = 1;
1632
1633 if (ret)
1634 goto fail_extent_root;
1635
1636 btrfs_read_block_groups(extent_root);
1637
1638 fs_info->generation = btrfs_super_generation(disk_super) + 1;
1639 fs_info->data_alloc_profile = (u64)-1;
1640 fs_info->metadata_alloc_profile = (u64)-1;
1641 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1642 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1643 "btrfs-cleaner");
1644 if (!fs_info->cleaner_kthread)
1645 goto fail_extent_root;
1646
1647 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1648 tree_root,
1649 "btrfs-transaction");
1650 if (!fs_info->transaction_kthread)
1651 goto fail_cleaner;
1652
1653 if (btrfs_super_log_root(disk_super) != 0) {
1654 u32 blocksize;
1655 u64 bytenr = btrfs_super_log_root(disk_super);
1656
1657 blocksize =
1658 btrfs_level_size(tree_root,
1659 btrfs_super_log_root_level(disk_super));
1660
1661 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1662 GFP_NOFS);
1663
1664 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1665 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1666
1667 log_tree_root->node = read_tree_block(tree_root, bytenr,
1668 blocksize, 0);
1669 ret = btrfs_recover_log_trees(log_tree_root);
1670 BUG_ON(ret);
1671 }
1672
1673 ret = btrfs_cleanup_reloc_trees(tree_root);
1674 BUG_ON(ret);
1675
1676 fs_info->last_trans_committed = btrfs_super_generation(disk_super);
1677 return tree_root;
1678
1679fail_cleaner:
1680 kthread_stop(fs_info->cleaner_kthread);
1681fail_extent_root:
1682 free_extent_buffer(extent_root->node);
1683fail_tree_root:
1684 free_extent_buffer(tree_root->node);
1685fail_sys_array:
1686fail_sb_buffer:
1687 btrfs_stop_workers(&fs_info->fixup_workers);
1688 btrfs_stop_workers(&fs_info->workers);
1689 btrfs_stop_workers(&fs_info->endio_workers);
1690 btrfs_stop_workers(&fs_info->endio_write_workers);
1691 btrfs_stop_workers(&fs_info->submit_workers);
1692fail_iput:
1693 iput(fs_info->btree_inode);
1694fail:
1695 btrfs_close_devices(fs_info->fs_devices);
1696 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1697
1698 kfree(extent_root);
1699 kfree(tree_root);
1700 bdi_destroy(&fs_info->bdi);
1701 kfree(fs_info);
1702 kfree(chunk_root);
1703 kfree(dev_root);
1704 return ERR_PTR(err);
1705}
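The tail of open_ctree() is a classic kernel goto ladder: each fail_* label tears down only the state established before the failing step, then falls through to the earlier labels. A minimal, self-contained sketch of the same idiom, with three hypothetical allocations standing in for the roots, worker pools and kthreads:

	/* Cascading-unwind sketch of the open_ctree() error paths; the three
	 * resources here are hypothetical stand-ins. */
	#include <stdlib.h>

	static int setup(void)
	{
		void *roots, *workers, *kthreads;

		roots = malloc(64);
		if (!roots)
			goto fail;
		workers = malloc(64);
		if (!workers)
			goto fail_roots;
		kthreads = malloc(64);
		if (!kthreads)
			goto fail_workers;
		free(kthreads);		/* a real setup would hand these off */
		free(workers);
		free(roots);
		return 0;

	fail_workers:			/* labels fall through, so teardown */
		free(workers);		/* runs in reverse order of setup */
	fail_roots:
		free(roots);
	fail:
		return -1;
	}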
1706
1707static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1708{
1709 char b[BDEVNAME_SIZE];
1710
1711 if (uptodate) {
1712 set_buffer_uptodate(bh);
1713 } else {
1714 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1715 printk(KERN_WARNING "lost page write due to "
1716 "I/O error on %s\n",
1717 bdevname(bh->b_bdev, b));
1718 }
1719		/* note, we don't set_buffer_write_io_error because we have
1720 * our own ways of dealing with the IO errors
1721 */
1722 clear_buffer_uptodate(bh);
1723 }
1724 unlock_buffer(bh);
1725 put_bh(bh);
1726}
1727
1728int write_all_supers(struct btrfs_root *root)
1729{
1730 struct list_head *cur;
1731 struct list_head *head = &root->fs_info->fs_devices->devices;
1732 struct btrfs_device *dev;
1733 struct btrfs_super_block *sb;
1734 struct btrfs_dev_item *dev_item;
1735 struct buffer_head *bh;
1736 int ret;
1737 int do_barriers;
1738 int max_errors;
1739 int total_errors = 0;
1740 u32 crc;
1741 u64 flags;
1742
1743 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1744 do_barriers = !btrfs_test_opt(root, NOBARRIER);
1745
1746 sb = &root->fs_info->super_for_commit;
1747 dev_item = &sb->dev_item;
1748 list_for_each(cur, head) {
1749 dev = list_entry(cur, struct btrfs_device, dev_list);
1750 if (!dev->bdev) {
1751 total_errors++;
1752 continue;
1753 }
1754 if (!dev->in_fs_metadata)
1755 continue;
1756
1757 btrfs_set_stack_device_type(dev_item, dev->type);
1758 btrfs_set_stack_device_id(dev_item, dev->devid);
1759 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1760 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1761 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1762 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1763 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1764 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1765 flags = btrfs_super_flags(sb);
1766 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
1767
1768
1769 crc = ~(u32)0;
1770 crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
1771 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
1772 btrfs_csum_final(crc, sb->csum);
1773
1774 bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
1775 BTRFS_SUPER_INFO_SIZE);
1776
1777 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1778 dev->pending_io = bh;
1779
1780 get_bh(bh);
1781 set_buffer_uptodate(bh);
1782 lock_buffer(bh);
1783 bh->b_end_io = btrfs_end_buffer_write_sync;
1784
1785 if (do_barriers && dev->barriers) {
1786 ret = submit_bh(WRITE_BARRIER, bh);
1787 if (ret == -EOPNOTSUPP) {
1788 printk("btrfs: disabling barriers on dev %s\n",
1789 dev->name);
1790 set_buffer_uptodate(bh);
1791 dev->barriers = 0;
1792 get_bh(bh);
1793 lock_buffer(bh);
1794 ret = submit_bh(WRITE, bh);
1795 }
1796 } else {
1797 ret = submit_bh(WRITE, bh);
1798 }
1799 if (ret)
1800 total_errors++;
1801 }
1802 if (total_errors > max_errors) {
1803 printk("btrfs: %d errors while writing supers\n", total_errors);
1804 BUG();
1805 }
1806 total_errors = 0;
1807
1808 list_for_each(cur, head) {
1809 dev = list_entry(cur, struct btrfs_device, dev_list);
1810 if (!dev->bdev)
1811 continue;
1812 if (!dev->in_fs_metadata)
1813 continue;
1814
1815 BUG_ON(!dev->pending_io);
1816 bh = dev->pending_io;
1817 wait_on_buffer(bh);
1818 if (!buffer_uptodate(dev->pending_io)) {
1819 if (do_barriers && dev->barriers) {
1820 printk("btrfs: disabling barriers on dev %s\n",
1821 dev->name);
1822 set_buffer_uptodate(bh);
1823 get_bh(bh);
1824 lock_buffer(bh);
1825 dev->barriers = 0;
1826 ret = submit_bh(WRITE, bh);
1827 BUG_ON(ret);
1828 wait_on_buffer(bh);
1829 if (!buffer_uptodate(bh))
1830 total_errors++;
1831 } else {
1832 total_errors++;
1833 }
1834
1835 }
1836 dev->pending_io = NULL;
1837 brelse(bh);
1838 }
1839 if (total_errors > max_errors) {
1840 printk("btrfs: %d errors while writing supers\n", total_errors);
1841 BUG();
1842 }
1843 return 0;
1844}
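write_all_supers() is deliberately two-pass: the first list walk gets a write in flight on every device before the second walk blocks on completions, so the supers go out in parallel rather than serially. The tolerance is max_errors = num_devices - 1, i.e. the commit only fails when no copy made it to disk. A compact sketch of that submit-all/wait-all shape, with hypothetical submit() and wait_done() helpers in place of submit_bh() and wait_on_buffer():

	/* Submit-all-then-wait-all pattern from write_all_supers(); fake_dev,
	 * submit() and wait_done() are illustrative stand-ins. */
	struct fake_dev { int io_error; };

	static void submit(struct fake_dev *d) { (void)d; /* queue async write */ }
	static int wait_done(struct fake_dev *d) { return d->io_error; }

	static int write_all(struct fake_dev *devs, int n)
	{
		int i, total_errors = 0;
		int max_errors = n - 1;	/* tolerate all but one copy failing */

		for (i = 0; i < n; i++)	/* pass 1: get everything in flight */
			submit(&devs[i]);
		for (i = 0; i < n; i++)	/* pass 2: reap completions */
			if (wait_done(&devs[i]))
				total_errors++;
		return total_errors > max_errors ? -1 : 0;
	}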
1845
1846int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
1847 *root)
1848{
1849 int ret;
1850
1851 ret = write_all_supers(root);
1852 return ret;
1853}
1854
1855int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
1856{
1857 radix_tree_delete(&fs_info->fs_roots_radix,
1858 (unsigned long)root->root_key.objectid);
1859 if (root->in_sysfs)
1860 btrfs_sysfs_del_root(root);
1861 if (root->inode)
1862 iput(root->inode);
1863 if (root->node)
1864 free_extent_buffer(root->node);
1865 if (root->commit_root)
1866 free_extent_buffer(root->commit_root);
1867 if (root->name)
1868 kfree(root->name);
1869 kfree(root);
1870 return 0;
1871}
1872
1873static int del_fs_roots(struct btrfs_fs_info *fs_info)
1874{
1875 int ret;
1876 struct btrfs_root *gang[8];
1877 int i;
1878
1879 while(1) {
1880 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1881 (void **)gang, 0,
1882 ARRAY_SIZE(gang));
1883 if (!ret)
1884 break;
1885 for (i = 0; i < ret; i++)
1886 btrfs_free_fs_root(fs_info, gang[i]);
1887 }
1888 return 0;
1889}
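del_fs_roots() drains the radix tree in fixed-size gangs of eight rather than one lookup per root, which keeps the loop linear with far fewer tree walks. The same batching shape abstracted over any "fetch up to N" lookup; fetch() and destroy() are hypothetical stand-ins for radix_tree_gang_lookup() and btrfs_free_fs_root():

	/* Drain-in-gangs loop of del_fs_roots(), abstracted. */
	#define GANG_SIZE 8

	static int count = 19;		/* pretend 19 roots are registered */

	static int fetch(void **out, int max)
	{
		int n = count < max ? count : max;

		(void)out;		/* a real lookup fills out[0..n-1] */
		count -= n;
		return n;		/* 0 means the tree is empty */
	}

	static void destroy(void *item) { (void)item; }

	static void drain_all(void)
	{
		void *gang[GANG_SIZE] = { 0 };
		int i, n;

		while ((n = fetch(gang, GANG_SIZE)) != 0)
			for (i = 0; i < n; i++)
				destroy(gang[i]);
	}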
1890
1891int close_ctree(struct btrfs_root *root)
1892{
1893 int ret;
1894 struct btrfs_trans_handle *trans;
1895 struct btrfs_fs_info *fs_info = root->fs_info;
1896
1897 fs_info->closing = 1;
1898 smp_mb();
1899
1900 kthread_stop(root->fs_info->transaction_kthread);
1901 kthread_stop(root->fs_info->cleaner_kthread);
1902
1903 btrfs_clean_old_snapshots(root);
1904 trans = btrfs_start_transaction(root, 1);
1905 ret = btrfs_commit_transaction(trans, root);
1906 /* run commit again to drop the original snapshot */
1907 trans = btrfs_start_transaction(root, 1);
1908 btrfs_commit_transaction(trans, root);
1909 ret = btrfs_write_and_wait_transaction(NULL, root);
1910 BUG_ON(ret);
1911
1912 write_ctree_super(NULL, root);
1913
1914 if (fs_info->delalloc_bytes) {
1915 printk("btrfs: at unmount delalloc count %Lu\n",
1916 fs_info->delalloc_bytes);
1917 }
1918 if (fs_info->total_ref_cache_size) {
1919 printk("btrfs: at umount reference cache size %Lu\n",
1920 fs_info->total_ref_cache_size);
1921 }
1922
1923 if (fs_info->extent_root->node)
1924 free_extent_buffer(fs_info->extent_root->node);
1925
1926 if (fs_info->tree_root->node)
1927 free_extent_buffer(fs_info->tree_root->node);
1928
1929	if (root->fs_info->chunk_root->node)
1930 free_extent_buffer(root->fs_info->chunk_root->node);
1931
1932	if (root->fs_info->dev_root->node)
1933 free_extent_buffer(root->fs_info->dev_root->node);
1934
1935 btrfs_free_block_groups(root->fs_info);
1936 fs_info->closing = 2;
1937 del_fs_roots(fs_info);
1938
1939 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1940
1941 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
1942
1943 btrfs_stop_workers(&fs_info->fixup_workers);
1944 btrfs_stop_workers(&fs_info->workers);
1945 btrfs_stop_workers(&fs_info->endio_workers);
1946 btrfs_stop_workers(&fs_info->endio_write_workers);
1947 btrfs_stop_workers(&fs_info->submit_workers);
1948
1949 iput(fs_info->btree_inode);
1950#if 0
1951 while(!list_empty(&fs_info->hashers)) {
1952 struct btrfs_hasher *hasher;
1953 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
1954 hashers);
1955 list_del(&hasher->hashers);
1956 crypto_free_hash(&fs_info->hash_tfm);
1957 kfree(hasher);
1958 }
1959#endif
1960 btrfs_close_devices(fs_info->fs_devices);
1961 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1962
1963 bdi_destroy(&fs_info->bdi);
1964
1965 kfree(fs_info->extent_root);
1966 kfree(fs_info->tree_root);
1967 kfree(fs_info->chunk_root);
1968 kfree(fs_info->dev_root);
1969 return 0;
1970}
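	/*
	 * close_ctree() is roughly open_ctree() run in reverse: stop the
	 * background kthreads so nothing new gets queued, commit twice (the
	 * second commit drops the snapshot created by the first), write the
	 * supers, free the cached tree roots, stop the worker pools, and only
	 * then release the btree inode, the devices, the bdi and fs_info.
	 */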
1971
1972int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
1973{
1974 int ret;
1975 struct inode *btree_inode = buf->first_page->mapping->host;
1976
1977 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
1978 if (!ret)
1979 return ret;
1980
1981 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
1982 parent_transid);
1983 return !ret;
1984}
1985
1986int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
1987{
1988 struct inode *btree_inode = buf->first_page->mapping->host;
1989 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
1990 buf);
1991}
1992
1993void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
1994{
1995 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
1996 u64 transid = btrfs_header_generation(buf);
1997 struct inode *btree_inode = root->fs_info->btree_inode;
1998
1999 WARN_ON(!btrfs_tree_locked(buf));
2000 if (transid != root->fs_info->generation) {
2001 printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
2002 (unsigned long long)buf->start,
2003 transid, root->fs_info->generation);
2004 WARN_ON(1);
2005 }
2006 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2007}
2008
2009void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2010{
2011 /*
2012 * looks as though older kernels can get into trouble with
2013 * this code, they end up stuck in balance_dirty_pages forever
2014 */
2015 struct extent_io_tree *tree;
2016 u64 num_dirty;
2017 u64 start = 0;
2018 unsigned long thresh = 96 * 1024 * 1024;
2019 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2020
2021 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2022 return;
2023
2024 num_dirty = count_range_bits(tree, &start, (u64)-1,
2025 thresh, EXTENT_DIRTY);
2026 if (num_dirty > thresh) {
2027 balance_dirty_pages_ratelimited_nr(
2028 root->fs_info->btree_inode->i_mapping, 1);
2029 }
2030 return;
2031}
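	/*
	 * Concretely: with thresh = 96MB, a writer that has dirtied ~100MB of
	 * btree pages is throttled via balance_dirty_pages_ratelimited_nr(),
	 * while pdflush itself and PF_MEMALLOC (reclaim) contexts are exempt,
	 * so the threads responsible for cleaning pages never block here.
	 */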
2032
2033int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2034{
2035 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2036 int ret;
2037 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2038 if (ret == 0) {
2039 buf->flags |= EXTENT_UPTODATE;
2040 }
2041 return ret;
2042}
2043
2044int btree_lock_page_hook(struct page *page)
2045{
2046 struct inode *inode = page->mapping->host;
2047 struct btrfs_root *root = BTRFS_I(inode)->root;
2048 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2049 struct extent_buffer *eb;
2050 unsigned long len;
2051 u64 bytenr = page_offset(page);
2052
2053 if (page->private == EXTENT_PAGE_PRIVATE)
2054 goto out;
2055
2056 len = page->private >> 2;
2057 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2058 if (!eb)
2059 goto out;
2060
2061 btrfs_tree_lock(eb);
2062 spin_lock(&root->fs_info->hash_lock);
2063 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2064 spin_unlock(&root->fs_info->hash_lock);
2065 btrfs_tree_unlock(eb);
2066 free_extent_buffer(eb);
2067out:
2068 lock_page(page);
2069 return 0;
2070}
2071
2072static struct extent_io_ops btree_extent_io_ops = {
2073 .write_cache_pages_lock_hook = btree_lock_page_hook,
2074 .readpage_end_io_hook = btree_readpage_end_io_hook,
2075 .submit_bio_hook = btree_submit_bio_hook,
2076 /* note we're sharing with inode.c for the merge bio hook */
2077 .merge_bio_hook = btrfs_merge_bio_hook,
2078};
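This ops table is how the generic extent_io code stays filesystem-agnostic: the btree inode supplies hooks and extent_io calls through them at the right points. The general shape of such a hook table, reduced to a hypothetical two-callback interface (the names and signatures below are illustrative, not the real extent_io API):

	/* Minimal hook-table shape, echoing btree_extent_io_ops. */
	struct io_hooks {
		int (*readpage_end)(void *page, int err);
		int (*submit_bio)(void *bio);
	};

	static int my_readpage_end(void *page, int err) { (void)page; return err; }
	static int my_submit_bio(void *bio) { (void)bio; return 0; }

	static const struct io_hooks my_hooks = {
		.readpage_end	= my_readpage_end,
		.submit_bio	= my_submit_bio,
	};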
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..f84f5058dbbb
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24struct btrfs_device;
25struct btrfs_fs_devices;
26
27struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
28 u32 blocksize, u64 parent_transid);
29int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
30 u64 parent_transid);
31struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
32 u64 bytenr, u32 blocksize);
33int clean_tree_block(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, struct extent_buffer *buf);
35struct btrfs_root *open_ctree(struct super_block *sb,
36 struct btrfs_fs_devices *fs_devices,
37 char *options);
38int close_ctree(struct btrfs_root *root);
39int write_ctree_super(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root);
41struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
42 u64 bytenr, u32 blocksize);
43struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
44 u64 root_objectid);
45struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
46 struct btrfs_key *location,
47 const char *name, int namelen);
48struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
49 struct btrfs_key *location);
50struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
51 struct btrfs_key *location);
52int btrfs_insert_dev_radix(struct btrfs_root *root,
53 struct block_device *bdev,
54 u64 device_id,
55 u64 block_start,
56 u64 num_blocks);
57void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
58int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
59void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
60int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
61int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
62int wait_on_tree_block_writeback(struct btrfs_root *root,
63 struct extent_buffer *buf);
64int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
65u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
66void btrfs_csum_final(u32 crc, char *result);
67int btrfs_open_device(struct btrfs_device *dev);
68int btrfs_verify_block_csum(struct btrfs_root *root,
69 struct extent_buffer *buf);
70int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
71 int metadata);
72int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
73 int rw, struct bio *bio, int mirror_num,
74 extent_submit_bio_hook_t *submit_bio_hook);
75int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
76unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
77int btrfs_write_tree_block(struct extent_buffer *buf);
78int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
79int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
80 struct btrfs_fs_info *fs_info);
81int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
82 struct btrfs_fs_info *fs_info);
83int btree_lock_page_hook(struct page *page);
84#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..48b82cd7583c
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,201 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4)
11#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4)
12#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4)
13
14static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
15 int connectable)
16{
17 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
18 struct inode *inode = dentry->d_inode;
19 int len = *max_len;
20 int type;
21
22 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
23 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
24 return 255;
25
26 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
27 type = FILEID_BTRFS_WITHOUT_PARENT;
28
29 fid->objectid = BTRFS_I(inode)->location.objectid;
30 fid->root_objectid = BTRFS_I(inode)->root->objectid;
31 fid->gen = inode->i_generation;
32
33 if (connectable && !S_ISDIR(inode->i_mode)) {
34 struct inode *parent;
35 u64 parent_root_id;
36
37 spin_lock(&dentry->d_lock);
38
39 parent = dentry->d_parent->d_inode;
40 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
41 fid->parent_gen = parent->i_generation;
42 parent_root_id = BTRFS_I(parent)->root->objectid;
43
44 spin_unlock(&dentry->d_lock);
45
46 if (parent_root_id != fid->root_objectid) {
47 fid->parent_root_objectid = parent_root_id;
48 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
49 type = FILEID_BTRFS_WITH_PARENT_ROOT;
50 } else {
51 len = BTRFS_FID_SIZE_CONNECTABLE;
52 type = FILEID_BTRFS_WITH_PARENT;
53 }
54 }
55
56 *max_len = len;
57 return type;
58}
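The three BTRFS_FID_SIZE_* macros divide by four because the VFS counts file handle length in 32-bit words. With the packed btrfs_fid layout from export.h, the arithmetic works out as follows:

	/*
	 * objectid(8) + root_objectid(8) + gen(4)     = 20 bytes -> 5 words
	 *   + parent_objectid(8) + parent_gen(4)      = 32 bytes -> 8 words
	 *   + parent_root_objectid(8)                 = 40 bytes -> 10 words
	 * which is why btrfs_encode_fh rejects handles shorter than 5 words
	 * (or 8 when a connectable handle was requested).
	 */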
59
60static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
61 u64 root_objectid, u32 generation)
62{
63 struct btrfs_root *root;
64 struct inode *inode;
65 struct btrfs_key key;
66
67 key.objectid = root_objectid;
68 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
69 key.offset = (u64)-1;
70
71 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
72 if (IS_ERR(root))
73 return ERR_CAST(root);
74
75 key.objectid = objectid;
76 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
77 key.offset = 0;
78
79 inode = btrfs_iget(sb, &key, root, NULL);
80 if (IS_ERR(inode))
81 return (void *)inode;
82
83 if (generation != inode->i_generation) {
84 iput(inode);
85 return ERR_PTR(-ESTALE);
86 }
87
88 return d_obtain_alias(inode);
89}
90
91static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
92 int fh_len, int fh_type)
93{
94 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
95 u64 objectid, root_objectid;
96 u32 generation;
97
98 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
99 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
100 return NULL;
101 root_objectid = fid->root_objectid;
102 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
103 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
104 return NULL;
105 root_objectid = fid->parent_root_objectid;
106 } else
107 return NULL;
108
109 objectid = fid->parent_objectid;
110 generation = fid->parent_gen;
111
112 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
113}
114
115static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
116 int fh_len, int fh_type)
117{
118 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
119 u64 objectid, root_objectid;
120 u32 generation;
121
122 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
123 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
124 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
126 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
127 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
128 return NULL;
129
130 objectid = fid->objectid;
131 root_objectid = fid->root_objectid;
132 generation = fid->gen;
133
134 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
135}
136
137static struct dentry *btrfs_get_parent(struct dentry *child)
138{
139 struct inode *dir = child->d_inode;
140 struct btrfs_root *root = BTRFS_I(dir)->root;
141 struct btrfs_key key;
142 struct btrfs_path *path;
143 struct extent_buffer *leaf;
144 int slot;
145 u64 objectid;
146 int ret;
147
148 path = btrfs_alloc_path();
149
150 key.objectid = dir->i_ino;
151 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
152 key.offset = (u64)-1;
153
154 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
155 if (ret < 0) {
156 /* Error */
157 btrfs_free_path(path);
158 return ERR_PTR(ret);
159 }
160 leaf = path->nodes[0];
161 slot = path->slots[0];
162 if (ret) {
163 /* btrfs_search_slot() returns the slot where we'd want to
164 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
165 The _real_ backref, telling us what the parent inode
166 _actually_ is, will be in the slot _before_ the one
167 that btrfs_search_slot() returns. */
168 if (!slot) {
169 /* Unless there is _no_ key in the tree before... */
170 btrfs_free_path(path);
171 return ERR_PTR(-EIO);
172 }
173 slot--;
174 }
175
176 btrfs_item_key_to_cpu(leaf, &key, slot);
177 btrfs_free_path(path);
178
179 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
180 return ERR_PTR(-EINVAL);
181
182 objectid = key.offset;
183
184 /* If we are already at the root of a subvol, return the real root */
185 if (objectid == dir->i_ino)
186 return dget(dir->i_sb->s_root);
187
188 /* Build a new key for the inode item */
189 key.objectid = objectid;
190 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
191 key.offset = 0;
192
193 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
194}
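btrfs_get_parent() uses a standard btree trick: search for key offset (u64)-1, which cannot exist, let btrfs_search_slot() land on the would-be insertion point, and step back one slot to reach the highest real INODE_REF for the directory. The same "find the last entry <= target" trick over a plain sorted array, for illustration:

	/* "Search past the end, step back one", mirroring the (u64)-1 search. */
	static int find_last_le(const unsigned long long *keys, int n,
				unsigned long long target)
	{
		int lo = 0, hi = n;	/* find first index with keys[i] > target */

		while (lo < hi) {
			int mid = lo + (hi - lo) / 2;

			if (keys[mid] <= target)
				lo = mid + 1;
			else
				hi = mid;
		}
		return lo - 1;		/* -1: no key <= target (the !slot case) */
	}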
195
196const struct export_operations btrfs_export_ops = {
197 .encode_fh = btrfs_encode_fh,
198 .fh_to_dentry = btrfs_fh_to_dentry,
199 .fh_to_parent = btrfs_fh_to_parent,
200 .get_parent = btrfs_get_parent,
201};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..280ac1aa9b6d
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5253 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include "hash.h"
23#include "crc32c.h"
24#include "ctree.h"
25#include "disk-io.h"
26#include "print-tree.h"
27#include "transaction.h"
28#include "volumes.h"
29#include "locking.h"
30#include "ref-cache.h"
31
32#define PENDING_EXTENT_INSERT 0
33#define PENDING_EXTENT_DELETE 1
34#define PENDING_BACKREF_UPDATE 2
35
36struct pending_extent_op {
37 int type;
38 u64 bytenr;
39 u64 num_bytes;
40 u64 parent;
41 u64 orig_parent;
42 u64 generation;
43 u64 orig_generation;
44 int level;
45};
46
47static int finish_current_insert(struct btrfs_trans_handle *trans, struct
48 btrfs_root *extent_root);
49static int del_pending_extents(struct btrfs_trans_handle *trans, struct
50 btrfs_root *extent_root);
51static struct btrfs_block_group_cache *
52__btrfs_find_block_group(struct btrfs_root *root,
53 struct btrfs_block_group_cache *hint,
54 u64 search_start, int data, int owner);
55
56void maybe_lock_mutex(struct btrfs_root *root)
57{
58 if (root != root->fs_info->extent_root &&
59 root != root->fs_info->chunk_root &&
60 root != root->fs_info->dev_root) {
61 mutex_lock(&root->fs_info->alloc_mutex);
62 }
63}
64
65void maybe_unlock_mutex(struct btrfs_root *root)
66{
67 if (root != root->fs_info->extent_root &&
68 root != root->fs_info->chunk_root &&
69 root != root->fs_info->dev_root) {
70 mutex_unlock(&root->fs_info->alloc_mutex);
71 }
72}
73
74static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
75{
76 return (cache->flags & bits) == bits;
77}
78
79/*
80 * this adds the block group to the fs_info rb tree for the block group
81 * cache
82 */
83int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
84 struct btrfs_block_group_cache *block_group)
85{
86 struct rb_node **p;
87 struct rb_node *parent = NULL;
88 struct btrfs_block_group_cache *cache;
89
90 spin_lock(&info->block_group_cache_lock);
91 p = &info->block_group_cache_tree.rb_node;
92
93 while (*p) {
94 parent = *p;
95 cache = rb_entry(parent, struct btrfs_block_group_cache,
96 cache_node);
97 if (block_group->key.objectid < cache->key.objectid) {
98 p = &(*p)->rb_left;
99 } else if (block_group->key.objectid > cache->key.objectid) {
100 p = &(*p)->rb_right;
101 } else {
102 spin_unlock(&info->block_group_cache_lock);
103 return -EEXIST;
104 }
105 }
106
107 rb_link_node(&block_group->cache_node, parent, p);
108 rb_insert_color(&block_group->cache_node,
109 &info->block_group_cache_tree);
110 spin_unlock(&info->block_group_cache_lock);
111
112 return 0;
113}
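btrfs_add_block_group_cache() is the canonical rbtree insertion walk: descend while remembering the parent and link position, then rb_link_node() plus rb_insert_color() to rebalance. The skeleton of that walk with a hypothetical u64-keyed node:

	/* Canonical rbtree insert walk (kernel rbtree API); my_node is a
	 * hypothetical container keyed by a u64. */
	#include <linux/rbtree.h>
	#include <linux/types.h>
	#include <linux/errno.h>

	struct my_node {
		struct rb_node rb;
		u64 key;
	};

	static int my_insert(struct rb_root *root, struct my_node *new)
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		while (*p) {
			struct my_node *cur = rb_entry(*p, struct my_node, rb);

			parent = *p;
			if (new->key < cur->key)
				p = &(*p)->rb_left;
			else if (new->key > cur->key)
				p = &(*p)->rb_right;
			else
				return -EEXIST;		/* key already present */
		}
		rb_link_node(&new->rb, parent, p);	/* attach as a leaf... */
		rb_insert_color(&new->rb, root);	/* ...then rebalance */
		return 0;
	}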
114
115/*
116 * This will return the block group at or after bytenr if contains is 0, else
117 * it will return the block group that contains the bytenr
118 */
119static struct btrfs_block_group_cache *
120block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
121 int contains)
122{
123 struct btrfs_block_group_cache *cache, *ret = NULL;
124 struct rb_node *n;
125 u64 end, start;
126
127 spin_lock(&info->block_group_cache_lock);
128 n = info->block_group_cache_tree.rb_node;
129
130 while (n) {
131 cache = rb_entry(n, struct btrfs_block_group_cache,
132 cache_node);
133 end = cache->key.objectid + cache->key.offset - 1;
134 start = cache->key.objectid;
135
136 if (bytenr < start) {
137 if (!contains && (!ret || start < ret->key.objectid))
138 ret = cache;
139 n = n->rb_left;
140 } else if (bytenr > start) {
141 if (contains && bytenr <= end) {
142 ret = cache;
143 break;
144 }
145 n = n->rb_right;
146 } else {
147 ret = cache;
148 break;
149 }
150 }
151 spin_unlock(&info->block_group_cache_lock);
152
153 return ret;
154}
155
156/*
157	 * this is only called by cache_block_group; since we could have freed
158	 * extents, we need to check pinned_extents for anything that can't be
159	 * used yet, as that free space is only released once the transaction commits.
160 */
161static int add_new_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_fs_info *info, u64 start, u64 end)
163{
164 u64 extent_start, extent_end, size;
165 int ret;
166
167 while (start < end) {
168 ret = find_first_extent_bit(&info->pinned_extents, start,
169 &extent_start, &extent_end,
170 EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 if (extent_start == start) {
175 start = extent_end + 1;
176 } else if (extent_start > start && extent_start < end) {
177 size = extent_start - start;
178 ret = btrfs_add_free_space(block_group, start, size);
179 BUG_ON(ret);
180 start = extent_end + 1;
181 } else {
182 break;
183 }
184 }
185
186 if (start < end) {
187 size = end - start;
188 ret = btrfs_add_free_space(block_group, start, size);
189 BUG_ON(ret);
190 }
191
192 return 0;
193}
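	/*
	 * Worked example: caching the byte range [0, 100) while [30, 40] and
	 * [70, 80] are pinned yields free-space entries [0, 30), [41, 70) and
	 * [81, 100).  The pinned bytes stay unusable until the transaction
	 * that freed them commits.
	 */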
194
195static int cache_block_group(struct btrfs_root *root,
196 struct btrfs_block_group_cache *block_group)
197{
198 struct btrfs_path *path;
199 int ret = 0;
200 struct btrfs_key key;
201 struct extent_buffer *leaf;
202 int slot;
203 u64 last = 0;
204 u64 first_free;
205 int found = 0;
206
207 if (!block_group)
208 return 0;
209
210 root = root->fs_info->extent_root;
211
212 if (block_group->cached)
213 return 0;
214
215 path = btrfs_alloc_path();
216 if (!path)
217 return -ENOMEM;
218
219 path->reada = 2;
220 /*
221 * we get into deadlocks with paths held by callers of this function.
222 * since the alloc_mutex is protecting things right now, just
223 * skip the locking here
224 */
225 path->skip_locking = 1;
226 first_free = max_t(u64, block_group->key.objectid,
227 BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
228 key.objectid = block_group->key.objectid;
229 key.offset = 0;
230 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
231 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
232 if (ret < 0)
233 goto err;
234 ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
235 if (ret < 0)
236 goto err;
237 if (ret == 0) {
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
240 if (key.objectid + key.offset > first_free)
241 first_free = key.objectid + key.offset;
242 }
243 while(1) {
244 leaf = path->nodes[0];
245 slot = path->slots[0];
246 if (slot >= btrfs_header_nritems(leaf)) {
247 ret = btrfs_next_leaf(root, path);
248 if (ret < 0)
249 goto err;
250 if (ret == 0)
251 continue;
252 else
253 break;
254 }
255 btrfs_item_key_to_cpu(leaf, &key, slot);
256 if (key.objectid < block_group->key.objectid)
257 goto next;
258
259 if (key.objectid >= block_group->key.objectid +
260 block_group->key.offset)
261 break;
262
263 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
264 if (!found) {
265 last = first_free;
266 found = 1;
267 }
268
269 add_new_free_space(block_group, root->fs_info, last,
270 key.objectid);
271
272 last = key.objectid + key.offset;
273 }
274next:
275 path->slots[0]++;
276 }
277
278 if (!found)
279 last = first_free;
280
281 add_new_free_space(block_group, root->fs_info, last,
282 block_group->key.objectid +
283 block_group->key.offset);
284
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
296 btrfs_fs_info *info,
297 u64 bytenr)
298{
299 struct btrfs_block_group_cache *cache;
300
301 cache = block_group_cache_tree_search(info, bytenr, 0);
302
303 return cache;
304}
305
306/*
307	 * return the block group that contains the given bytenr
308 */
309struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
310 btrfs_fs_info *info,
311 u64 bytenr)
312{
313 struct btrfs_block_group_cache *cache;
314
315 cache = block_group_cache_tree_search(info, bytenr, 1);
316
317 return cache;
318}
319
320static int noinline find_free_space(struct btrfs_root *root,
321 struct btrfs_block_group_cache **cache_ret,
322 u64 *start_ret, u64 num, int data)
323{
324 int ret;
325 struct btrfs_block_group_cache *cache = *cache_ret;
326 struct btrfs_free_space *info = NULL;
327 u64 last;
328 u64 search_start = *start_ret;
329
330 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
331 if (!cache)
332 goto out;
333
334 last = max(search_start, cache->key.objectid);
335
336again:
337 ret = cache_block_group(root, cache);
338 if (ret)
339 goto out;
340
341 if (cache->ro || !block_group_bits(cache, data))
342 goto new_group;
343
344 info = btrfs_find_free_space(cache, last, num);
345 if (info) {
346 *start_ret = info->offset;
347 return 0;
348 }
349
350new_group:
351 last = cache->key.objectid + cache->key.offset;
352
353 cache = btrfs_lookup_first_block_group(root->fs_info, last);
354 if (!cache)
355 goto out;
356
357 *cache_ret = cache;
358 goto again;
359
360out:
361 return -ENOSPC;
362}
363
364static u64 div_factor(u64 num, int factor)
365{
366 if (factor == 10)
367 return num;
368 num *= factor;
369 do_div(num, 10);
370 return num;
371}
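	/*
	 * div_factor() scales in tenths: div_factor(num, 9) is 90% of num and
	 * factor 10 is a no-op, e.g. div_factor(1024, 9) == 921.  The metadata
	 * allocator passes factor 9, so a block group only counts as having
	 * room while it is under 90% utilized.
	 */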
372
373static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
374 u64 flags)
375{
376 struct list_head *head = &info->space_info;
377 struct list_head *cur;
378 struct btrfs_space_info *found;
379 list_for_each(cur, head) {
380 found = list_entry(cur, struct btrfs_space_info, list);
381 if (found->flags == flags)
382 return found;
383 }
384 return NULL;
385}
386
387static struct btrfs_block_group_cache *
388__btrfs_find_block_group(struct btrfs_root *root,
389 struct btrfs_block_group_cache *hint,
390 u64 search_start, int data, int owner)
391{
392 struct btrfs_block_group_cache *cache;
393 struct btrfs_block_group_cache *found_group = NULL;
394 struct btrfs_fs_info *info = root->fs_info;
395 u64 used;
396 u64 last = 0;
397 u64 free_check;
398 int full_search = 0;
399 int factor = 10;
400 int wrapped = 0;
401
402 if (data & BTRFS_BLOCK_GROUP_METADATA)
403 factor = 9;
404
405 if (search_start) {
406 struct btrfs_block_group_cache *shint;
407 shint = btrfs_lookup_first_block_group(info, search_start);
408 if (shint && block_group_bits(shint, data) && !shint->ro) {
409 spin_lock(&shint->lock);
410 used = btrfs_block_group_used(&shint->item);
411 if (used + shint->pinned + shint->reserved <
412 div_factor(shint->key.offset, factor)) {
413 spin_unlock(&shint->lock);
414 return shint;
415 }
416 spin_unlock(&shint->lock);
417 }
418 }
419 if (hint && !hint->ro && block_group_bits(hint, data)) {
420 spin_lock(&hint->lock);
421 used = btrfs_block_group_used(&hint->item);
422 if (used + hint->pinned + hint->reserved <
423 div_factor(hint->key.offset, factor)) {
424 spin_unlock(&hint->lock);
425 return hint;
426 }
427 spin_unlock(&hint->lock);
428 last = hint->key.objectid + hint->key.offset;
429 } else {
430 if (hint)
431 last = max(hint->key.objectid, search_start);
432 else
433 last = search_start;
434 }
435again:
436 while (1) {
437 cache = btrfs_lookup_first_block_group(root->fs_info, last);
438 if (!cache)
439 break;
440
441 spin_lock(&cache->lock);
442 last = cache->key.objectid + cache->key.offset;
443 used = btrfs_block_group_used(&cache->item);
444
445 if (!cache->ro && block_group_bits(cache, data)) {
446 free_check = div_factor(cache->key.offset, factor);
447 if (used + cache->pinned + cache->reserved <
448 free_check) {
449 found_group = cache;
450 spin_unlock(&cache->lock);
451 goto found;
452 }
453 }
454 spin_unlock(&cache->lock);
455 cond_resched();
456 }
457 if (!wrapped) {
458 last = search_start;
459 wrapped = 1;
460 goto again;
461 }
462 if (!full_search && factor < 10) {
463 last = search_start;
464 full_search = 1;
465 factor = 10;
466 goto again;
467 }
468found:
469 return found_group;
470}
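	/*
	 * The search above escalates in stages: try the explicit search_start
	 * hint, then the caller's cached hint group, then scan forward from
	 * the hint, then wrap back around to search_start (wrapped = 1), and
	 * finally relax the fullness cap to 100% and scan once more
	 * (full_search = 1) before giving up.
	 */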
471
472struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
473 struct btrfs_block_group_cache
474 *hint, u64 search_start,
475 int data, int owner)
476{
477
478 struct btrfs_block_group_cache *ret;
479 ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
480 return ret;
481}
482
483/* simple helper to search for an existing extent at a given offset */
484int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
485{
486 int ret;
487 struct btrfs_key key;
488 struct btrfs_path *path;
489
490 path = btrfs_alloc_path();
491 BUG_ON(!path);
492 maybe_lock_mutex(root);
493 key.objectid = start;
494 key.offset = len;
495 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
496 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
497 0, 0);
498 maybe_unlock_mutex(root);
499 btrfs_free_path(path);
500 return ret;
501}
502
503/*
504 * Back reference rules. Back refs have three main goals:
505 *
506 * 1) differentiate between all holders of references to an extent so that
507 * when a reference is dropped we can make sure it was a valid reference
508 * before freeing the extent.
509 *
510 * 2) Provide enough information to quickly find the holders of an extent
511 * if we notice a given block is corrupted or bad.
512 *
513 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
514 * maintenance. This is actually the same as #2, but with a slightly
515 * different use case.
516 *
517 * File extents can be referenced by:
518 *
519 * - multiple snapshots, subvolumes, or different generations in one subvol
520 * - different files inside a single subvolume
521 * - different offsets inside a file (bookend extents in file.c)
522 *
523 * The extent ref structure has fields for:
524 *
525 * - Objectid of the subvolume root
526 * - Generation number of the tree holding the reference
527 * - objectid of the file holding the reference
528	 * - number of references held by the parent node (always 1 for tree blocks)
529 *
530	 * A btree leaf may hold multiple references to a file extent. In most
531	 * cases, these references are from the same file and the corresponding
532	 * offsets inside the file are close together.
533 *
534 * When a file extent is allocated the fields are filled in:
535 * (root_key.objectid, trans->transid, inode objectid, 1)
536 *
537 * When a leaf is cow'd new references are added for every file extent found
538 * in the leaf. It looks similar to the create case, but trans->transid will
539 * be different when the block is cow'd.
540 *
541 * (root_key.objectid, trans->transid, inode objectid,
542 * number of references in the leaf)
543 *
544 * When a file extent is removed either during snapshot deletion or
545 * file truncation, we find the corresponding back reference and check
546 * the following fields:
547 *
548 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
549 * inode objectid)
550 *
551 * Btree extents can be referenced by:
552 *
553 * - Different subvolumes
554 * - Different generations of the same subvolume
555 *
556 * When a tree block is created, back references are inserted:
557 *
558 * (root->root_key.objectid, trans->transid, level, 1)
559 *
560 * When a tree block is cow'd, new back references are added for all the
561	 * blocks it points to. If the tree block isn't in a reference counted root,
562 * the old back references are removed. These new back references are of
563 * the form (trans->transid will have increased since creation):
564 *
565 * (root->root_key.objectid, trans->transid, level, 1)
566 *
567	 * When a backref is being deleted, the following fields are checked:
568 *
569 * if backref was for a tree root:
570 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
571 * else
572 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
573 *
574	 * Back reference key composition:
575 *
576 * The key objectid corresponds to the first byte in the extent, the key
577 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
578	 * byte of the parent extent. If an extent is a tree root, the key
579	 * offset is set to the key objectid.
580 */
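	/*
	 * Worked example of the key composition above (numbers invented): a
	 * file extent starting at byte 8192 that is referenced from a parent
	 * block at byte 4096 carries the backref key
	 *
	 *     (objectid = 8192, type = BTRFS_EXTENT_REF_KEY, offset = 4096)
	 *
	 * whereas a tree root at 8192 would use offset = 8192; offset equal
	 * to objectid is what marks a root's backref.
	 */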
581
582static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
583 struct btrfs_root *root,
584 struct btrfs_path *path,
585 u64 bytenr, u64 parent,
586 u64 ref_root, u64 ref_generation,
587 u64 owner_objectid, int del)
588{
589 struct btrfs_key key;
590 struct btrfs_extent_ref *ref;
591 struct extent_buffer *leaf;
592 u64 ref_objectid;
593 int ret;
594
595 key.objectid = bytenr;
596 key.type = BTRFS_EXTENT_REF_KEY;
597 key.offset = parent;
598
599 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
600 if (ret < 0)
601 goto out;
602 if (ret > 0) {
603 ret = -ENOENT;
604 goto out;
605 }
606
607 leaf = path->nodes[0];
608 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
609 ref_objectid = btrfs_ref_objectid(leaf, ref);
610 if (btrfs_ref_root(leaf, ref) != ref_root ||
611 btrfs_ref_generation(leaf, ref) != ref_generation ||
612 (ref_objectid != owner_objectid &&
613 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
614 ret = -EIO;
615 WARN_ON(1);
616 goto out;
617 }
618 ret = 0;
619out:
620 return ret;
621}
622
623static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
624 struct btrfs_root *root,
625 struct btrfs_path *path,
626 u64 bytenr, u64 parent,
627 u64 ref_root, u64 ref_generation,
628 u64 owner_objectid)
629{
630 struct btrfs_key key;
631 struct extent_buffer *leaf;
632 struct btrfs_extent_ref *ref;
633 u32 num_refs;
634 int ret;
635
636 key.objectid = bytenr;
637 key.type = BTRFS_EXTENT_REF_KEY;
638 key.offset = parent;
639
640 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
641 if (ret == 0) {
642 leaf = path->nodes[0];
643 ref = btrfs_item_ptr(leaf, path->slots[0],
644 struct btrfs_extent_ref);
645 btrfs_set_ref_root(leaf, ref, ref_root);
646 btrfs_set_ref_generation(leaf, ref, ref_generation);
647 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
648 btrfs_set_ref_num_refs(leaf, ref, 1);
649 } else if (ret == -EEXIST) {
650 u64 existing_owner;
651 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
652 leaf = path->nodes[0];
653 ref = btrfs_item_ptr(leaf, path->slots[0],
654 struct btrfs_extent_ref);
655 if (btrfs_ref_root(leaf, ref) != ref_root ||
656 btrfs_ref_generation(leaf, ref) != ref_generation) {
657 ret = -EIO;
658 WARN_ON(1);
659 goto out;
660 }
661
662 num_refs = btrfs_ref_num_refs(leaf, ref);
663 BUG_ON(num_refs == 0);
664 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
665
666 existing_owner = btrfs_ref_objectid(leaf, ref);
667 if (existing_owner != owner_objectid &&
668 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
669 btrfs_set_ref_objectid(leaf, ref,
670 BTRFS_MULTIPLE_OBJECTIDS);
671 }
672 ret = 0;
673 } else {
674 goto out;
675 }
676 btrfs_mark_buffer_dirty(path->nodes[0]);
677out:
678 btrfs_release_path(root, path);
679 return ret;
680}
681
682static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
683 struct btrfs_root *root,
684 struct btrfs_path *path)
685{
686 struct extent_buffer *leaf;
687 struct btrfs_extent_ref *ref;
688 u32 num_refs;
689 int ret = 0;
690
691 leaf = path->nodes[0];
692 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
693 num_refs = btrfs_ref_num_refs(leaf, ref);
694 BUG_ON(num_refs == 0);
695 num_refs -= 1;
696 if (num_refs == 0) {
697 ret = btrfs_del_item(trans, root, path);
698 } else {
699 btrfs_set_ref_num_refs(leaf, ref, num_refs);
700 btrfs_mark_buffer_dirty(leaf);
701 }
702 btrfs_release_path(root, path);
703 return ret;
704}
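remove_extent_backref() is the delete-on-last-reference pattern: decrement the count stored in the item, delete the item outright when it reaches zero, otherwise just write back the smaller count. The same shape on a hypothetical in-memory item; delete_item() and write_back() are illustrative stand-ins for btrfs_del_item() and btrfs_mark_buffer_dirty():

	/* Delete-on-last-reference shape of remove_extent_backref(). */
	struct item { unsigned int refs; };

	static int delete_item(struct item *it) { (void)it; return 0; }
	static void write_back(struct item *it) { (void)it; }

	static int put_ref(struct item *it)
	{
		if (--it->refs == 0)
			return delete_item(it);	/* last ref: drop the item */
		write_back(it);			/* otherwise persist new count */
		return 0;
	}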
705
706static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
707 struct btrfs_root *root, u64 bytenr,
708 u64 orig_parent, u64 parent,
709 u64 orig_root, u64 ref_root,
710 u64 orig_generation, u64 ref_generation,
711 u64 owner_objectid)
712{
713 int ret;
714 struct btrfs_root *extent_root = root->fs_info->extent_root;
715 struct btrfs_path *path;
716
717 if (root == root->fs_info->extent_root) {
718 struct pending_extent_op *extent_op;
719 u64 num_bytes;
720
721 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
722 num_bytes = btrfs_level_size(root, (int)owner_objectid);
723 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
724 bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
725 u64 priv;
726 ret = get_state_private(&root->fs_info->extent_ins,
727 bytenr, &priv);
728 BUG_ON(ret);
729 extent_op = (struct pending_extent_op *)
730 (unsigned long)priv;
731 BUG_ON(extent_op->parent != orig_parent);
732 BUG_ON(extent_op->generation != orig_generation);
733 extent_op->parent = parent;
734 extent_op->generation = ref_generation;
735 } else {
736 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
737 BUG_ON(!extent_op);
738
739 extent_op->type = PENDING_BACKREF_UPDATE;
740 extent_op->bytenr = bytenr;
741 extent_op->num_bytes = num_bytes;
742 extent_op->parent = parent;
743 extent_op->orig_parent = orig_parent;
744 extent_op->generation = ref_generation;
745 extent_op->orig_generation = orig_generation;
746 extent_op->level = (int)owner_objectid;
747
748 set_extent_bits(&root->fs_info->extent_ins,
749 bytenr, bytenr + num_bytes - 1,
750 EXTENT_LOCKED, GFP_NOFS);
751 set_state_private(&root->fs_info->extent_ins,
752 bytenr, (unsigned long)extent_op);
753 }
754 return 0;
755 }
756
757 path = btrfs_alloc_path();
758 if (!path)
759 return -ENOMEM;
760 ret = lookup_extent_backref(trans, extent_root, path,
761 bytenr, orig_parent, orig_root,
762 orig_generation, owner_objectid, 1);
763 if (ret)
764 goto out;
765 ret = remove_extent_backref(trans, extent_root, path);
766 if (ret)
767 goto out;
768 ret = insert_extent_backref(trans, extent_root, path, bytenr,
769 parent, ref_root, ref_generation,
770 owner_objectid);
771 BUG_ON(ret);
772 finish_current_insert(trans, extent_root);
773 del_pending_extents(trans, extent_root);
774out:
775 btrfs_free_path(path);
776 return ret;
777}
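	/*
	 * The extent_root case above defers the backref change: it records a
	 * pending_extent_op by setting EXTENT_LOCKED bits over the byte range
	 * in the extent_ins tree and stashing the op pointer, cast to an
	 * unsigned long, as that range's private data.  finish_current_insert()
	 * later walks the locked ranges and replays the ops, which avoids
	 * recursing into the extent tree while it is already being modified.
	 */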
778
779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
780 struct btrfs_root *root, u64 bytenr,
781 u64 orig_parent, u64 parent,
782 u64 ref_root, u64 ref_generation,
783 u64 owner_objectid)
784{
785 int ret;
786 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
787 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
788 return 0;
789 maybe_lock_mutex(root);
790 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
791 parent, ref_root, ref_root,
792 ref_generation, ref_generation,
793 owner_objectid);
794 maybe_unlock_mutex(root);
795 return ret;
796}
797
798static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root, u64 bytenr,
800 u64 orig_parent, u64 parent,
801 u64 orig_root, u64 ref_root,
802 u64 orig_generation, u64 ref_generation,
803 u64 owner_objectid)
804{
805 struct btrfs_path *path;
806 int ret;
807 struct btrfs_key key;
808 struct extent_buffer *l;
809 struct btrfs_extent_item *item;
810 u32 refs;
811
812 path = btrfs_alloc_path();
813 if (!path)
814 return -ENOMEM;
815
816 path->reada = 1;
817 key.objectid = bytenr;
818 key.type = BTRFS_EXTENT_ITEM_KEY;
819 key.offset = (u64)-1;
820
821 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
822 0, 1);
823 if (ret < 0)
824 return ret;
825 BUG_ON(ret == 0 || path->slots[0] == 0);
826
827 path->slots[0]--;
828 l = path->nodes[0];
829
830 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
831 BUG_ON(key.objectid != bytenr);
832 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
833
834 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
835 refs = btrfs_extent_refs(l, item);
836 btrfs_set_extent_refs(l, item, refs + 1);
837 btrfs_mark_buffer_dirty(path->nodes[0]);
838
839 btrfs_release_path(root->fs_info->extent_root, path);
840
841 path->reada = 1;
842 ret = insert_extent_backref(trans, root->fs_info->extent_root,
843 path, bytenr, parent,
844 ref_root, ref_generation,
845 owner_objectid);
846 BUG_ON(ret);
847 finish_current_insert(trans, root->fs_info->extent_root);
848 del_pending_extents(trans, root->fs_info->extent_root);
849
850 btrfs_free_path(path);
851 return 0;
852}
853
854int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
855 struct btrfs_root *root,
856 u64 bytenr, u64 num_bytes, u64 parent,
857 u64 ref_root, u64 ref_generation,
858 u64 owner_objectid)
859{
860 int ret;
861 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
862 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
863 return 0;
864 maybe_lock_mutex(root);
865 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
866 0, ref_root, 0, ref_generation,
867 owner_objectid);
868 maybe_unlock_mutex(root);
869 return ret;
870}
871
872int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
873 struct btrfs_root *root)
874{
875 finish_current_insert(trans, root->fs_info->extent_root);
876 del_pending_extents(trans, root->fs_info->extent_root);
877 return 0;
878}
879
880int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
881 struct btrfs_root *root, u64 bytenr,
882 u64 num_bytes, u32 *refs)
883{
884 struct btrfs_path *path;
885 int ret;
886 struct btrfs_key key;
887 struct extent_buffer *l;
888 struct btrfs_extent_item *item;
889
890 WARN_ON(num_bytes < root->sectorsize);
891 path = btrfs_alloc_path();
892 path->reada = 1;
893 key.objectid = bytenr;
894 key.offset = num_bytes;
895 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
896 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
897 0, 0);
898 if (ret < 0)
899 goto out;
900 if (ret != 0) {
901 btrfs_print_leaf(root, path->nodes[0]);
902 printk("failed to find block number %Lu\n", bytenr);
903 BUG();
904 }
905 l = path->nodes[0];
906 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
907 *refs = btrfs_extent_refs(l, item);
908out:
909 btrfs_free_path(path);
910	return ret;
911}
912
913static int get_reference_status(struct btrfs_root *root, u64 bytenr,
914 u64 parent_gen, u64 ref_objectid,
915 u64 *min_generation, u32 *ref_count)
916{
917 struct btrfs_root *extent_root = root->fs_info->extent_root;
918 struct btrfs_path *path;
919 struct extent_buffer *leaf;
920 struct btrfs_extent_ref *ref_item;
921 struct btrfs_key key;
922 struct btrfs_key found_key;
923 u64 root_objectid = root->root_key.objectid;
924 u64 ref_generation;
925 u32 nritems;
926 int ret;
927
928 key.objectid = bytenr;
929 key.offset = (u64)-1;
930 key.type = BTRFS_EXTENT_ITEM_KEY;
931
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	mutex_lock(&root->fs_info->alloc_mutex);
934 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
935 if (ret < 0)
936 goto out;
937	BUG_ON(ret == 0);
938	if (path->slots[0] == 0)
939		goto out;
940
941 path->slots[0]--;
942 leaf = path->nodes[0];
943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
944
945 if (found_key.objectid != bytenr ||
946 found_key.type != BTRFS_EXTENT_ITEM_KEY) {
947 ret = 1;
948 goto out;
949 }
950
951 *ref_count = 0;
952 *min_generation = (u64)-1;
953
954 while (1) {
955 leaf = path->nodes[0];
956 nritems = btrfs_header_nritems(leaf);
957 if (path->slots[0] >= nritems) {
958 ret = btrfs_next_leaf(extent_root, path);
959 if (ret < 0)
960 goto out;
961 if (ret == 0)
962 continue;
963 break;
964 }
965 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
966 if (found_key.objectid != bytenr)
967 break;
968
969 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
970 path->slots[0]++;
971 continue;
972 }
973
974 ref_item = btrfs_item_ptr(leaf, path->slots[0],
975 struct btrfs_extent_ref);
976 ref_generation = btrfs_ref_generation(leaf, ref_item);
977 /*
978 * For (parent_gen > 0 && parent_gen > ref_generation):
979 *
980 * we reach here through the oldest root, therefore
981		 * all other references from the same snapshot should have
982 * a larger generation.
983 */
984 if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
985 (parent_gen > 0 && parent_gen > ref_generation) ||
986 (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
987 ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
988 *ref_count = 2;
989 break;
990 }
991
992 *ref_count = 1;
993 if (*min_generation > ref_generation)
994 *min_generation = ref_generation;
995
996 path->slots[0]++;
997 }
998 ret = 0;
999out:
1000 mutex_unlock(&root->fs_info->alloc_mutex);
1001 btrfs_free_path(path);
1002 return ret;
1003}
1004
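/*
 * returns 1 if the file extent at bytenr may also be referenced from
 * another snapshot, 0 if this root clearly holds the only reference,
 * or a negative errno.  The check re-searches the last committed copy
 * of this root (dirty_root) and requires the data extent and every
 * tree block on the path down to it to carry exactly one reference.
 */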
1005int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
1006 struct btrfs_root *root,
1007 struct btrfs_key *key, u64 bytenr)
1008{
1009 struct btrfs_root *old_root;
1010 struct btrfs_path *path = NULL;
1011 struct extent_buffer *eb;
1012 struct btrfs_file_extent_item *item;
1013 u64 ref_generation;
1014 u64 min_generation;
1015 u64 extent_start;
1016 u32 ref_count;
1017 int level;
1018 int ret;
1019
1020 BUG_ON(trans == NULL);
1021 BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
1022 ret = get_reference_status(root, bytenr, 0, key->objectid,
1023 &min_generation, &ref_count);
1024 if (ret)
1025 return ret;
1026
1027 if (ref_count != 1)
1028 return 1;
1029
1030 old_root = root->dirty_root->root;
1031 ref_generation = old_root->root_key.offset;
1032
1033 /* all references are created in running transaction */
1034 if (min_generation > ref_generation) {
1035 ret = 0;
1036 goto out;
1037 }
1038
1039 path = btrfs_alloc_path();
1040 if (!path) {
1041 ret = -ENOMEM;
1042 goto out;
1043 }
1044
1045 path->skip_locking = 1;
1046	/* if no item found, the extent is referenced by another snapshot */
1047 ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
1048 if (ret)
1049 goto out;
1050
1051 eb = path->nodes[0];
1052 item = btrfs_item_ptr(eb, path->slots[0],
1053 struct btrfs_file_extent_item);
1054 if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
1055 btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
1056 ret = 1;
1057 goto out;
1058 }
1059
1060 for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
1061 if (level >= 0) {
1062 eb = path->nodes[level];
1063 if (!eb)
1064 continue;
1065 extent_start = eb->start;
1066 } else
1067 extent_start = bytenr;
1068
1069 ret = get_reference_status(root, extent_start, ref_generation,
1070 0, &min_generation, &ref_count);
1071 if (ret)
1072 goto out;
1073
1074 if (ref_count != 1) {
1075 ret = 1;
1076 goto out;
1077 }
1078 if (level >= 0)
1079 ref_generation = btrfs_header_generation(eb);
1080 }
1081 ret = 0;
1082out:
1083 if (path)
1084 btrfs_free_path(path);
1085 return ret;
1086}
1087
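/*
 * remember the file extents referenced by a leaf in the leaf ref
 * cache, so they can be freed later without reading the leaf back in.
 * For the relocation tree the entry is marked shared and is allowed
 * to replace a stale entry for the same bytenr.
 */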
1088int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1089 struct extent_buffer *buf, u32 nr_extents)
1090{
1091 struct btrfs_key key;
1092 struct btrfs_file_extent_item *fi;
1093 u64 root_gen;
1094 u32 nritems;
1095 int i;
1096 int level;
1097 int ret = 0;
1098 int shared = 0;
1099
1100 if (!root->ref_cows)
1101 return 0;
1102
1103 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1104 shared = 0;
1105 root_gen = root->root_key.offset;
1106 } else {
1107 shared = 1;
1108 root_gen = trans->transid - 1;
1109 }
1110
1111 level = btrfs_header_level(buf);
1112 nritems = btrfs_header_nritems(buf);
1113
1114 if (level == 0) {
1115 struct btrfs_leaf_ref *ref;
1116 struct btrfs_extent_info *info;
1117
1118 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1119 if (!ref) {
1120 ret = -ENOMEM;
1121 goto out;
1122 }
1123
1124 ref->root_gen = root_gen;
1125 ref->bytenr = buf->start;
1126 ref->owner = btrfs_header_owner(buf);
1127 ref->generation = btrfs_header_generation(buf);
1128 ref->nritems = nr_extents;
1129 info = ref->extents;
1130
1131 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1132 u64 disk_bytenr;
1133 btrfs_item_key_to_cpu(buf, &key, i);
1134 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1135 continue;
1136 fi = btrfs_item_ptr(buf, i,
1137 struct btrfs_file_extent_item);
1138 if (btrfs_file_extent_type(buf, fi) ==
1139 BTRFS_FILE_EXTENT_INLINE)
1140 continue;
1141 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1142 if (disk_bytenr == 0)
1143 continue;
1144
1145 info->bytenr = disk_bytenr;
1146 info->num_bytes =
1147 btrfs_file_extent_disk_num_bytes(buf, fi);
1148 info->objectid = key.objectid;
1149 info->offset = key.offset;
1150 info++;
1151 }
1152
1153 ret = btrfs_add_leaf_ref(root, ref, shared);
1154 if (ret == -EEXIST && shared) {
1155 struct btrfs_leaf_ref *old;
1156 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1157 BUG_ON(!old);
1158 btrfs_remove_leaf_ref(root, old);
1159 btrfs_free_leaf_ref(root, old);
1160 ret = btrfs_add_leaf_ref(root, ref, shared);
1161 }
1162 WARN_ON(ret);
1163 btrfs_free_leaf_ref(root, ref);
1164 }
1165out:
1166 return ret;
1167}
1168
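/*
 * after a block has been COWed, update the references for everything
 * it points to.  Roots with ref_cows get real reference count bumps,
 * since the old copy of the block may live on in a snapshot; other
 * roots only have their existing backrefs moved from the old parent
 * block to the new one.
 */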
1169int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1170 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1171 u32 *nr_extents)
1172{
1173 u64 bytenr;
1174 u64 ref_root;
1175 u64 orig_root;
1176 u64 ref_generation;
1177 u64 orig_generation;
1178 u32 nritems;
1179 u32 nr_file_extents = 0;
1180 struct btrfs_key key;
1181 struct btrfs_file_extent_item *fi;
1182 int i;
1183 int level;
1184 int ret = 0;
1185 int faili = 0;
1186 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1187 u64, u64, u64, u64, u64, u64, u64, u64);
1188
1189 ref_root = btrfs_header_owner(buf);
1190 ref_generation = btrfs_header_generation(buf);
1191 orig_root = btrfs_header_owner(orig_buf);
1192 orig_generation = btrfs_header_generation(orig_buf);
1193
1194 nritems = btrfs_header_nritems(buf);
1195 level = btrfs_header_level(buf);
1196
1197 if (root->ref_cows) {
1198 process_func = __btrfs_inc_extent_ref;
1199 } else {
1200 if (level == 0 &&
1201 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1202 goto out;
1203 if (level != 0 &&
1204 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1205 goto out;
1206 process_func = __btrfs_update_extent_ref;
1207 }
1208
1209 for (i = 0; i < nritems; i++) {
1210 cond_resched();
1211 if (level == 0) {
1212 btrfs_item_key_to_cpu(buf, &key, i);
1213 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1214 continue;
1215 fi = btrfs_item_ptr(buf, i,
1216 struct btrfs_file_extent_item);
1217 if (btrfs_file_extent_type(buf, fi) ==
1218 BTRFS_FILE_EXTENT_INLINE)
1219 continue;
1220 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1221 if (bytenr == 0)
1222 continue;
1223
1224 nr_file_extents++;
1225
1226 maybe_lock_mutex(root);
1227 ret = process_func(trans, root, bytenr,
1228 orig_buf->start, buf->start,
1229 orig_root, ref_root,
1230 orig_generation, ref_generation,
1231 key.objectid);
1232 maybe_unlock_mutex(root);
1233
1234 if (ret) {
1235 faili = i;
1236 WARN_ON(1);
1237 goto fail;
1238 }
1239 } else {
1240 bytenr = btrfs_node_blockptr(buf, i);
1241 maybe_lock_mutex(root);
1242 ret = process_func(trans, root, bytenr,
1243 orig_buf->start, buf->start,
1244 orig_root, ref_root,
1245 orig_generation, ref_generation,
1246 level - 1);
1247 maybe_unlock_mutex(root);
1248 if (ret) {
1249 faili = i;
1250 WARN_ON(1);
1251 goto fail;
1252 }
1253 }
1254 }
1255out:
1256 if (nr_extents) {
1257 if (level == 0)
1258 *nr_extents = nr_file_extents;
1259 else
1260 *nr_extents = nritems;
1261 }
1262 return 0;
1263fail:
1264 WARN_ON(1);
1265 return ret;
1266}
1267
1268int btrfs_update_ref(struct btrfs_trans_handle *trans,
1269 struct btrfs_root *root, struct extent_buffer *orig_buf,
1270 struct extent_buffer *buf, int start_slot, int nr)
1272{
1273 u64 bytenr;
1274 u64 ref_root;
1275 u64 orig_root;
1276 u64 ref_generation;
1277 u64 orig_generation;
1278 struct btrfs_key key;
1279 struct btrfs_file_extent_item *fi;
1280 int i;
1281 int ret;
1282 int slot;
1283 int level;
1284
1285 BUG_ON(start_slot < 0);
1286 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1287
1288 ref_root = btrfs_header_owner(buf);
1289 ref_generation = btrfs_header_generation(buf);
1290 orig_root = btrfs_header_owner(orig_buf);
1291 orig_generation = btrfs_header_generation(orig_buf);
1292 level = btrfs_header_level(buf);
1293
1294 if (!root->ref_cows) {
1295 if (level == 0 &&
1296 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1297 return 0;
1298 if (level != 0 &&
1299 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1300 return 0;
1301 }
1302
1303 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1304 cond_resched();
1305 if (level == 0) {
1306 btrfs_item_key_to_cpu(buf, &key, slot);
1307 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1308 continue;
1309 fi = btrfs_item_ptr(buf, slot,
1310 struct btrfs_file_extent_item);
1311 if (btrfs_file_extent_type(buf, fi) ==
1312 BTRFS_FILE_EXTENT_INLINE)
1313 continue;
1314 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1315 if (bytenr == 0)
1316 continue;
1317 maybe_lock_mutex(root);
1318 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1319 orig_buf->start, buf->start,
1320 orig_root, ref_root,
1321 orig_generation, ref_generation,
1322 key.objectid);
1323 maybe_unlock_mutex(root);
1324 if (ret)
1325 goto fail;
1326 } else {
1327 bytenr = btrfs_node_blockptr(buf, slot);
1328 maybe_lock_mutex(root);
1329 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1330 orig_buf->start, buf->start,
1331 orig_root, ref_root,
1332 orig_generation, ref_generation,
1333 level - 1);
1334 maybe_unlock_mutex(root);
1335 if (ret)
1336 goto fail;
1337 }
1338 }
1339 return 0;
1340fail:
1341 WARN_ON(1);
1342 return -1;
1343}
1344
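/*
 * write one dirty block group item back into the extent tree.  The
 * item must already exist: a negative search result is passed back to
 * the caller, while a missing item is a bug.  The in-memory item is
 * copied over the on-disk bytes in place.
 */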
1345static int write_one_cache_group(struct btrfs_trans_handle *trans,
1346 struct btrfs_root *root,
1347 struct btrfs_path *path,
1348 struct btrfs_block_group_cache *cache)
1349{
1350 int ret;
1351 int pending_ret;
1352 struct btrfs_root *extent_root = root->fs_info->extent_root;
1353 unsigned long bi;
1354 struct extent_buffer *leaf;
1355
1356 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1357 if (ret < 0)
1358 goto fail;
1359 BUG_ON(ret);
1360
1361 leaf = path->nodes[0];
1362 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1363 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1364 btrfs_mark_buffer_dirty(leaf);
1365 btrfs_release_path(extent_root, path);
1366fail:
1367 finish_current_insert(trans, extent_root);
1368 pending_ret = del_pending_extents(trans, extent_root);
1369 if (ret)
1370 return ret;
1371 if (pending_ret)
1372 return pending_ret;
1373 return 0;
1374
1375}
1376
1377int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root)
1379{
1380 struct btrfs_block_group_cache *cache, *entry;
1381 struct rb_node *n;
1382 int err = 0;
1383 int werr = 0;
1384 struct btrfs_path *path;
1385 u64 last = 0;
1386
1387 path = btrfs_alloc_path();
1388 if (!path)
1389 return -ENOMEM;
1390
1391 mutex_lock(&root->fs_info->alloc_mutex);
1392 while(1) {
1393 cache = NULL;
1394 spin_lock(&root->fs_info->block_group_cache_lock);
1395 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1396 n; n = rb_next(n)) {
1397 entry = rb_entry(n, struct btrfs_block_group_cache,
1398 cache_node);
1399 if (entry->dirty) {
1400 cache = entry;
1401 break;
1402 }
1403 }
1404 spin_unlock(&root->fs_info->block_group_cache_lock);
1405
1406 if (!cache)
1407 break;
1408
1409 cache->dirty = 0;
1410 last += cache->key.offset;
1411
1412 err = write_one_cache_group(trans, root,
1413 path, cache);
1414 /*
1415 * if we fail to write the cache group, we want
1416 * to keep it marked dirty in hopes that a later
1417 * write will work
1418 */
1419 if (err) {
1420 werr = err;
1421 continue;
1422 }
1423 }
1424 btrfs_free_path(path);
1425 mutex_unlock(&root->fs_info->alloc_mutex);
1426 return werr;
1427}
1428
1429static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1430 u64 total_bytes, u64 bytes_used,
1431 struct btrfs_space_info **space_info)
1432{
1433 struct btrfs_space_info *found;
1434
1435 found = __find_space_info(info, flags);
1436 if (found) {
1437 found->total_bytes += total_bytes;
1438 found->bytes_used += bytes_used;
1439 found->full = 0;
1440 *space_info = found;
1441 return 0;
1442 }
1443 found = kmalloc(sizeof(*found), GFP_NOFS);
1444 if (!found)
1445 return -ENOMEM;
1446
1447 list_add(&found->list, &info->space_info);
1448 INIT_LIST_HEAD(&found->block_groups);
1449 spin_lock_init(&found->lock);
1450 found->flags = flags;
1451 found->total_bytes = total_bytes;
1452 found->bytes_used = bytes_used;
1453 found->bytes_pinned = 0;
1454 found->bytes_reserved = 0;
1455 found->full = 0;
1456 found->force_alloc = 0;
1457 *space_info = found;
1458 return 0;
1459}
1460
1461static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1462{
1463 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1464 BTRFS_BLOCK_GROUP_RAID1 |
1465 BTRFS_BLOCK_GROUP_RAID10 |
1466 BTRFS_BLOCK_GROUP_DUP);
1467 if (extra_flags) {
1468 if (flags & BTRFS_BLOCK_GROUP_DATA)
1469 fs_info->avail_data_alloc_bits |= extra_flags;
1470 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1471 fs_info->avail_metadata_alloc_bits |= extra_flags;
1472 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1473 fs_info->avail_system_alloc_bits |= extra_flags;
1474 }
1475}
1476
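/*
 * reduce a requested allocation profile to something the current
 * device count can actually provide, and pick one profile when
 * redundant bits are combined.  With a single device, for example,
 * the RAID0/RAID1/RAID10 bits are stripped, so a request for
 * (DUP | RAID1) comes back as plain DUP.
 */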
1477static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1478{
1479 u64 num_devices = root->fs_info->fs_devices->num_devices;
1480
1481 if (num_devices == 1)
1482 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1483 if (num_devices < 4)
1484 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1485
1486 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1487 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1488 BTRFS_BLOCK_GROUP_RAID10))) {
1489 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1490 }
1491
1492 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1493 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1494 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1495 }
1496
1497 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1498 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1499 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1500 (flags & BTRFS_BLOCK_GROUP_DUP)))
1501 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1502 return flags;
1503}
1504
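/*
 * make sure there is a chunk available for an allocation of the given
 * profile.  Unless force is set, nothing is allocated until used +
 * pinned + reserved bytes (plus this request) pass roughly 60% of the
 * profile's total (div_factor(total_bytes, 6)).  The chunk mutex is
 * only waited for in the forced case; a contended trylock otherwise
 * means someone else is already allocating, so we just back off.
 */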
1505static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1506 struct btrfs_root *extent_root, u64 alloc_bytes,
1507 u64 flags, int force)
1508{
1509 struct btrfs_space_info *space_info;
1510 u64 thresh;
1511 u64 start;
1512 u64 num_bytes;
1513 int ret = 0, waited = 0;
1514
1515 flags = reduce_alloc_profile(extent_root, flags);
1516
1517 space_info = __find_space_info(extent_root->fs_info, flags);
1518 if (!space_info) {
1519 ret = update_space_info(extent_root->fs_info, flags,
1520 0, 0, &space_info);
1521 BUG_ON(ret);
1522 }
1523 BUG_ON(!space_info);
1524
1525 if (space_info->force_alloc) {
1526 force = 1;
1527 space_info->force_alloc = 0;
1528 }
1529 if (space_info->full)
1530 goto out;
1531
1532 thresh = div_factor(space_info->total_bytes, 6);
1533 if (!force &&
1534 (space_info->bytes_used + space_info->bytes_pinned +
1535 space_info->bytes_reserved + alloc_bytes) < thresh)
1536 goto out;
1537
1538 while (!mutex_trylock(&extent_root->fs_info->chunk_mutex)) {
1539 if (!force)
1540 goto out;
1541 mutex_unlock(&extent_root->fs_info->alloc_mutex);
1542 cond_resched();
1543 mutex_lock(&extent_root->fs_info->alloc_mutex);
1544 waited = 1;
1545 }
1546
1547 if (waited && space_info->full)
1548 goto out_unlock;
1549
1550 ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1551 if (ret == -ENOSPC) {
1552		printk("space info full %Lu\n", flags);
1553 space_info->full = 1;
1554 goto out_unlock;
1555 }
1556 BUG_ON(ret);
1557
1558 ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1559 BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1560 BUG_ON(ret);
1561
1562out_unlock:
1563 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1564out:
1565 return ret;
1566}
1567
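/*
 * account for an allocation or free of [bytenr, bytenr + num_bytes).
 * The range may cross block group boundaries, so each iteration
 * handles the piece that fits in one group, updating the block group
 * item and the space_info counters.  When freeing with mark_free set,
 * the space is also handed back to the free space cache.
 */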
1568static int update_block_group(struct btrfs_trans_handle *trans,
1569 struct btrfs_root *root,
1570 u64 bytenr, u64 num_bytes, int alloc,
1571 int mark_free)
1572{
1573 struct btrfs_block_group_cache *cache;
1574 struct btrfs_fs_info *info = root->fs_info;
1575 u64 total = num_bytes;
1576 u64 old_val;
1577 u64 byte_in_group;
1578
1579 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1580 while(total) {
1581 cache = btrfs_lookup_block_group(info, bytenr);
1582		if (!cache)
1583			return -1;
1585 byte_in_group = bytenr - cache->key.objectid;
1586 WARN_ON(byte_in_group > cache->key.offset);
1587
1588 spin_lock(&cache->lock);
1589 cache->dirty = 1;
1590 old_val = btrfs_block_group_used(&cache->item);
1591 num_bytes = min(total, cache->key.offset - byte_in_group);
1592 if (alloc) {
1593 old_val += num_bytes;
1594 cache->space_info->bytes_used += num_bytes;
1595 btrfs_set_block_group_used(&cache->item, old_val);
1596 spin_unlock(&cache->lock);
1597 } else {
1598 old_val -= num_bytes;
1599 cache->space_info->bytes_used -= num_bytes;
1600 btrfs_set_block_group_used(&cache->item, old_val);
1601 spin_unlock(&cache->lock);
1602 if (mark_free) {
1603 int ret;
1604 ret = btrfs_add_free_space(cache, bytenr,
1605 num_bytes);
1606 if (ret)
1607 return -1;
1608 }
1609 }
1610 total -= num_bytes;
1611 bytenr += num_bytes;
1612 }
1613 return 0;
1614}
1615
1616static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1617{
1618 struct btrfs_block_group_cache *cache;
1619
1620 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1621 if (!cache)
1622 return 0;
1623
1624 return cache->key.objectid;
1625}
1626
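/*
 * pin or unpin [bytenr, bytenr + num) in the pinned_extents io tree
 * and keep the per-block-group and space_info pinned counters in
 * sync.  Pinned space belongs to extents freed during the running
 * transaction; it can't be reused until the commit is safely on disk.
 */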
1627int btrfs_update_pinned_extents(struct btrfs_root *root,
1628 u64 bytenr, u64 num, int pin)
1629{
1630 u64 len;
1631 struct btrfs_block_group_cache *cache;
1632 struct btrfs_fs_info *fs_info = root->fs_info;
1633
1634 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1635 if (pin) {
1636 set_extent_dirty(&fs_info->pinned_extents,
1637 bytenr, bytenr + num - 1, GFP_NOFS);
1638 } else {
1639 clear_extent_dirty(&fs_info->pinned_extents,
1640 bytenr, bytenr + num - 1, GFP_NOFS);
1641 }
1642 while (num > 0) {
1643 cache = btrfs_lookup_block_group(fs_info, bytenr);
1644 BUG_ON(!cache);
1645 len = min(num, cache->key.offset -
1646 (bytenr - cache->key.objectid));
1647 if (pin) {
1648 spin_lock(&cache->lock);
1649 cache->pinned += len;
1650 cache->space_info->bytes_pinned += len;
1651 spin_unlock(&cache->lock);
1652 fs_info->total_pinned += len;
1653 } else {
1654 spin_lock(&cache->lock);
1655 cache->pinned -= len;
1656 cache->space_info->bytes_pinned -= len;
1657 spin_unlock(&cache->lock);
1658 fs_info->total_pinned -= len;
1659 }
1660 bytenr += len;
1661 num -= len;
1662 }
1663 return 0;
1664}
1665
1666static int update_reserved_extents(struct btrfs_root *root,
1667 u64 bytenr, u64 num, int reserve)
1668{
1669 u64 len;
1670 struct btrfs_block_group_cache *cache;
1671 struct btrfs_fs_info *fs_info = root->fs_info;
1672
1673 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1674 while (num > 0) {
1675 cache = btrfs_lookup_block_group(fs_info, bytenr);
1676 BUG_ON(!cache);
1677 len = min(num, cache->key.offset -
1678 (bytenr - cache->key.objectid));
1679 if (reserve) {
1680 spin_lock(&cache->lock);
1681 cache->reserved += len;
1682 cache->space_info->bytes_reserved += len;
1683 spin_unlock(&cache->lock);
1684 } else {
1685 spin_lock(&cache->lock);
1686 cache->reserved -= len;
1687 cache->space_info->bytes_reserved -= len;
1688 spin_unlock(&cache->lock);
1689 }
1690 bytenr += len;
1691 num -= len;
1692 }
1693 return 0;
1694}
1695
1696int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1697{
1698 u64 last = 0;
1699 u64 start;
1700 u64 end;
1701 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1702 int ret;
1703
1704 while(1) {
1705 ret = find_first_extent_bit(pinned_extents, last,
1706 &start, &end, EXTENT_DIRTY);
1707 if (ret)
1708 break;
1709 set_extent_dirty(copy, start, end, GFP_NOFS);
1710 last = end + 1;
1711 }
1712 return 0;
1713}
1714
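/*
 * called near the end of the transaction commit with a copy of the
 * pinned set (see btrfs_copy_pinned above): clear the pinned state on
 * each range and return the space to the free space cache so it can
 * be allocated again.
 */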
1715int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root,
1717 struct extent_io_tree *unpin)
1718{
1719 u64 start;
1720 u64 end;
1721 int ret;
1722 struct btrfs_block_group_cache *cache;
1723
1724 mutex_lock(&root->fs_info->alloc_mutex);
1725 while(1) {
1726 ret = find_first_extent_bit(unpin, 0, &start, &end,
1727 EXTENT_DIRTY);
1728 if (ret)
1729 break;
1730 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
1731 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1732 cache = btrfs_lookup_block_group(root->fs_info, start);
1733 if (cache->cached)
1734 btrfs_add_free_space(cache, start, end - start + 1);
1735 if (need_resched()) {
1736 mutex_unlock(&root->fs_info->alloc_mutex);
1737 cond_resched();
1738 mutex_lock(&root->fs_info->alloc_mutex);
1739 }
1740 }
1741 mutex_unlock(&root->fs_info->alloc_mutex);
1742 return 0;
1743}
1744
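/*
 * process the pending extent operations queued in fs_info->extent_ins.
 * Each locked range carries a struct pending_extent_op in the tree's
 * private state.  PENDING_EXTENT_INSERT creates the extent item and
 * its backref; PENDING_BACKREF_UPDATE looks up the old backref and
 * rekeys it in place to the new parent and generation.
 */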
1745static int finish_current_insert(struct btrfs_trans_handle *trans,
1746 struct btrfs_root *extent_root)
1747{
1748 u64 start;
1749 u64 end;
1750 u64 priv;
1751 struct btrfs_fs_info *info = extent_root->fs_info;
1752 struct btrfs_path *path;
1753 struct btrfs_extent_ref *ref;
1754 struct pending_extent_op *extent_op;
1755 struct btrfs_key key;
1756 struct btrfs_extent_item extent_item;
1757 int ret;
1758 int err = 0;
1759
1760 WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
1761 btrfs_set_stack_extent_refs(&extent_item, 1);
	path = btrfs_alloc_path();
	BUG_ON(!path);
1763
1764 while(1) {
1765 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1766 &end, EXTENT_LOCKED);
1767 if (ret)
1768 break;
1769
1770 ret = get_state_private(&info->extent_ins, start, &priv);
1771 BUG_ON(ret);
1772 extent_op = (struct pending_extent_op *)(unsigned long)priv;
1773
1774 if (extent_op->type == PENDING_EXTENT_INSERT) {
1775 key.objectid = start;
1776 key.offset = end + 1 - start;
1777 key.type = BTRFS_EXTENT_ITEM_KEY;
1778 err = btrfs_insert_item(trans, extent_root, &key,
1779 &extent_item, sizeof(extent_item));
1780 BUG_ON(err);
1781
1782 clear_extent_bits(&info->extent_ins, start, end,
1783 EXTENT_LOCKED, GFP_NOFS);
1784
1785 err = insert_extent_backref(trans, extent_root, path,
1786 start, extent_op->parent,
1787 extent_root->root_key.objectid,
1788 extent_op->generation,
1789 extent_op->level);
1790 BUG_ON(err);
1791 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
1792 err = lookup_extent_backref(trans, extent_root, path,
1793 start, extent_op->orig_parent,
1794 extent_root->root_key.objectid,
1795 extent_op->orig_generation,
1796 extent_op->level, 0);
1797 BUG_ON(err);
1798
1799 clear_extent_bits(&info->extent_ins, start, end,
1800 EXTENT_LOCKED, GFP_NOFS);
1801
1802 key.objectid = start;
1803 key.offset = extent_op->parent;
1804 key.type = BTRFS_EXTENT_REF_KEY;
1805 err = btrfs_set_item_key_safe(trans, extent_root, path,
1806 &key);
1807 BUG_ON(err);
1808 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1809 struct btrfs_extent_ref);
1810 btrfs_set_ref_generation(path->nodes[0], ref,
1811 extent_op->generation);
1812 btrfs_mark_buffer_dirty(path->nodes[0]);
1813 btrfs_release_path(extent_root, path);
1814 } else {
1815			BUG();
1816 }
1817 kfree(extent_op);
1818
1819 if (need_resched()) {
1820 mutex_unlock(&extent_root->fs_info->alloc_mutex);
1821 cond_resched();
1822 mutex_lock(&extent_root->fs_info->alloc_mutex);
1823 }
1824 }
1825 btrfs_free_path(path);
1826 return 0;
1827}
1828
1829static int pin_down_bytes(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 u64 bytenr, u64 num_bytes, int is_data)
1832{
1833 int err = 0;
1834 struct extent_buffer *buf;
1835
1836 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1837 if (is_data)
1838 goto pinit;
1839
1840 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1841 if (!buf)
1842 goto pinit;
1843
1844 /* we can reuse a block if it hasn't been written
1845 * and it is from this transaction. We can't
1846 * reuse anything from the tree log root because
1847 * it has tiny sub-transactions.
1848 */
1849 if (btrfs_buffer_uptodate(buf, 0) &&
1850 btrfs_try_tree_lock(buf)) {
1851 u64 header_owner = btrfs_header_owner(buf);
1852 u64 header_transid = btrfs_header_generation(buf);
1853 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
1854 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
1855 header_transid == trans->transid &&
1856 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
1857 clean_tree_block(NULL, root, buf);
1858 btrfs_tree_unlock(buf);
1859 free_extent_buffer(buf);
1860 return 1;
1861 }
1862 btrfs_tree_unlock(buf);
1863 }
1864 free_extent_buffer(buf);
1865pinit:
1866 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
1867
1868 BUG_ON(err < 0);
1869 return 0;
1870}
1871
1872/*
1873 * remove an extent from the root, returns 0 on success
1874 */
1875static int __free_extent(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root,
1877 u64 bytenr, u64 num_bytes, u64 parent,
1878 u64 root_objectid, u64 ref_generation,
1879 u64 owner_objectid, int pin, int mark_free)
1880{
1881 struct btrfs_path *path;
1882 struct btrfs_key key;
1883 struct btrfs_fs_info *info = root->fs_info;
1884 struct btrfs_root *extent_root = info->extent_root;
1885 struct extent_buffer *leaf;
1886 int ret;
1887 int extent_slot = 0;
1888 int found_extent = 0;
1889 int num_to_del = 1;
1890 struct btrfs_extent_item *ei;
1891 u32 refs;
1892
1893 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1894 key.objectid = bytenr;
1895 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1896 key.offset = num_bytes;
1897 path = btrfs_alloc_path();
1898 if (!path)
1899 return -ENOMEM;
1900
1901 path->reada = 1;
1902 ret = lookup_extent_backref(trans, extent_root, path,
1903 bytenr, parent, root_objectid,
1904 ref_generation, owner_objectid, 1);
1905 if (ret == 0) {
1906 struct btrfs_key found_key;
1907 extent_slot = path->slots[0];
1908 while(extent_slot > 0) {
1909 extent_slot--;
1910 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1911 extent_slot);
1912 if (found_key.objectid != bytenr)
1913 break;
1914 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1915 found_key.offset == num_bytes) {
1916 found_extent = 1;
1917 break;
1918 }
1919 if (path->slots[0] - extent_slot > 5)
1920 break;
1921 }
1922 if (!found_extent) {
1923 ret = remove_extent_backref(trans, extent_root, path);
1924 BUG_ON(ret);
1925 btrfs_release_path(extent_root, path);
1926 ret = btrfs_search_slot(trans, extent_root,
1927 &key, path, -1, 1);
1928 BUG_ON(ret);
1929 extent_slot = path->slots[0];
1930 }
1931 } else {
1932 btrfs_print_leaf(extent_root, path->nodes[0]);
1933 WARN_ON(1);
1934 printk("Unable to find ref byte nr %Lu root %Lu "
1935 "gen %Lu owner %Lu\n", bytenr,
1936 root_objectid, ref_generation, owner_objectid);
1937 }
1938
1939 leaf = path->nodes[0];
1940 ei = btrfs_item_ptr(leaf, extent_slot,
1941 struct btrfs_extent_item);
1942 refs = btrfs_extent_refs(leaf, ei);
1943 BUG_ON(refs == 0);
1944 refs -= 1;
1945 btrfs_set_extent_refs(leaf, ei, refs);
1946
1947 btrfs_mark_buffer_dirty(leaf);
1948
1949 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1950 struct btrfs_extent_ref *ref;
1951 ref = btrfs_item_ptr(leaf, path->slots[0],
1952 struct btrfs_extent_ref);
1953 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
1954 /* if the back ref and the extent are next to each other
1955 * they get deleted below in one shot
1956 */
1957 path->slots[0] = extent_slot;
1958 num_to_del = 2;
1959 } else if (found_extent) {
1960 /* otherwise delete the extent back ref */
1961 ret = remove_extent_backref(trans, extent_root, path);
1962 BUG_ON(ret);
1963 /* if refs are 0, we need to setup the path for deletion */
1964 if (refs == 0) {
1965 btrfs_release_path(extent_root, path);
1966 ret = btrfs_search_slot(trans, extent_root, &key, path,
1967 -1, 1);
1968 BUG_ON(ret);
1969 }
1970 }
1971
1972 if (refs == 0) {
1973 u64 super_used;
1974 u64 root_used;
1975#ifdef BIO_RW_DISCARD
1976 u64 map_length = num_bytes;
1977 struct btrfs_multi_bio *multi = NULL;
1978#endif
1979
1980 if (pin) {
1981 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
1982 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
1983 if (ret > 0)
1984 mark_free = 1;
1985 BUG_ON(ret < 0);
1986 }
1987
1988 /* block accounting for super block */
1989 spin_lock_irq(&info->delalloc_lock);
1990 super_used = btrfs_super_bytes_used(&info->super_copy);
1991 btrfs_set_super_bytes_used(&info->super_copy,
1992 super_used - num_bytes);
1993 spin_unlock_irq(&info->delalloc_lock);
1994
1995 /* block accounting for root item */
1996 root_used = btrfs_root_used(&root->root_item);
1997 btrfs_set_root_used(&root->root_item,
1998 root_used - num_bytes);
1999 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2000 num_to_del);
2001 BUG_ON(ret);
2002 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2003 mark_free);
2004 BUG_ON(ret);
2005
2006#ifdef BIO_RW_DISCARD
2007 /* Tell the block device(s) that the sectors can be discarded */
2008 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2009 bytenr, &map_length, &multi, 0);
2010 if (!ret) {
2011 struct btrfs_bio_stripe *stripe = multi->stripes;
2012 int i;
2013
2014 if (map_length > num_bytes)
2015 map_length = num_bytes;
2016
2017 for (i = 0; i < multi->num_stripes; i++, stripe++) {
2018 blkdev_issue_discard(stripe->dev->bdev,
2019 stripe->physical >> 9,
2020 map_length >> 9);
2021 }
2022 kfree(multi);
2023 }
2024#endif
2025 }
2026 btrfs_free_path(path);
2027 finish_current_insert(trans, extent_root);
2028 return ret;
2029}
2030
2031/*
2032 * find all the blocks marked as pending in the pending_del io tree
2033 * and remove them from the extent map
2034 */
2035static int del_pending_extents(struct btrfs_trans_handle *trans, struct
2036 btrfs_root *extent_root)
2037{
2038 int ret;
2039 int err = 0;
2040 int mark_free = 0;
2041 u64 start;
2042 u64 end;
2043 u64 priv;
2044 struct extent_io_tree *pending_del;
2045 struct extent_io_tree *extent_ins;
2046 struct pending_extent_op *extent_op;
2047
2048 WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
2049 extent_ins = &extent_root->fs_info->extent_ins;
2050 pending_del = &extent_root->fs_info->pending_del;
2051
2052 while(1) {
2053 ret = find_first_extent_bit(pending_del, 0, &start, &end,
2054 EXTENT_LOCKED);
2055 if (ret)
2056 break;
2057
2058 ret = get_state_private(pending_del, start, &priv);
2059 BUG_ON(ret);
2060 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2061
2062 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
2063 GFP_NOFS);
2064
2065 ret = pin_down_bytes(trans, extent_root, start,
2066 end + 1 - start, 0);
2067 mark_free = ret > 0;
2068 if (!test_range_bit(extent_ins, start, end,
2069 EXTENT_LOCKED, 0)) {
2070free_extent:
2071 ret = __free_extent(trans, extent_root,
2072 start, end + 1 - start,
2073 extent_op->orig_parent,
2074 extent_root->root_key.objectid,
2075 extent_op->orig_generation,
2076 extent_op->level, 0, mark_free);
2077 kfree(extent_op);
2078 } else {
2079 kfree(extent_op);
2080 ret = get_state_private(extent_ins, start, &priv);
2081 BUG_ON(ret);
2082 extent_op = (struct pending_extent_op *)
2083 (unsigned long)priv;
2084
2085 clear_extent_bits(extent_ins, start, end,
2086 EXTENT_LOCKED, GFP_NOFS);
2087
2088 if (extent_op->type == PENDING_BACKREF_UPDATE)
2089 goto free_extent;
2090
2091 ret = update_block_group(trans, extent_root, start,
2092 end + 1 - start, 0, mark_free);
2093 BUG_ON(ret);
2094 kfree(extent_op);
2095 }
2096 if (ret)
2097 err = ret;
2098
2099 if (need_resched()) {
2100 mutex_unlock(&extent_root->fs_info->alloc_mutex);
2101 cond_resched();
2102 mutex_lock(&extent_root->fs_info->alloc_mutex);
2103 }
2104 }
2105 return err;
2106}
2107
/*
 * remove an extent from the root, returns 0 on success.  Frees against
 * the extent tree itself are deferred through fs_info->pending_del to
 * avoid recursion; metadata, and data whose references predate the
 * running transaction, are pinned rather than freed outright.
 */
2111static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2112 struct btrfs_root *root,
2113 u64 bytenr, u64 num_bytes, u64 parent,
2114 u64 root_objectid, u64 ref_generation,
2115 u64 owner_objectid, int pin)
2116{
2117 struct btrfs_root *extent_root = root->fs_info->extent_root;
2118 int pending_ret;
2119 int ret;
2120
2121 WARN_ON(num_bytes < root->sectorsize);
2122 if (root == extent_root) {
2123 struct pending_extent_op *extent_op;
2124
2125 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2126 BUG_ON(!extent_op);
2127
2128 extent_op->type = PENDING_EXTENT_DELETE;
2129 extent_op->bytenr = bytenr;
2130 extent_op->num_bytes = num_bytes;
2131 extent_op->parent = parent;
2132 extent_op->orig_parent = parent;
2133 extent_op->generation = ref_generation;
2134 extent_op->orig_generation = ref_generation;
2135 extent_op->level = (int)owner_objectid;
2136
2137 set_extent_bits(&root->fs_info->pending_del,
2138 bytenr, bytenr + num_bytes - 1,
2139 EXTENT_LOCKED, GFP_NOFS);
2140 set_state_private(&root->fs_info->pending_del,
2141 bytenr, (unsigned long)extent_op);
2142 return 0;
2143 }
2144	/* if metadata, always pin */
2145 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2146 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2147 struct btrfs_block_group_cache *cache;
2148
2149 /* btrfs_free_reserved_extent */
2150 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2151 BUG_ON(!cache);
2152 btrfs_add_free_space(cache, bytenr, num_bytes);
2153 update_reserved_extents(root, bytenr, num_bytes, 0);
2154 return 0;
2155 }
2156 pin = 1;
2157 }
2158
2159	/* for data, pin if the ref was created by an earlier, committed transaction */
2160 if (ref_generation != trans->transid)
2161 pin = 1;
2162
2163 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2164 root_objectid, ref_generation,
2165 owner_objectid, pin, pin == 0);
2166
2167 finish_current_insert(trans, root->fs_info->extent_root);
2168 pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
2169 return ret ? ret : pending_ret;
2170}
2171
2172int btrfs_free_extent(struct btrfs_trans_handle *trans,
2173 struct btrfs_root *root,
2174 u64 bytenr, u64 num_bytes, u64 parent,
2175 u64 root_objectid, u64 ref_generation,
2176 u64 owner_objectid, int pin)
2177{
2178 int ret;
2179
2180 maybe_lock_mutex(root);
2181 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2182 root_objectid, ref_generation,
2183 owner_objectid, pin);
2184 maybe_unlock_mutex(root);
2185 return ret;
2186}
2187
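/*
 * round val up to the next stripe boundary; with a 64K stripe size,
 * for example, a value of 65537 aligns to 131072.
 */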
2188static u64 stripe_align(struct btrfs_root *root, u64 val)
2189{
2190 u64 mask = ((u64)root->stripesize - 1);
2191 u64 ret = (val + mask) & ~mask;
2192 return ret;
2193}
2194
2195/*
2196 * walks the btree of allocated extents and finds a hole of a given size.
2197 * The key ins is changed to record the hole:
2198 * ins->objectid == block start
2199 * ins->flags == BTRFS_EXTENT_ITEM_KEY
2200 * ins->offset == number of blocks
2201 * Any available blocks before search_start are skipped.
2202 */
2203static int noinline find_free_extent(struct btrfs_trans_handle *trans,
2204 struct btrfs_root *orig_root,
2205 u64 num_bytes, u64 empty_size,
2206 u64 search_start, u64 search_end,
2207 u64 hint_byte, struct btrfs_key *ins,
2208 u64 exclude_start, u64 exclude_nr,
2209 int data)
2210{
2211 int ret;
2212 u64 orig_search_start;
2213 struct btrfs_root * root = orig_root->fs_info->extent_root;
2214 struct btrfs_fs_info *info = root->fs_info;
2215 u64 total_needed = num_bytes;
2216 u64 *last_ptr = NULL;
2217 struct btrfs_block_group_cache *block_group;
2218 int chunk_alloc_done = 0;
2219 int empty_cluster = 2 * 1024 * 1024;
2220 int allowed_chunk_alloc = 0;
2221
2222 WARN_ON(num_bytes < root->sectorsize);
2223 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2224
2225 if (orig_root->ref_cows || empty_size)
2226 allowed_chunk_alloc = 1;
2227
2228 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2229 last_ptr = &root->fs_info->last_alloc;
2230 empty_cluster = 256 * 1024;
2231 }
2232
2233 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2234 last_ptr = &root->fs_info->last_data_alloc;
2235
2236 if (last_ptr) {
2237 if (*last_ptr)
2238 hint_byte = *last_ptr;
2239 else
2240 empty_size += empty_cluster;
2241 }
2242
2243 search_start = max(search_start, first_logical_byte(root, 0));
2244 orig_search_start = search_start;
2245
2246 search_start = max(search_start, hint_byte);
2247 total_needed += empty_size;
2248
2249new_group:
2250 block_group = btrfs_lookup_block_group(info, search_start);
2251 if (!block_group)
2252 block_group = btrfs_lookup_first_block_group(info,
2253 search_start);
2254
2255	/*
2256	 * Ok, this looks a little tricky, but it's really simple.  First, if
2257	 * we didn't find a block group, obviously we want to start over.
2258	 * Secondly, if the block group we found does not match the type we
2259	 * need, and we have a last_ptr and it's not 0, chances are the last
2260	 * allocation we made was at the end of the block group, so let's go
2261	 * ahead and skip looking through the rest of the block groups and
2262	 * start at the beginning.  This helps with metadata allocations,
2263	 * since you are likely to have a bunch of data block groups to
2264	 * search through first before you realize that you need to start
2265	 * over, so going straight back to the start saves time.
2266	 */
2267 if (!block_group || (!block_group_bits(block_group, data) &&
2268 last_ptr && *last_ptr)) {
2269 if (search_start != orig_search_start) {
2270 if (last_ptr && *last_ptr) {
2271 total_needed += empty_cluster;
2272 *last_ptr = 0;
2273 }
2274 search_start = orig_search_start;
2275 goto new_group;
2276 } else if (!chunk_alloc_done && allowed_chunk_alloc) {
2277 ret = do_chunk_alloc(trans, root,
2278 num_bytes + 2 * 1024 * 1024,
2279 data, 1);
2280 if (ret < 0)
2281 goto error;
2282 BUG_ON(ret);
2283 chunk_alloc_done = 1;
2284 search_start = orig_search_start;
2285 goto new_group;
2286 } else {
2287 ret = -ENOSPC;
2288 goto error;
2289 }
2290 }
2291
2292	/*
2293	 * this is going to search through all of the existing block groups
2294	 * it can find, so if we don't find something we need to see if we
2295	 * can allocate what we need.
2296	 */
2297 ret = find_free_space(root, &block_group, &search_start,
2298 total_needed, data);
2299 if (ret == -ENOSPC) {
2300		/*
2301		 * instead of allocating, start at the original search start
2302		 * and see if there is something to be found; if not, then
2303		 * we allocate
2304		 */
2305 if (search_start != orig_search_start) {
2306 if (last_ptr && *last_ptr) {
2307 *last_ptr = 0;
2308 total_needed += empty_cluster;
2309 }
2310 search_start = orig_search_start;
2311 goto new_group;
2312 }
2313
2314 /*
2315 * we've already allocated, we're pretty screwed
2316 */
2317 if (chunk_alloc_done) {
2318 goto error;
2319 } else if (!allowed_chunk_alloc && block_group &&
2320 block_group_bits(block_group, data)) {
2321 block_group->space_info->force_alloc = 1;
2322 goto error;
2323 } else if (!allowed_chunk_alloc) {
2324 goto error;
2325 }
2326
2327 ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
2328 data, 1);
2329 if (ret < 0)
2330 goto error;
2331
2332 BUG_ON(ret);
2333 chunk_alloc_done = 1;
2334 if (block_group)
2335 search_start = block_group->key.objectid +
2336 block_group->key.offset;
2337 else
2338 search_start = orig_search_start;
2339 goto new_group;
2340 }
2341
2342 if (ret)
2343 goto error;
2344
2345 search_start = stripe_align(root, search_start);
2346 ins->objectid = search_start;
2347 ins->offset = num_bytes;
2348
2349 if (ins->objectid + num_bytes >= search_end) {
2350 search_start = orig_search_start;
2351 if (chunk_alloc_done) {
2352 ret = -ENOSPC;
2353 goto error;
2354 }
2355 goto new_group;
2356 }
2357
2358 if (ins->objectid + num_bytes >
2359 block_group->key.objectid + block_group->key.offset) {
2360 if (search_start == orig_search_start && chunk_alloc_done) {
2361 ret = -ENOSPC;
2362 goto error;
2363 }
2364 search_start = block_group->key.objectid +
2365 block_group->key.offset;
2366 goto new_group;
2367 }
2368
2369 if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
2370 ins->objectid < exclude_start + exclude_nr)) {
2371 search_start = exclude_start + exclude_nr;
2372 goto new_group;
2373 }
2374
2375 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2376 trans->block_group = block_group;
2377
2378 ins->offset = num_bytes;
2379 if (last_ptr) {
2380 *last_ptr = ins->objectid + ins->offset;
2381 if (*last_ptr ==
2382 btrfs_super_total_bytes(&root->fs_info->super_copy))
2383 *last_ptr = 0;
2384 }
2385
2386 ret = 0;
2387error:
2388 return ret;
2389}
2390
2391static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2392{
2393 struct btrfs_block_group_cache *cache;
2394 struct list_head *l;
2395
2396 printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
2397 info->total_bytes - info->bytes_used - info->bytes_pinned -
2398 info->bytes_reserved, (info->full) ? "" : "not ");
2399
2400 spin_lock(&info->lock);
2401 list_for_each(l, &info->block_groups) {
2402 cache = list_entry(l, struct btrfs_block_group_cache, list);
2403 spin_lock(&cache->lock);
2404 printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
2405 "%Lu pinned %Lu reserved\n",
2406 cache->key.objectid, cache->key.offset,
2407 btrfs_block_group_used(&cache->item),
2408 cache->pinned, cache->reserved);
2409 btrfs_dump_free_space(cache, bytes);
2410 spin_unlock(&cache->lock);
2411 }
2412 spin_unlock(&info->lock);
2413}
2414
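/*
 * reserve space for an allocation.  The data parameter is expanded
 * into a full block group profile (DATA/METADATA/SYSTEM plus the
 * available RAID bits) before searching.  On -ENOSPC the request is
 * retried with progressively halved sizes down to min_alloc_size,
 * allocating new chunks along the way; running out of space below
 * min_alloc_size dumps the space info and BUGs.
 */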
2415static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2416 struct btrfs_root *root,
2417 u64 num_bytes, u64 min_alloc_size,
2418 u64 empty_size, u64 hint_byte,
2419 u64 search_end, struct btrfs_key *ins,
2420 u64 data)
2421{
2422 int ret;
2423 u64 search_start = 0;
2424 u64 alloc_profile;
2425 struct btrfs_fs_info *info = root->fs_info;
2426 struct btrfs_block_group_cache *cache;
2427
2428 if (data) {
2429 alloc_profile = info->avail_data_alloc_bits &
2430 info->data_alloc_profile;
2431 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2432 } else if (root == root->fs_info->chunk_root) {
2433 alloc_profile = info->avail_system_alloc_bits &
2434 info->system_alloc_profile;
2435 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2436 } else {
2437 alloc_profile = info->avail_metadata_alloc_bits &
2438 info->metadata_alloc_profile;
2439 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2440 }
2441again:
2442 data = reduce_alloc_profile(root, data);
2443 /*
2444 * the only place that sets empty_size is btrfs_realloc_node, which
2445 * is not called recursively on allocations
2446 */
2447 if (empty_size || root->ref_cows) {
2448 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2449 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2450 2 * 1024 * 1024,
2451 BTRFS_BLOCK_GROUP_METADATA |
2452 (info->metadata_alloc_profile &
2453 info->avail_metadata_alloc_bits), 0);
2454 }
2455 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2456 num_bytes + 2 * 1024 * 1024, data, 0);
2457 }
2458
2459 WARN_ON(num_bytes < root->sectorsize);
2460 ret = find_free_extent(trans, root, num_bytes, empty_size,
2461 search_start, search_end, hint_byte, ins,
2462 trans->alloc_exclude_start,
2463 trans->alloc_exclude_nr, data);
2464
2465 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2466 num_bytes = num_bytes >> 1;
2467 num_bytes = num_bytes & ~(root->sectorsize - 1);
2468 num_bytes = max(num_bytes, min_alloc_size);
2469 do_chunk_alloc(trans, root->fs_info->extent_root,
2470 num_bytes, data, 1);
2471 goto again;
2472 }
2473 if (ret) {
2474 struct btrfs_space_info *sinfo;
2475
2476 sinfo = __find_space_info(root->fs_info, data);
2477 printk("allocation failed flags %Lu, wanted %Lu\n",
2478 data, num_bytes);
2479 dump_space_info(sinfo, num_bytes);
2480 BUG();
2481 }
2482 cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2483 if (!cache) {
2484 printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
2485 return -ENOSPC;
2486 }
2487
2488 ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
2489
2490 return ret;
2491}
2492
2493int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2494{
2495 struct btrfs_block_group_cache *cache;
2496
2497 maybe_lock_mutex(root);
2498 cache = btrfs_lookup_block_group(root->fs_info, start);
2499 if (!cache) {
2500 printk(KERN_ERR "Unable to find block group for %Lu\n", start);
2501 maybe_unlock_mutex(root);
2502 return -ENOSPC;
2503 }
2504 btrfs_add_free_space(cache, start, len);
2505 update_reserved_extents(root, start, len, 0);
2506 maybe_unlock_mutex(root);
2507 return 0;
2508}
2509
2510int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2511 struct btrfs_root *root,
2512 u64 num_bytes, u64 min_alloc_size,
2513 u64 empty_size, u64 hint_byte,
2514 u64 search_end, struct btrfs_key *ins,
2515 u64 data)
2516{
2517 int ret;
2518 maybe_lock_mutex(root);
2519 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2520 empty_size, hint_byte, search_end, ins,
2521 data);
2522 update_reserved_extents(root, ins->objectid, ins->offset, 1);
2523 maybe_unlock_mutex(root);
2524 return ret;
2525}
2526
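/*
 * record a reserved extent in the extent tree: bump the super and root
 * byte counts, then insert the extent item and its backref in a single
 * btrfs_insert_empty_items() call.  Allocations made on behalf of the
 * extent tree itself can't touch it here, so those are queued in
 * fs_info->extent_ins and handled by finish_current_insert() later.
 */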
2527static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2528 struct btrfs_root *root, u64 parent,
2529 u64 root_objectid, u64 ref_generation,
2530 u64 owner, struct btrfs_key *ins)
2531{
2532 int ret;
2533 int pending_ret;
2534 u64 super_used;
2535 u64 root_used;
2536 u64 num_bytes = ins->offset;
2537 u32 sizes[2];
2538 struct btrfs_fs_info *info = root->fs_info;
2539 struct btrfs_root *extent_root = info->extent_root;
2540 struct btrfs_extent_item *extent_item;
2541 struct btrfs_extent_ref *ref;
2542 struct btrfs_path *path;
2543 struct btrfs_key keys[2];
2544
2545 if (parent == 0)
2546 parent = ins->objectid;
2547
2548 /* block accounting for super block */
2549 spin_lock_irq(&info->delalloc_lock);
2550 super_used = btrfs_super_bytes_used(&info->super_copy);
2551 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2552 spin_unlock_irq(&info->delalloc_lock);
2553
2554 /* block accounting for root item */
2555 root_used = btrfs_root_used(&root->root_item);
2556 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2557
2558 if (root == extent_root) {
2559 struct pending_extent_op *extent_op;
2560
2561 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2562 BUG_ON(!extent_op);
2563
2564 extent_op->type = PENDING_EXTENT_INSERT;
2565 extent_op->bytenr = ins->objectid;
2566 extent_op->num_bytes = ins->offset;
2567 extent_op->parent = parent;
2568 extent_op->orig_parent = 0;
2569 extent_op->generation = ref_generation;
2570 extent_op->orig_generation = 0;
2571 extent_op->level = (int)owner;
2572
2573 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
2574 ins->objectid + ins->offset - 1,
2575 EXTENT_LOCKED, GFP_NOFS);
2576 set_state_private(&root->fs_info->extent_ins,
2577 ins->objectid, (unsigned long)extent_op);
2578 goto update_block;
2579 }
2580
2581 memcpy(&keys[0], ins, sizeof(*ins));
2582 keys[1].objectid = ins->objectid;
2583 keys[1].type = BTRFS_EXTENT_REF_KEY;
2584 keys[1].offset = parent;
2585 sizes[0] = sizeof(*extent_item);
2586 sizes[1] = sizeof(*ref);
2587
2588 path = btrfs_alloc_path();
2589 BUG_ON(!path);
2590
2591 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2592 sizes, 2);
2593 BUG_ON(ret);
2594
2595 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2596 struct btrfs_extent_item);
2597 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
2598 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2599 struct btrfs_extent_ref);
2600
2601 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2602 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2603 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2604 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
2605
2606 btrfs_mark_buffer_dirty(path->nodes[0]);
2607
2608 trans->alloc_exclude_start = 0;
2609 trans->alloc_exclude_nr = 0;
2610 btrfs_free_path(path);
2611 finish_current_insert(trans, extent_root);
2612 pending_ret = del_pending_extents(trans, extent_root);
2613
2614 if (ret)
2615 goto out;
2616 if (pending_ret) {
2617 ret = pending_ret;
2618 goto out;
2619 }
2620
2621update_block:
2622 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
2623 if (ret) {
2624 printk("update block group failed for %Lu %Lu\n",
2625 ins->objectid, ins->offset);
2626 BUG();
2627 }
2628out:
2629 return ret;
2630}
2631
2632int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2633 struct btrfs_root *root, u64 parent,
2634 u64 root_objectid, u64 ref_generation,
2635 u64 owner, struct btrfs_key *ins)
2636{
2637 int ret;
2638
2639 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
2640 return 0;
2641 maybe_lock_mutex(root);
2642 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
2643 ref_generation, owner, ins);
2644 update_reserved_extents(root, ins->objectid, ins->offset, 0);
2645 maybe_unlock_mutex(root);
2646 return ret;
2647}
2648
2649/*
2650 * this is used by the tree logging recovery code. It records that
2651 * an extent has been allocated and makes sure to clear the free
2652 * space cache bits as well
2653 */
2654int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
2655 struct btrfs_root *root, u64 parent,
2656 u64 root_objectid, u64 ref_generation,
2657 u64 owner, struct btrfs_key *ins)
2658{
2659 int ret;
2660 struct btrfs_block_group_cache *block_group;
2661
2662 maybe_lock_mutex(root);
2663 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2664 cache_block_group(root, block_group);
2665
2666 ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
2667 BUG_ON(ret);
2668 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
2669 ref_generation, owner, ins);
2670 maybe_unlock_mutex(root);
2671 return ret;
2672}
2673
2674/*
2675 * finds a free extent and does all the dirty work required for
2676 * allocation.  The key for the allocated extent is returned through
2677 * ins; callers that need a buffer read the new block themselves.
2678 *
2679 * returns 0 if everything worked, non-zero otherwise.
2680 */
2681int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *root,
2683 u64 num_bytes, u64 parent, u64 min_alloc_size,
2684 u64 root_objectid, u64 ref_generation,
2685 u64 owner_objectid, u64 empty_size, u64 hint_byte,
2686 u64 search_end, struct btrfs_key *ins, u64 data)
2687{
2688 int ret;
2689
2690 maybe_lock_mutex(root);
2691
2692 ret = __btrfs_reserve_extent(trans, root, num_bytes,
2693 min_alloc_size, empty_size, hint_byte,
2694 search_end, ins, data);
2695 BUG_ON(ret);
2696 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
2697 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
2698 root_objectid, ref_generation,
2699 owner_objectid, ins);
2700 BUG_ON(ret);
2701
2702 } else {
2703 update_reserved_extents(root, ins->objectid, ins->offset, 1);
2704 }
2705 maybe_unlock_mutex(root);
2706 return ret;
2707}
2708
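/*
 * set up a freshly allocated tree block: stamp the transid, take the
 * tree lock, wipe the old contents and mark the buffer dirty in the
 * transaction's dirty_pages, or in dirty_log_pages for log tree
 * blocks, which are written back separately when the log is synced.
 */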
2709struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2710 struct btrfs_root *root,
2711 u64 bytenr, u32 blocksize)
2712{
2713 struct extent_buffer *buf;
2714
2715 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
2716 if (!buf)
2717 return ERR_PTR(-ENOMEM);
2718 btrfs_set_header_generation(buf, trans->transid);
2719 btrfs_tree_lock(buf);
2720 clean_tree_block(trans, root, buf);
2721 btrfs_set_buffer_uptodate(buf);
2722 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2723 set_extent_dirty(&root->dirty_log_pages, buf->start,
2724 buf->start + buf->len - 1, GFP_NOFS);
2725 } else {
2726 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
2727 buf->start + buf->len - 1, GFP_NOFS);
2728 }
2729 trans->blocks_used++;
2730 return buf;
2731}
2732
2733/*
2734 * helper function to allocate a block for a given tree
2735 * returns the tree buffer or an ERR_PTR on failure.
2736 */
2737struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2738 struct btrfs_root *root,
2739 u32 blocksize, u64 parent,
2740 u64 root_objectid,
2741 u64 ref_generation,
2742 int level,
2743 u64 hint,
2744 u64 empty_size)
2745{
2746 struct btrfs_key ins;
2747 int ret;
2748 struct extent_buffer *buf;
2749
2750 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
2751 root_objectid, ref_generation, level,
2752 empty_size, hint, (u64)-1, &ins, 0);
2753 if (ret) {
2754 BUG_ON(ret > 0);
2755 return ERR_PTR(ret);
2756 }
2757
2758 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
2759 return buf;
2760}
2761
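/*
 * drop a reference on every file extent a leaf points to.  Inline
 * extents and holes (disk_bytenr == 0) have no extent items and are
 * skipped; everything else goes through __btrfs_free_extent() under
 * the alloc_mutex, throttling the transaction between items.
 */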
2762int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
2763 struct btrfs_root *root, struct extent_buffer *leaf)
2764{
2765 u64 leaf_owner;
2766 u64 leaf_generation;
2767 struct btrfs_key key;
2768 struct btrfs_file_extent_item *fi;
2769 int i;
2770 int nritems;
2771 int ret;
2772
2773 BUG_ON(!btrfs_is_leaf(leaf));
2774 nritems = btrfs_header_nritems(leaf);
2775 leaf_owner = btrfs_header_owner(leaf);
2776 leaf_generation = btrfs_header_generation(leaf);
2777
2778 for (i = 0; i < nritems; i++) {
2779 u64 disk_bytenr;
2780 cond_resched();
2781
2782 btrfs_item_key_to_cpu(leaf, &key, i);
2783 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2784 continue;
2785 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2786 if (btrfs_file_extent_type(leaf, fi) ==
2787 BTRFS_FILE_EXTENT_INLINE)
2788 continue;
2789 /*
2790 * FIXME make sure to insert a trans record that
2791 * repeats the snapshot del on crash
2792 */
2793 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2794 if (disk_bytenr == 0)
2795 continue;
2796
2797 mutex_lock(&root->fs_info->alloc_mutex);
2798 ret = __btrfs_free_extent(trans, root, disk_bytenr,
2799 btrfs_file_extent_disk_num_bytes(leaf, fi),
2800 leaf->start, leaf_owner, leaf_generation,
2801 key.objectid, 0);
2802 mutex_unlock(&root->fs_info->alloc_mutex);
2803 BUG_ON(ret);
2804
2805 atomic_inc(&root->fs_info->throttle_gen);
2806 wake_up(&root->fs_info->transaction_throttle);
2807 cond_resched();
2808 }
2809 return 0;
2810}
2811
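/*
 * same as btrfs_drop_leaf_ref(), but driven from a cached leaf ref:
 * the extents were recorded in the ref earlier, so the leaf itself
 * never has to be read back from disk.
 */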
2812static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
2813 struct btrfs_root *root,
2814 struct btrfs_leaf_ref *ref)
2815{
2816 int i;
2817 int ret;
2818 struct btrfs_extent_info *info = ref->extents;
2819
2820 for (i = 0; i < ref->nritems; i++) {
2821 mutex_lock(&root->fs_info->alloc_mutex);
2822 ret = __btrfs_free_extent(trans, root, info->bytenr,
2823 info->num_bytes, ref->bytenr,
2824 ref->owner, ref->generation,
2825 info->objectid, 0);
2826 mutex_unlock(&root->fs_info->alloc_mutex);
2827
2828 atomic_inc(&root->fs_info->throttle_gen);
2829 wake_up(&root->fs_info->transaction_throttle);
2830 cond_resched();
2831
2832 BUG_ON(ret);
2833 info++;
2834 }
2835
2836 return 0;
2837}
2838
2839int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
2840 u32 *refs)
2841{
2842 int ret;
2843
2844 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
2845 BUG_ON(ret);
2846
2847#if 0 /* some debugging code in case we see problems here */
2848 /* if the refs count is one, it won't get increased again. But
2849 * if the ref count is > 1, someone may be decreasing it at
2850 * the same time we are.
2851 */
2852 if (*refs != 1) {
2853 struct extent_buffer *eb = NULL;
2854 eb = btrfs_find_create_tree_block(root, start, len);
2855 if (eb)
2856 btrfs_tree_lock(eb);
2857
2858 mutex_lock(&root->fs_info->alloc_mutex);
2859		ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
2860 BUG_ON(ret);
2861 mutex_unlock(&root->fs_info->alloc_mutex);
2862
2863 if (eb) {
2864 btrfs_tree_unlock(eb);
2865 free_extent_buffer(eb);
2866 }
2867 if (*refs == 1) {
2868 printk("block %llu went down to one during drop_snap\n",
2869 (unsigned long long)start);
2870 }
2871
2872 }
2873#endif
2874
2875 cond_resched();
2876 return ret;
2877}
2878
2879/*
2880 * helper function for drop_snapshot, this walks down the tree dropping ref
2881 * counts as it goes.
2882 */
2883static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2884 struct btrfs_root *root,
2885 struct btrfs_path *path, int *level)
2886{
2887 u64 root_owner;
2888 u64 root_gen;
2889 u64 bytenr;
2890 u64 ptr_gen;
2891 struct extent_buffer *next;
2892 struct extent_buffer *cur;
2893 struct extent_buffer *parent;
2894 struct btrfs_leaf_ref *ref;
2895 u32 blocksize;
2896 int ret;
2897 u32 refs;
2898
2899 WARN_ON(*level < 0);
2900 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2901 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
2902 path->nodes[*level]->len, &refs);
2903 BUG_ON(ret);
2904 if (refs > 1)
2905 goto out;
2906
2907 /*
2908 * walk down to the last node level and free all the leaves
2909 */
2910 while(*level >= 0) {
2911 WARN_ON(*level < 0);
2912 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2913 cur = path->nodes[*level];
2914
2915 if (btrfs_header_level(cur) != *level)
2916 WARN_ON(1);
2917
2918 if (path->slots[*level] >=
2919 btrfs_header_nritems(cur))
2920 break;
2921 if (*level == 0) {
2922 ret = btrfs_drop_leaf_ref(trans, root, cur);
2923 BUG_ON(ret);
2924 break;
2925 }
2926 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2927 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2928 blocksize = btrfs_level_size(root, *level - 1);
2929
2930 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
2931 BUG_ON(ret);
2932 if (refs != 1) {
2933 parent = path->nodes[*level];
2934 root_owner = btrfs_header_owner(parent);
2935 root_gen = btrfs_header_generation(parent);
2936 path->slots[*level]++;
2937
2938 mutex_lock(&root->fs_info->alloc_mutex);
2939 ret = __btrfs_free_extent(trans, root, bytenr,
2940 blocksize, parent->start,
2941 root_owner, root_gen,
2942 *level - 1, 1);
2943 BUG_ON(ret);
2944 mutex_unlock(&root->fs_info->alloc_mutex);
2945
2946 atomic_inc(&root->fs_info->throttle_gen);
2947 wake_up(&root->fs_info->transaction_throttle);
2948 cond_resched();
2949
2950 continue;
2951 }
2952 /*
2953 * at this point, we have a single ref, and since the
2954 * only place referencing this extent is a dead root
2955 * the reference count should never go higher.
2956 * So, we don't need to check it again
2957 */
2958 if (*level == 1) {
2959 ref = btrfs_lookup_leaf_ref(root, bytenr);
2960 if (ref && ref->generation != ptr_gen) {
2961 btrfs_free_leaf_ref(root, ref);
2962 ref = NULL;
2963 }
2964 if (ref) {
2965 ret = cache_drop_leaf_ref(trans, root, ref);
2966 BUG_ON(ret);
2967 btrfs_remove_leaf_ref(root, ref);
2968 btrfs_free_leaf_ref(root, ref);
2969 *level = 0;
2970 break;
2971 }
2972 if (printk_ratelimit()) {
2973 printk("leaf ref miss for bytenr %llu\n",
2974 (unsigned long long)bytenr);
2975 }
2976 }
2977 next = btrfs_find_tree_block(root, bytenr, blocksize);
2978 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2979 free_extent_buffer(next);
2980
2981 next = read_tree_block(root, bytenr, blocksize,
2982 ptr_gen);
2983 cond_resched();
2984#if 0
2985 /*
2986			 * this is a debugging check and can go away;
2987			 * the ref should never go all the way down to 1
2988			 * at this point.
2989 */
2990 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
2991 &refs);
2992 BUG_ON(ret);
2993 WARN_ON(refs != 1);
2994#endif
2995 }
2996 WARN_ON(*level <= 0);
2997 if (path->nodes[*level-1])
2998 free_extent_buffer(path->nodes[*level-1]);
2999 path->nodes[*level-1] = next;
3000 *level = btrfs_header_level(next);
3001 path->slots[*level] = 0;
3002 cond_resched();
3003 }
3004out:
3005 WARN_ON(*level < 0);
3006 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3007
3008 if (path->nodes[*level] == root->node) {
3009 parent = path->nodes[*level];
3010 bytenr = path->nodes[*level]->start;
3011 } else {
3012 parent = path->nodes[*level + 1];
3013 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3014 }
3015
3016 blocksize = btrfs_level_size(root, *level);
3017 root_owner = btrfs_header_owner(parent);
3018 root_gen = btrfs_header_generation(parent);
3019
3020 mutex_lock(&root->fs_info->alloc_mutex);
3021 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3022 parent->start, root_owner, root_gen,
3023 *level, 1);
3024 mutex_unlock(&root->fs_info->alloc_mutex);
3025 free_extent_buffer(path->nodes[*level]);
3026 path->nodes[*level] = NULL;
3027 *level += 1;
3028 BUG_ON(ret);
3029
3030 cond_resched();
3031 return 0;
3032}
3033
3034/*
3035 * helper for dropping snapshots. This walks back up the tree in the path
3036 * to find the first node higher up where we haven't yet gone through
3037 * all the slots
3038 */
3039static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root,
3041 struct btrfs_path *path, int *level)
3042{
3043 u64 root_owner;
3044 u64 root_gen;
3045 struct btrfs_root_item *root_item = &root->root_item;
3046 int i;
3047 int slot;
3048 int ret;
3049
3050 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
3051 slot = path->slots[i];
3052 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3053 struct extent_buffer *node;
3054 struct btrfs_disk_key disk_key;
3055 node = path->nodes[i];
3056 path->slots[i]++;
3057 *level = i;
3058 WARN_ON(*level == 0);
3059 btrfs_node_key(node, &disk_key, path->slots[i]);
3060 memcpy(&root_item->drop_progress,
3061 &disk_key, sizeof(disk_key));
3062 root_item->drop_level = i;
3063 return 0;
3064 } else {
3065 struct extent_buffer *parent;
3066 if (path->nodes[*level] == root->node)
3067 parent = path->nodes[*level];
3068 else
3069 parent = path->nodes[*level + 1];
3070
3071 root_owner = btrfs_header_owner(parent);
3072 root_gen = btrfs_header_generation(parent);
3073 ret = btrfs_free_extent(trans, root,
3074 path->nodes[*level]->start,
3075 path->nodes[*level]->len,
3076 parent->start, root_owner,
3077 root_gen, *level, 1);
3078 BUG_ON(ret);
3079 free_extent_buffer(path->nodes[*level]);
3080 path->nodes[*level] = NULL;
3081 *level = i + 1;
3082 }
3083 }
3084 return 1;
3085}
3086
3087/*
3088 * drop the reference count on the tree rooted at 'snap'. This traverses
3089 * the tree freeing any blocks that have a ref count of zero after being
3090 * decremented.
3091 */
3092int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3093 *root)
3094{
3095 int ret = 0;
3096 int wret;
3097 int level;
3098 struct btrfs_path *path;
3099 int i;
3100 int orig_level;
3101 struct btrfs_root_item *root_item = &root->root_item;
3102
3103 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3104 path = btrfs_alloc_path();
3105 BUG_ON(!path);
3106
3107 level = btrfs_header_level(root->node);
3108 orig_level = level;
3109 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3110 path->nodes[level] = root->node;
3111 extent_buffer_get(root->node);
3112 path->slots[level] = 0;
3113 } else {
3114 struct btrfs_key key;
3115 struct btrfs_disk_key found_key;
3116 struct extent_buffer *node;
3117
3118 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3119 level = root_item->drop_level;
3120 path->lowest_level = level;
3121 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3122 if (wret < 0) {
3123 ret = wret;
3124 goto out;
3125 }
3126 node = path->nodes[level];
3127 btrfs_node_key(node, &found_key, path->slots[level]);
3128 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3129 sizeof(found_key)));
3130 /*
3131		 * unlock our path; this is safe because only this
3132 * function is allowed to delete this snapshot
3133 */
3134 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3135 if (path->nodes[i] && path->locks[i]) {
3136 path->locks[i] = 0;
3137 btrfs_tree_unlock(path->nodes[i]);
3138 }
3139 }
3140 }
3141 while(1) {
3142 wret = walk_down_tree(trans, root, path, &level);
3143 if (wret > 0)
3144 break;
3145 if (wret < 0)
3146 ret = wret;
3147
3148 wret = walk_up_tree(trans, root, path, &level);
3149 if (wret > 0)
3150 break;
3151 if (wret < 0)
3152 ret = wret;
3153 if (trans->transaction->in_commit) {
3154 ret = -EAGAIN;
3155 break;
3156 }
3157 atomic_inc(&root->fs_info->throttle_gen);
3158 wake_up(&root->fs_info->transaction_throttle);
3159 }
3160 for (i = 0; i <= orig_level; i++) {
3161 if (path->nodes[i]) {
3162 free_extent_buffer(path->nodes[i]);
3163 path->nodes[i] = NULL;
3164 }
3165 }
3166out:
3167 btrfs_free_path(path);
3168 return ret;
3169}
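
/*
 * A minimal sketch (for illustration; nothing below calls it) of the
 * retry loop callers of btrfs_drop_snapshot() are expected to run.
 * btrfs_drop_dead_reloc_roots() further down does the same with extra
 * bookkeeping: -EAGAIN means the running transaction started to
 * commit, so the caller ends its handle and retries, and the
 * drop_progress key saved in the root item lets the next call resume
 * where the walk stopped.
 */
static int __attribute__((unused)) sketch_drop_snapshot_loop(
					struct btrfs_root *root,
					struct btrfs_root *snap)
{
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	int ret;

	while (1) {
		trans = btrfs_join_transaction(root, 1);
		BUG_ON(!trans);

		mutex_lock(&root->fs_info->drop_mutex);
		ret = btrfs_drop_snapshot(trans, snap);
		mutex_unlock(&root->fs_info->drop_mutex);
		if (ret != -EAGAIN)
			break;

		/* back off, let the commit finish and try again */
		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, root);
		BUG_ON(ret);
		btrfs_btree_balance_dirty(root, nr);
	}
	btrfs_end_transaction(trans, root);
	return ret;
}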
3170
3171static unsigned long calc_ra(unsigned long start, unsigned long last,
3172 unsigned long nr)
3173{
3174 return min(last, start + nr - 1);
3175}
3176
3177static int noinline relocate_inode_pages(struct inode *inode, u64 start,
3178 u64 len)
3179{
3180 u64 page_start;
3181 u64 page_end;
3182 unsigned long first_index;
3183 unsigned long last_index;
3184 unsigned long i;
3185 struct page *page;
3186 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3187 struct file_ra_state *ra;
3188 struct btrfs_ordered_extent *ordered;
3189 unsigned int total_read = 0;
3190 unsigned int total_dirty = 0;
3191 int ret = 0;
3192
3193 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3194
3195 mutex_lock(&inode->i_mutex);
3196 first_index = start >> PAGE_CACHE_SHIFT;
3197 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3198
3199	/* make sure the dirty trick played by the caller works */
3200 ret = invalidate_inode_pages2_range(inode->i_mapping,
3201 first_index, last_index);
3202 if (ret)
3203 goto out_unlock;
3204
3205 file_ra_state_init(ra, inode->i_mapping);
3206
3207 for (i = first_index ; i <= last_index; i++) {
3208 if (total_read % ra->ra_pages == 0) {
3209 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3210 calc_ra(i, last_index, ra->ra_pages));
3211 }
3212 total_read++;
3213again:
3214 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3215			BUG();
3216 page = grab_cache_page(inode->i_mapping, i);
3217 if (!page) {
3218 ret = -ENOMEM;
3219 goto out_unlock;
3220 }
3221 if (!PageUptodate(page)) {
3222 btrfs_readpage(NULL, page);
3223 lock_page(page);
3224 if (!PageUptodate(page)) {
3225 unlock_page(page);
3226 page_cache_release(page);
3227 ret = -EIO;
3228 goto out_unlock;
3229 }
3230 }
3231 wait_on_page_writeback(page);
3232
3233 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3234 page_end = page_start + PAGE_CACHE_SIZE - 1;
3235 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3236
3237 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3238 if (ordered) {
3239 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3240 unlock_page(page);
3241 page_cache_release(page);
3242 btrfs_start_ordered_extent(inode, ordered, 1);
3243 btrfs_put_ordered_extent(ordered);
3244 goto again;
3245 }
3246 set_page_extent_mapped(page);
3247
3248 btrfs_set_extent_delalloc(inode, page_start, page_end);
3249 if (i == first_index)
3250 set_extent_bits(io_tree, page_start, page_end,
3251 EXTENT_BOUNDARY, GFP_NOFS);
3252
3253 set_page_dirty(page);
3254 total_dirty++;
3255
3256 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3257 unlock_page(page);
3258 page_cache_release(page);
3259 }
3260
3261out_unlock:
3262 kfree(ra);
3263 mutex_unlock(&inode->i_mutex);
3264 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
3265 return ret;
3266}
3267
3268static int noinline relocate_data_extent(struct inode *reloc_inode,
3269 struct btrfs_key *extent_key,
3270 u64 offset)
3271{
3272 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
3273 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
3274 struct extent_map *em;
3275
3276 em = alloc_extent_map(GFP_NOFS);
3277 BUG_ON(!em || IS_ERR(em));
3278
3279 em->start = extent_key->objectid - offset;
3280 em->len = extent_key->offset;
3281 em->block_start = extent_key->objectid;
3282 em->bdev = root->fs_info->fs_devices->latest_bdev;
3283 set_bit(EXTENT_FLAG_PINNED, &em->flags);
3284
3285	/* set up an extent map to cheat btrfs_readpage */
3286 mutex_lock(&BTRFS_I(reloc_inode)->extent_mutex);
3287 while (1) {
3288 int ret;
3289 spin_lock(&em_tree->lock);
3290 ret = add_extent_mapping(em_tree, em);
3291 spin_unlock(&em_tree->lock);
3292 if (ret != -EEXIST) {
3293 free_extent_map(em);
3294 break;
3295 }
3296 btrfs_drop_extent_cache(reloc_inode, em->start,
3297 em->start + em->len - 1, 0);
3298 }
3299 mutex_unlock(&BTRFS_I(reloc_inode)->extent_mutex);
3300
3301 return relocate_inode_pages(reloc_inode, extent_key->objectid - offset,
3302 extent_key->offset);
3303}
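
/*
 * Summary of the trick above (descriptive only): the pinned extent map
 * makes reads of the reloc inode pull data from the extent's current
 * location, relocate_inode_pages() reads every page in the range and
 * redirties it as delalloc, and writeback later allocates new space
 * for the dirty pages, copying the data out of the block group that
 * is being emptied.
 */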
3304
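/*
 * btrfs_ref_path describes one chain of back references from an extent
 * up to a tree root, as walked by __next_ref_path() below: nodes[]
 * holds the bytenr of the referencing tree block at each level,
 * root_objectid/root_generation identify the root where the chain
 * ends, owner_objectid and num_refs come from the reference found at
 * the lowest level, and lowest_level/current_level track the state of
 * the walk.
 */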
3305struct btrfs_ref_path {
3306 u64 extent_start;
3307 u64 nodes[BTRFS_MAX_LEVEL];
3308 u64 root_objectid;
3309 u64 root_generation;
3310 u64 owner_objectid;
3311 u32 num_refs;
3312 int lowest_level;
3313 int current_level;
3314};
3315
3316struct disk_extent {
3317 u64 disk_bytenr;
3318 u64 disk_num_bytes;
3319 u64 offset;
3320 u64 num_bytes;
3321};
3322
3323static int is_cowonly_root(u64 root_objectid)
3324{
3325 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
3326 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
3327 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
3328 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
3329 root_objectid == BTRFS_TREE_LOG_OBJECTID)
3330 return 1;
3331 return 0;
3332}
3333
3334static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
3335 struct btrfs_root *extent_root,
3336 struct btrfs_ref_path *ref_path,
3337 int first_time)
3338{
3339 struct extent_buffer *leaf;
3340 struct btrfs_path *path;
3341 struct btrfs_extent_ref *ref;
3342 struct btrfs_key key;
3343 struct btrfs_key found_key;
3344 u64 bytenr;
3345 u32 nritems;
3346 int level;
3347 int ret = 1;
3348
3349 path = btrfs_alloc_path();
3350 if (!path)
3351 return -ENOMEM;
3352
3353 mutex_lock(&extent_root->fs_info->alloc_mutex);
3354
3355 if (first_time) {
3356 ref_path->lowest_level = -1;
3357 ref_path->current_level = -1;
3358 goto walk_up;
3359 }
3360walk_down:
3361 level = ref_path->current_level - 1;
3362 while (level >= -1) {
3363 u64 parent;
3364 if (level < ref_path->lowest_level)
3365 break;
3366
3367 if (level >= 0) {
3368 bytenr = ref_path->nodes[level];
3369 } else {
3370 bytenr = ref_path->extent_start;
3371 }
3372 BUG_ON(bytenr == 0);
3373
3374 parent = ref_path->nodes[level + 1];
3375 ref_path->nodes[level + 1] = 0;
3376 ref_path->current_level = level;
3377 BUG_ON(parent == 0);
3378
3379 key.objectid = bytenr;
3380 key.offset = parent + 1;
3381 key.type = BTRFS_EXTENT_REF_KEY;
3382
3383 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
3384 if (ret < 0)
3385 goto out;
3386 BUG_ON(ret == 0);
3387
3388 leaf = path->nodes[0];
3389 nritems = btrfs_header_nritems(leaf);
3390 if (path->slots[0] >= nritems) {
3391 ret = btrfs_next_leaf(extent_root, path);
3392 if (ret < 0)
3393 goto out;
3394 if (ret > 0)
3395 goto next;
3396 leaf = path->nodes[0];
3397 }
3398
3399 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3400 if (found_key.objectid == bytenr &&
3401 found_key.type == BTRFS_EXTENT_REF_KEY)
3402 goto found;
3403next:
3404 level--;
3405 btrfs_release_path(extent_root, path);
3406 if (need_resched()) {
3407 mutex_unlock(&extent_root->fs_info->alloc_mutex);
3408 cond_resched();
3409 mutex_lock(&extent_root->fs_info->alloc_mutex);
3410 }
3411 }
3412 /* reached lowest level */
3413 ret = 1;
3414 goto out;
3415walk_up:
3416 level = ref_path->current_level;
3417 while (level < BTRFS_MAX_LEVEL - 1) {
3418 u64 ref_objectid;
3419 if (level >= 0) {
3420 bytenr = ref_path->nodes[level];
3421 } else {
3422 bytenr = ref_path->extent_start;
3423 }
3424 BUG_ON(bytenr == 0);
3425
3426 key.objectid = bytenr;
3427 key.offset = 0;
3428 key.type = BTRFS_EXTENT_REF_KEY;
3429
3430 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
3431 if (ret < 0)
3432 goto out;
3433
3434 leaf = path->nodes[0];
3435 nritems = btrfs_header_nritems(leaf);
3436 if (path->slots[0] >= nritems) {
3437 ret = btrfs_next_leaf(extent_root, path);
3438 if (ret < 0)
3439 goto out;
3440 if (ret > 0) {
3441 /* the extent was freed by someone */
3442 if (ref_path->lowest_level == level)
3443 goto out;
3444 btrfs_release_path(extent_root, path);
3445 goto walk_down;
3446 }
3447 leaf = path->nodes[0];
3448 }
3449
3450 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3451 if (found_key.objectid != bytenr ||
3452 found_key.type != BTRFS_EXTENT_REF_KEY) {
3453 /* the extent was freed by someone */
3454 if (ref_path->lowest_level == level) {
3455 ret = 1;
3456 goto out;
3457 }
3458 btrfs_release_path(extent_root, path);
3459 goto walk_down;
3460 }
3461found:
3462 ref = btrfs_item_ptr(leaf, path->slots[0],
3463 struct btrfs_extent_ref);
3464 ref_objectid = btrfs_ref_objectid(leaf, ref);
3465 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3466 if (first_time) {
3467 level = (int)ref_objectid;
3468 BUG_ON(level >= BTRFS_MAX_LEVEL);
3469 ref_path->lowest_level = level;
3470 ref_path->current_level = level;
3471 ref_path->nodes[level] = bytenr;
3472 } else {
3473 WARN_ON(ref_objectid != level);
3474 }
3475 } else {
3476 WARN_ON(level != -1);
3477 }
3478 first_time = 0;
3479
3480 if (ref_path->lowest_level == level) {
3481 ref_path->owner_objectid = ref_objectid;
3482 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
3483 }
3484
3485 /*
3486		 * the block is a tree root or the block isn't in a
3487		 * reference counted tree.
3488 */
3489 if (found_key.objectid == found_key.offset ||
3490 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
3491 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
3492 ref_path->root_generation =
3493 btrfs_ref_generation(leaf, ref);
3494 if (level < 0) {
3495 /* special reference from the tree log */
3496 ref_path->nodes[0] = found_key.offset;
3497 ref_path->current_level = 0;
3498 }
3499 ret = 0;
3500 goto out;
3501 }
3502
3503 level++;
3504 BUG_ON(ref_path->nodes[level] != 0);
3505 ref_path->nodes[level] = found_key.offset;
3506 ref_path->current_level = level;
3507
3508 /*
3509 * the reference was created in the running transaction,
3510 * no need to continue walking up.
3511 */
3512 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
3513 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
3514 ref_path->root_generation =
3515 btrfs_ref_generation(leaf, ref);
3516 ret = 0;
3517 goto out;
3518 }
3519
3520 btrfs_release_path(extent_root, path);
3521 if (need_resched()) {
3522 mutex_unlock(&extent_root->fs_info->alloc_mutex);
3523 cond_resched();
3524 mutex_lock(&extent_root->fs_info->alloc_mutex);
3525 }
3526 }
3527 /* reached max tree level, but no tree root found. */
3528 BUG();
3529out:
3530 mutex_unlock(&extent_root->fs_info->alloc_mutex);
3531 btrfs_free_path(path);
3532 return ret;
3533}
3534
3535static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
3536 struct btrfs_root *extent_root,
3537 struct btrfs_ref_path *ref_path,
3538 u64 extent_start)
3539{
3540 memset(ref_path, 0, sizeof(*ref_path));
3541 ref_path->extent_start = extent_start;
3542
3543 return __next_ref_path(trans, extent_root, ref_path, 1);
3544}
3545
3546static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
3547 struct btrfs_root *extent_root,
3548 struct btrfs_ref_path *ref_path)
3549{
3550 return __next_ref_path(trans, extent_root, ref_path, 0);
3551}
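
/*
 * A minimal sketch (for illustration; nothing below calls it) of how
 * the two helpers above are driven.  relocate_one_extent() further
 * down runs the same loop with the real per-path work in the body.
 */
static int __attribute__((unused)) sketch_walk_ref_paths(
				struct btrfs_trans_handle *trans,
				struct btrfs_root *extent_root,
				struct btrfs_ref_path *ref_path,
				u64 extent_start)
{
	int ret;

	ret = btrfs_first_ref_path(trans, extent_root, ref_path,
				   extent_start);
	while (ret == 0) {
		/* ref_path->root_objectid now identifies one tree
		 * that references the extent at extent_start */
		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
	}
	/* ret > 0 means all reference paths have been visited */
	return ret < 0 ? ret : 0;
}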
3552
3553static int noinline get_new_locations(struct inode *reloc_inode,
3554 struct btrfs_key *extent_key,
3555 u64 offset, int no_fragment,
3556 struct disk_extent **extents,
3557 int *nr_extents)
3558{
3559 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
3560 struct btrfs_path *path;
3561 struct btrfs_file_extent_item *fi;
3562 struct extent_buffer *leaf;
3563 struct disk_extent *exts = *extents;
3564 struct btrfs_key found_key;
3565 u64 cur_pos;
3566 u64 last_byte;
3567 u32 nritems;
3568 int nr = 0;
3569 int max = *nr_extents;
3570 int ret;
3571
3572 WARN_ON(!no_fragment && *extents);
3573 if (!exts) {
3574 max = 1;
3575 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
3576 if (!exts)
3577 return -ENOMEM;
3578 }
3579
3580 path = btrfs_alloc_path();
3581 BUG_ON(!path);
3582
3583 cur_pos = extent_key->objectid - offset;
3584 last_byte = extent_key->objectid + extent_key->offset;
3585 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
3586 cur_pos, 0);
3587 if (ret < 0)
3588 goto out;
3589 if (ret > 0) {
3590 ret = -ENOENT;
3591 goto out;
3592 }
3593
3594 while (1) {
3595 leaf = path->nodes[0];
3596 nritems = btrfs_header_nritems(leaf);
3597 if (path->slots[0] >= nritems) {
3598 ret = btrfs_next_leaf(root, path);
3599 if (ret < 0)
3600 goto out;
3601 if (ret > 0)
3602 break;
3603 leaf = path->nodes[0];
3604 }
3605
3606 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3607 if (found_key.offset != cur_pos ||
3608 found_key.type != BTRFS_EXTENT_DATA_KEY ||
3609 found_key.objectid != reloc_inode->i_ino)
3610 break;
3611
3612 fi = btrfs_item_ptr(leaf, path->slots[0],
3613 struct btrfs_file_extent_item);
3614 if (btrfs_file_extent_type(leaf, fi) !=
3615 BTRFS_FILE_EXTENT_REG ||
3616 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
3617 break;
3618
3619 if (nr == max) {
3620 struct disk_extent *old = exts;
3621 max *= 2;
3622 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
3623 memcpy(exts, old, sizeof(*exts) * nr);
3624 if (old != *extents)
3625 kfree(old);
3626 }
3627
3628 exts[nr].disk_bytenr =
3629 btrfs_file_extent_disk_bytenr(leaf, fi);
3630 exts[nr].disk_num_bytes =
3631 btrfs_file_extent_disk_num_bytes(leaf, fi);
3632 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
3633 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
3634 WARN_ON(exts[nr].offset > 0);
3635 WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
3636
3637 cur_pos += exts[nr].num_bytes;
3638 nr++;
3639
3640 if (cur_pos + offset >= last_byte)
3641 break;
3642
3643 if (no_fragment) {
3644 ret = 1;
3645 goto out;
3646 }
3647 path->slots[0]++;
3648 }
3649
3650 WARN_ON(cur_pos + offset > last_byte);
3651 if (cur_pos + offset < last_byte) {
3652 ret = -ENOENT;
3653 goto out;
3654 }
3655 ret = 0;
3656out:
3657 btrfs_free_path(path);
3658 if (ret) {
3659 if (exts != *extents)
3660 kfree(exts);
3661 } else {
3662 *extents = exts;
3663 *nr_extents = nr;
3664 }
3665 return ret;
3666}
3667
3668static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
3669 struct btrfs_root *root,
3670 struct btrfs_path *path,
3671 struct btrfs_key *extent_key,
3672 struct btrfs_key *leaf_key,
3673 struct btrfs_ref_path *ref_path,
3674 struct disk_extent *new_extents,
3675 int nr_extents)
3676{
3677 struct extent_buffer *leaf;
3678 struct btrfs_file_extent_item *fi;
3679 struct inode *inode = NULL;
3680 struct btrfs_key key;
3681 u64 lock_start = 0;
3682 u64 lock_end = 0;
3683 u64 num_bytes;
3684 u64 ext_offset;
3685 u64 first_pos;
3686 u32 nritems;
3687 int nr_scaned = 0;
3688 int extent_locked = 0;
3689 int ret;
3690
3691 memcpy(&key, leaf_key, sizeof(key));
3692 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
3693 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
3694 if (key.objectid < ref_path->owner_objectid ||
3695 (key.objectid == ref_path->owner_objectid &&
3696 key.type < BTRFS_EXTENT_DATA_KEY)) {
3697 key.objectid = ref_path->owner_objectid;
3698 key.type = BTRFS_EXTENT_DATA_KEY;
3699 key.offset = 0;
3700 }
3701 }
3702
3703 while (1) {
3704 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3705 if (ret < 0)
3706 goto out;
3707
3708 leaf = path->nodes[0];
3709 nritems = btrfs_header_nritems(leaf);
3710next:
3711 if (extent_locked && ret > 0) {
3712 /*
3713 * the file extent item was modified by someone
3714 * before the extent got locked.
3715 */
3716 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
3717 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
3718 lock_end, GFP_NOFS);
3719 extent_locked = 0;
3720 }
3721
3722 if (path->slots[0] >= nritems) {
3723 if (++nr_scaned > 2)
3724 break;
3725
3726 BUG_ON(extent_locked);
3727 ret = btrfs_next_leaf(root, path);
3728 if (ret < 0)
3729 goto out;
3730 if (ret > 0)
3731 break;
3732 leaf = path->nodes[0];
3733 nritems = btrfs_header_nritems(leaf);
3734 }
3735
3736 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3737
3738 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
3739 if ((key.objectid > ref_path->owner_objectid) ||
3740 (key.objectid == ref_path->owner_objectid &&
3741 key.type > BTRFS_EXTENT_DATA_KEY) ||
3742 (key.offset >= first_pos + extent_key->offset))
3743 break;
3744 }
3745
3746 if (inode && key.objectid != inode->i_ino) {
3747 BUG_ON(extent_locked);
3748 btrfs_release_path(root, path);
3749 mutex_unlock(&inode->i_mutex);
3750 iput(inode);
3751 inode = NULL;
3752 continue;
3753 }
3754
3755 if (key.type != BTRFS_EXTENT_DATA_KEY) {
3756 path->slots[0]++;
3757 ret = 1;
3758 goto next;
3759 }
3760 fi = btrfs_item_ptr(leaf, path->slots[0],
3761 struct btrfs_file_extent_item);
3762 if ((btrfs_file_extent_type(leaf, fi) !=
3763 BTRFS_FILE_EXTENT_REG) ||
3764 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3765 extent_key->objectid)) {
3766 path->slots[0]++;
3767 ret = 1;
3768 goto next;
3769 }
3770
3771 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
3772 ext_offset = btrfs_file_extent_offset(leaf, fi);
3773
3774 if (first_pos > key.offset - ext_offset)
3775 first_pos = key.offset - ext_offset;
3776
3777 if (!extent_locked) {
3778 lock_start = key.offset;
3779 lock_end = lock_start + num_bytes - 1;
3780 } else {
3781 BUG_ON(lock_start != key.offset);
3782 BUG_ON(lock_end - lock_start + 1 < num_bytes);
3783 }
3784
3785 if (!inode) {
3786 btrfs_release_path(root, path);
3787
3788 inode = btrfs_iget_locked(root->fs_info->sb,
3789 key.objectid, root);
3790 if (inode->i_state & I_NEW) {
3791 BTRFS_I(inode)->root = root;
3792 BTRFS_I(inode)->location.objectid =
3793 key.objectid;
3794 BTRFS_I(inode)->location.type =
3795 BTRFS_INODE_ITEM_KEY;
3796 BTRFS_I(inode)->location.offset = 0;
3797 btrfs_read_locked_inode(inode);
3798 unlock_new_inode(inode);
3799 }
3800 /*
3801			 * some code calls btrfs_commit_transaction while
3802 * holding the i_mutex, so we can't use mutex_lock
3803 * here.
3804 */
3805 if (is_bad_inode(inode) ||
3806 !mutex_trylock(&inode->i_mutex)) {
3807 iput(inode);
3808 inode = NULL;
3809 key.offset = (u64)-1;
3810 goto skip;
3811 }
3812 }
3813
3814 if (!extent_locked) {
3815 struct btrfs_ordered_extent *ordered;
3816
3817 btrfs_release_path(root, path);
3818
3819 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
3820 lock_end, GFP_NOFS);
3821 ordered = btrfs_lookup_first_ordered_extent(inode,
3822 lock_end);
3823 if (ordered &&
3824 ordered->file_offset <= lock_end &&
3825 ordered->file_offset + ordered->len > lock_start) {
3826 unlock_extent(&BTRFS_I(inode)->io_tree,
3827 lock_start, lock_end, GFP_NOFS);
3828 btrfs_start_ordered_extent(inode, ordered, 1);
3829 btrfs_put_ordered_extent(ordered);
3830 key.offset += num_bytes;
3831 goto skip;
3832 }
3833 if (ordered)
3834 btrfs_put_ordered_extent(ordered);
3835
3836 mutex_lock(&BTRFS_I(inode)->extent_mutex);
3837 extent_locked = 1;
3838 continue;
3839 }
3840
3841 if (nr_extents == 1) {
3842 /* update extent pointer in place */
3843 btrfs_set_file_extent_generation(leaf, fi,
3844 trans->transid);
3845 btrfs_set_file_extent_disk_bytenr(leaf, fi,
3846 new_extents[0].disk_bytenr);
3847 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
3848 new_extents[0].disk_num_bytes);
3849 ext_offset += new_extents[0].offset;
3850 btrfs_set_file_extent_offset(leaf, fi, ext_offset);
3851 btrfs_mark_buffer_dirty(leaf);
3852
3853 btrfs_drop_extent_cache(inode, key.offset,
3854 key.offset + num_bytes - 1, 0);
3855
3856 ret = btrfs_inc_extent_ref(trans, root,
3857 new_extents[0].disk_bytenr,
3858 new_extents[0].disk_num_bytes,
3859 leaf->start,
3860 root->root_key.objectid,
3861 trans->transid,
3862 key.objectid);
3863 BUG_ON(ret);
3864
3865 ret = btrfs_free_extent(trans, root,
3866 extent_key->objectid,
3867 extent_key->offset,
3868 leaf->start,
3869 btrfs_header_owner(leaf),
3870 btrfs_header_generation(leaf),
3871 key.objectid, 0);
3872 BUG_ON(ret);
3873
3874 btrfs_release_path(root, path);
3875 key.offset += num_bytes;
3876 } else {
3877 u64 alloc_hint;
3878 u64 extent_len;
3879 int i;
3880 /*
3881			 * drop the old extent pointer first, then insert the
3882			 * new pointers one by one
3883 */
3884 btrfs_release_path(root, path);
3885 ret = btrfs_drop_extents(trans, root, inode, key.offset,
3886 key.offset + num_bytes,
3887 key.offset, &alloc_hint);
3888 BUG_ON(ret);
3889
3890 for (i = 0; i < nr_extents; i++) {
3891 if (ext_offset >= new_extents[i].num_bytes) {
3892 ext_offset -= new_extents[i].num_bytes;
3893 continue;
3894 }
3895 extent_len = min(new_extents[i].num_bytes -
3896 ext_offset, num_bytes);
3897
3898 ret = btrfs_insert_empty_item(trans, root,
3899 path, &key,
3900 sizeof(*fi));
3901 BUG_ON(ret);
3902
3903 leaf = path->nodes[0];
3904 fi = btrfs_item_ptr(leaf, path->slots[0],
3905 struct btrfs_file_extent_item);
3906 btrfs_set_file_extent_generation(leaf, fi,
3907 trans->transid);
3908 btrfs_set_file_extent_type(leaf, fi,
3909 BTRFS_FILE_EXTENT_REG);
3910 btrfs_set_file_extent_disk_bytenr(leaf, fi,
3911 new_extents[i].disk_bytenr);
3912 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
3913 new_extents[i].disk_num_bytes);
3914 btrfs_set_file_extent_num_bytes(leaf, fi,
3915 extent_len);
3916 ext_offset += new_extents[i].offset;
3917 btrfs_set_file_extent_offset(leaf, fi,
3918 ext_offset);
3919 btrfs_mark_buffer_dirty(leaf);
3920
3921 btrfs_drop_extent_cache(inode, key.offset,
3922 key.offset + extent_len - 1, 0);
3923
3924 ret = btrfs_inc_extent_ref(trans, root,
3925 new_extents[i].disk_bytenr,
3926 new_extents[i].disk_num_bytes,
3927 leaf->start,
3928 root->root_key.objectid,
3929 trans->transid, key.objectid);
3930 BUG_ON(ret);
3931 btrfs_release_path(root, path);
3932
3933 inode_add_bytes(inode, extent_len);
3934
3935 ext_offset = 0;
3936 num_bytes -= extent_len;
3937 key.offset += extent_len;
3938
3939 if (num_bytes == 0)
3940 break;
3941 }
3942 BUG_ON(i >= nr_extents);
3943 }
3944
3945 if (extent_locked) {
3946 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
3947 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
3948 lock_end, GFP_NOFS);
3949 extent_locked = 0;
3950 }
3951skip:
3952 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
3953 key.offset >= first_pos + extent_key->offset)
3954 break;
3955
3956 cond_resched();
3957 }
3958 ret = 0;
3959out:
3960 btrfs_release_path(root, path);
3961 if (inode) {
3962 mutex_unlock(&inode->i_mutex);
3963 if (extent_locked) {
3964 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
3965 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
3966 lock_end, GFP_NOFS);
3967 }
3968 iput(inode);
3969 }
3970 return ret;
3971}
3972
3973int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
3974 u64 num_bytes, u64 new_bytenr)
3975{
3976 set_extent_bits(&root->fs_info->reloc_mapping_tree,
3977 orig_bytenr, orig_bytenr + num_bytes - 1,
3978 EXTENT_LOCKED, GFP_NOFS);
3979 set_state_private(&root->fs_info->reloc_mapping_tree,
3980 orig_bytenr, new_bytenr);
3981 return 0;
3982}
3983
3984int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
3985 u64 num_bytes, u64 *new_bytenr)
3986{
3987 u64 bytenr;
3988 u64 cur_bytenr = orig_bytenr;
3989 u64 prev_bytenr = orig_bytenr;
3990 int ret;
3991
3992 while (1) {
3993 ret = get_state_private(&root->fs_info->reloc_mapping_tree,
3994 cur_bytenr, &bytenr);
3995 if (ret)
3996 break;
3997 prev_bytenr = cur_bytenr;
3998 cur_bytenr = bytenr;
3999 }
4000
4001 if (orig_bytenr == cur_bytenr)
4002 return -ENOENT;
4003
4004 if (prev_bytenr != orig_bytenr) {
4005 set_state_private(&root->fs_info->reloc_mapping_tree,
4006 orig_bytenr, cur_bytenr);
4007 }
4008 *new_bytenr = cur_bytenr;
4009 return 0;
4010}
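
/*
 * Example (descriptive only): if block A was relocated to B and B was
 * later relocated to C, the tree holds the records A -> B and B -> C.
 * btrfs_get_reloc_mapping(root, A, len, &new) chases the chain and
 * returns C, and also re-points the record for A straight at C so the
 * next lookup finishes in one step.
 */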
4011
4012void btrfs_free_reloc_mappings(struct btrfs_root *root)
4013{
4014 clear_extent_bits(&root->fs_info->reloc_mapping_tree,
4015 0, (u64)-1, -1, GFP_NOFS);
4016}
4017
4018int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4019 struct btrfs_root *root,
4020 struct extent_buffer *buf, u64 orig_start)
4021{
4022 int level;
4023 int ret;
4024
4025 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4026 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4027
4028 level = btrfs_header_level(buf);
4029 if (level == 0) {
4030 struct btrfs_leaf_ref *ref;
4031 struct btrfs_leaf_ref *orig_ref;
4032
4033 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4034 if (!orig_ref)
4035 return -ENOENT;
4036
4037 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4038 if (!ref) {
4039 btrfs_free_leaf_ref(root, orig_ref);
4040 return -ENOMEM;
4041 }
4042
4043 ref->nritems = orig_ref->nritems;
4044 memcpy(ref->extents, orig_ref->extents,
4045 sizeof(ref->extents[0]) * ref->nritems);
4046
4047 btrfs_free_leaf_ref(root, orig_ref);
4048
4049 ref->root_gen = trans->transid;
4050 ref->bytenr = buf->start;
4051 ref->owner = btrfs_header_owner(buf);
4052 ref->generation = btrfs_header_generation(buf);
4053 ret = btrfs_add_leaf_ref(root, ref, 0);
4054 WARN_ON(ret);
4055 btrfs_free_leaf_ref(root, ref);
4056 }
4057 return 0;
4058}
4059
4060static int noinline invalidate_extent_cache(struct btrfs_root *root,
4061 struct extent_buffer *leaf,
4062 struct btrfs_block_group_cache *group,
4063 struct btrfs_root *target_root)
4064{
4065 struct btrfs_key key;
4066 struct inode *inode = NULL;
4067 struct btrfs_file_extent_item *fi;
4068 u64 num_bytes;
4069 u64 skip_objectid = 0;
4070 u32 nritems;
4071 u32 i;
4072
4073 nritems = btrfs_header_nritems(leaf);
4074 for (i = 0; i < nritems; i++) {
4075 btrfs_item_key_to_cpu(leaf, &key, i);
4076 if (key.objectid == skip_objectid ||
4077 key.type != BTRFS_EXTENT_DATA_KEY)
4078 continue;
4079 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4080 if (btrfs_file_extent_type(leaf, fi) ==
4081 BTRFS_FILE_EXTENT_INLINE)
4082 continue;
4083 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4084 continue;
4085 if (!inode || inode->i_ino != key.objectid) {
4086 iput(inode);
4087 inode = btrfs_ilookup(target_root->fs_info->sb,
4088 key.objectid, target_root, 1);
4089 }
4090 if (!inode) {
4091 skip_objectid = key.objectid;
4092 continue;
4093 }
4094 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4095
4096 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4097 key.offset + num_bytes - 1, GFP_NOFS);
4098 mutex_lock(&BTRFS_I(inode)->extent_mutex);
4099 btrfs_drop_extent_cache(inode, key.offset,
4100 key.offset + num_bytes - 1, 1);
4101 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
4102 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4103 key.offset + num_bytes - 1, GFP_NOFS);
4104 cond_resched();
4105 }
4106 iput(inode);
4107 return 0;
4108}
4109
4110static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4111 struct btrfs_root *root,
4112 struct extent_buffer *leaf,
4113 struct btrfs_block_group_cache *group,
4114 struct inode *reloc_inode)
4115{
4116 struct btrfs_key key;
4117 struct btrfs_key extent_key;
4118 struct btrfs_file_extent_item *fi;
4119 struct btrfs_leaf_ref *ref;
4120 struct disk_extent *new_extent;
4121 u64 bytenr;
4122 u64 num_bytes;
4123 u32 nritems;
4124 u32 i;
4125 int ext_index;
4126 int nr_extent;
4127 int ret;
4128
4129 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4130 BUG_ON(!new_extent);
4131
4132 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4133 BUG_ON(!ref);
4134
4135 ext_index = -1;
4136 nritems = btrfs_header_nritems(leaf);
4137 for (i = 0; i < nritems; i++) {
4138 btrfs_item_key_to_cpu(leaf, &key, i);
4139 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4140 continue;
4141 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4142 if (btrfs_file_extent_type(leaf, fi) ==
4143 BTRFS_FILE_EXTENT_INLINE)
4144 continue;
4145 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4146 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4147 if (bytenr == 0)
4148 continue;
4149
4150 ext_index++;
4151 if (bytenr >= group->key.objectid + group->key.offset ||
4152 bytenr + num_bytes <= group->key.objectid)
4153 continue;
4154
4155 extent_key.objectid = bytenr;
4156 extent_key.offset = num_bytes;
4157 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4158 nr_extent = 1;
4159 ret = get_new_locations(reloc_inode, &extent_key,
4160 group->key.objectid, 1,
4161 &new_extent, &nr_extent);
4162 if (ret > 0)
4163 continue;
4164 BUG_ON(ret < 0);
4165
4166 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4167 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4168 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4169 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4170
4171 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
4172 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4173 new_extent->disk_bytenr);
4174 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4175 new_extent->disk_num_bytes);
4176 new_extent->offset += btrfs_file_extent_offset(leaf, fi);
4177 btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
4178 btrfs_mark_buffer_dirty(leaf);
4179
4180 ret = btrfs_inc_extent_ref(trans, root,
4181 new_extent->disk_bytenr,
4182 new_extent->disk_num_bytes,
4183 leaf->start,
4184 root->root_key.objectid,
4185 trans->transid, key.objectid);
4186 BUG_ON(ret);
4187 ret = btrfs_free_extent(trans, root,
4188 bytenr, num_bytes, leaf->start,
4189 btrfs_header_owner(leaf),
4190 btrfs_header_generation(leaf),
4191 key.objectid, 0);
4192 BUG_ON(ret);
4193 cond_resched();
4194 }
4195 kfree(new_extent);
4196 BUG_ON(ext_index + 1 != ref->nritems);
4197 btrfs_free_leaf_ref(root, ref);
4198 return 0;
4199}
4200
4201int btrfs_free_reloc_root(struct btrfs_root *root)
4202{
4203 struct btrfs_root *reloc_root;
4204
4205 if (root->reloc_root) {
4206 reloc_root = root->reloc_root;
4207 root->reloc_root = NULL;
4208 list_add(&reloc_root->dead_list,
4209 &root->fs_info->dead_reloc_roots);
4210 }
4211 return 0;
4212}
4213
4214int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4215{
4216 struct btrfs_trans_handle *trans;
4217 struct btrfs_root *reloc_root;
4218 struct btrfs_root *prev_root = NULL;
4219 struct list_head dead_roots;
4220 int ret;
4221 unsigned long nr;
4222
4223 INIT_LIST_HEAD(&dead_roots);
4224 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4225
4226 while (!list_empty(&dead_roots)) {
4227 reloc_root = list_entry(dead_roots.prev,
4228 struct btrfs_root, dead_list);
4229 list_del_init(&reloc_root->dead_list);
4230
4231 BUG_ON(reloc_root->commit_root != NULL);
4232 while (1) {
4233 trans = btrfs_join_transaction(root, 1);
4234 BUG_ON(!trans);
4235
4236 mutex_lock(&root->fs_info->drop_mutex);
4237 ret = btrfs_drop_snapshot(trans, reloc_root);
4238 if (ret != -EAGAIN)
4239 break;
4240 mutex_unlock(&root->fs_info->drop_mutex);
4241
4242 nr = trans->blocks_used;
4243 ret = btrfs_end_transaction(trans, root);
4244 BUG_ON(ret);
4245 btrfs_btree_balance_dirty(root, nr);
4246 }
4247
4248 free_extent_buffer(reloc_root->node);
4249
4250 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4251 &reloc_root->root_key);
4252 BUG_ON(ret);
4253 mutex_unlock(&root->fs_info->drop_mutex);
4254
4255 nr = trans->blocks_used;
4256 ret = btrfs_end_transaction(trans, root);
4257 BUG_ON(ret);
4258 btrfs_btree_balance_dirty(root, nr);
4259
4260 kfree(prev_root);
4261 prev_root = reloc_root;
4262 }
4263 if (prev_root) {
4264 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4265 kfree(prev_root);
4266 }
4267 return 0;
4268}
4269
4270int btrfs_add_dead_reloc_root(struct btrfs_root *root)
4271{
4272 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
4273 return 0;
4274}
4275
4276int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
4277{
4278 struct btrfs_root *reloc_root;
4279 struct btrfs_trans_handle *trans;
4280 struct btrfs_key location;
4281 int found;
4282 int ret;
4283
4284 mutex_lock(&root->fs_info->tree_reloc_mutex);
4285 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
4286 BUG_ON(ret);
4287 found = !list_empty(&root->fs_info->dead_reloc_roots);
4288 mutex_unlock(&root->fs_info->tree_reloc_mutex);
4289
4290 if (found) {
4291 trans = btrfs_start_transaction(root, 1);
4292 BUG_ON(!trans);
4293 ret = btrfs_commit_transaction(trans, root);
4294 BUG_ON(ret);
4295 }
4296
4297 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
4298 location.offset = (u64)-1;
4299 location.type = BTRFS_ROOT_ITEM_KEY;
4300
4301 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
4302 BUG_ON(!reloc_root);
4303 btrfs_orphan_cleanup(reloc_root);
4304 return 0;
4305}
4306
4307static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
4308 struct btrfs_root *root)
4309{
4310 struct btrfs_root *reloc_root;
4311 struct extent_buffer *eb;
4312 struct btrfs_root_item *root_item;
4313 struct btrfs_key root_key;
4314 int ret;
4315
4316 BUG_ON(!root->ref_cows);
4317 if (root->reloc_root)
4318 return 0;
4319
4320 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
4321 BUG_ON(!root_item);
4322
4323 ret = btrfs_copy_root(trans, root, root->commit_root,
4324 &eb, BTRFS_TREE_RELOC_OBJECTID);
4325 BUG_ON(ret);
4326
4327 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4328 root_key.offset = root->root_key.objectid;
4329 root_key.type = BTRFS_ROOT_ITEM_KEY;
4330
4331	memcpy(root_item, &root->root_item, sizeof(*root_item));
4332 btrfs_set_root_refs(root_item, 0);
4333 btrfs_set_root_bytenr(root_item, eb->start);
4334 btrfs_set_root_level(root_item, btrfs_header_level(eb));
4335 memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
4336 root_item->drop_level = 0;
4337
4338 btrfs_tree_unlock(eb);
4339 free_extent_buffer(eb);
4340
4341 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
4342 &root_key, root_item);
4343 BUG_ON(ret);
4344 kfree(root_item);
4345
4346 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
4347 &root_key);
4348 BUG_ON(!reloc_root);
4349 reloc_root->last_trans = trans->transid;
4350 reloc_root->commit_root = NULL;
4351 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
4352
4353 root->reloc_root = reloc_root;
4354 return 0;
4355}
4356
4357/*
4358 * Core function of space balance.
4359 *
4360 * The idea is to use reloc trees to relocate tree blocks in reference
4361 * counted roots. There is one reloc tree for each subvol, and all reloc
4362 * trees share the same key objectid. Reloc trees are snapshots of the
4363 * latest committed roots (subvol root->commit_root). To relocate a tree
4364 * block referenced by a subvol, the code COWs the block through the
4365 * reloc tree, then updates the pointer in the subvol to point to the
4366 * new block. Since all reloc trees share the same key objectid, we can
4367 * easily do special handling to share tree blocks between reloc trees.
4368 * Once a tree block has been COWed in one reloc tree, we can reuse the
4369 * result when the same block is COWed again through another reloc tree.
4370 */
4371static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
4372 struct btrfs_root *root,
4373 struct btrfs_path *path,
4374 struct btrfs_key *first_key,
4375 struct btrfs_ref_path *ref_path,
4376 struct btrfs_block_group_cache *group,
4377 struct inode *reloc_inode)
4378{
4379 struct btrfs_root *reloc_root;
4380 struct extent_buffer *eb = NULL;
4381 struct btrfs_key *keys;
4382 u64 *nodes;
4383 int level;
4384 int lowest_merge;
4385 int lowest_level = 0;
4386 int update_refs;
4387 int ret;
4388
4389 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
4390 lowest_level = ref_path->owner_objectid;
4391
4392 if (is_cowonly_root(ref_path->root_objectid)) {
4393 path->lowest_level = lowest_level;
4394 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
4395 BUG_ON(ret < 0);
4396 path->lowest_level = 0;
4397 btrfs_release_path(root, path);
4398 return 0;
4399 }
4400
4401 keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
4402 BUG_ON(!keys);
4403 nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
4404 BUG_ON(!nodes);
4405
4406 mutex_lock(&root->fs_info->tree_reloc_mutex);
4407 ret = init_reloc_tree(trans, root);
4408 BUG_ON(ret);
4409 reloc_root = root->reloc_root;
4410
4411 path->lowest_level = lowest_level;
4412 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0);
4413 BUG_ON(ret);
4414 /*
4415 * get relocation mapping for tree blocks in the path
4416 */
4417 lowest_merge = BTRFS_MAX_LEVEL;
4418 for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
4419 u64 new_bytenr;
4420 eb = path->nodes[level];
4421 if (!eb || eb == reloc_root->node)
4422 continue;
4423 ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
4424 &new_bytenr);
4425 if (ret)
4426 continue;
4427 if (level == 0)
4428 btrfs_item_key_to_cpu(eb, &keys[level], 0);
4429 else
4430 btrfs_node_key_to_cpu(eb, &keys[level], 0);
4431 nodes[level] = new_bytenr;
4432 lowest_merge = level;
4433 }
4434
4435 update_refs = 0;
4436 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4437 eb = path->nodes[0];
4438 if (btrfs_header_generation(eb) < trans->transid)
4439 update_refs = 1;
4440 }
4441
4442 btrfs_release_path(reloc_root, path);
4443 /*
4444	 * merge tree blocks that have already been relocated in other reloc trees
4445 */
4446 if (lowest_merge != BTRFS_MAX_LEVEL) {
4447 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
4448 lowest_merge);
4449 BUG_ON(ret < 0);
4450 }
4451 /*
4452 * cow any tree blocks that still haven't been relocated
4453 */
4454 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
4455 BUG_ON(ret);
4456 /*
4457	 * if we are relocating a data block group, update extent pointers
4458 * in the newly created tree leaf.
4459 */
4460 eb = path->nodes[0];
4461 if (update_refs && nodes[0] != eb->start) {
4462 ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
4463 reloc_inode);
4464 BUG_ON(ret);
4465 }
4466
4467 memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
4468 memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
4469 for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
4470 eb = path->nodes[level];
4471 if (!eb || eb == reloc_root->node)
4472 continue;
4473 BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
4474 nodes[level] = eb->start;
4475 if (level == 0)
4476 btrfs_item_key_to_cpu(eb, &keys[level], 0);
4477 else
4478 btrfs_node_key_to_cpu(eb, &keys[level], 0);
4479 }
4480
4481 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4482 eb = path->nodes[0];
4483 extent_buffer_get(eb);
4484 }
4485 btrfs_release_path(reloc_root, path);
4486 /*
4487 * replace tree blocks in the fs tree with tree blocks in
4488 * the reloc tree.
4489 */
4490 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
4491 BUG_ON(ret < 0);
4492
4493 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4494 ret = invalidate_extent_cache(reloc_root, eb, group, root);
4495 BUG_ON(ret);
4496 free_extent_buffer(eb);
4497 }
4498 mutex_unlock(&root->fs_info->tree_reloc_mutex);
4499
4500 path->lowest_level = 0;
4501 kfree(nodes);
4502 kfree(keys);
4503 return 0;
4504}
4505
4506static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
4507 struct btrfs_root *root,
4508 struct btrfs_path *path,
4509 struct btrfs_key *first_key,
4510 struct btrfs_ref_path *ref_path)
4511{
4512 int ret;
4513 int needs_lock = 0;
4514
4515 if (root == root->fs_info->extent_root ||
4516 root == root->fs_info->chunk_root ||
4517 root == root->fs_info->dev_root) {
4518 needs_lock = 1;
4519 mutex_lock(&root->fs_info->alloc_mutex);
4520 }
4521
4522 ret = relocate_one_path(trans, root, path, first_key,
4523 ref_path, NULL, NULL);
4524 BUG_ON(ret);
4525
4526 if (root == root->fs_info->extent_root)
4527 btrfs_extent_post_op(trans, root);
4528 if (needs_lock)
4529 mutex_unlock(&root->fs_info->alloc_mutex);
4530
4531 return 0;
4532}
4533
4534static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
4535 struct btrfs_root *extent_root,
4536 struct btrfs_path *path,
4537 struct btrfs_key *extent_key)
4538{
4539 int ret;
4540
4541 mutex_lock(&extent_root->fs_info->alloc_mutex);
4542 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
4543 if (ret)
4544 goto out;
4545 ret = btrfs_del_item(trans, extent_root, path);
4546out:
4547 btrfs_release_path(extent_root, path);
4548 mutex_unlock(&extent_root->fs_info->alloc_mutex);
4549 return ret;
4550}
4551
4552static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
4553 struct btrfs_ref_path *ref_path)
4554{
4555 struct btrfs_key root_key;
4556
4557 root_key.objectid = ref_path->root_objectid;
4558 root_key.type = BTRFS_ROOT_ITEM_KEY;
4559 if (is_cowonly_root(ref_path->root_objectid))
4560 root_key.offset = 0;
4561 else
4562 root_key.offset = (u64)-1;
4563
4564 return btrfs_read_fs_root_no_name(fs_info, &root_key);
4565}
4566
4567static int noinline relocate_one_extent(struct btrfs_root *extent_root,
4568 struct btrfs_path *path,
4569 struct btrfs_key *extent_key,
4570 struct btrfs_block_group_cache *group,
4571 struct inode *reloc_inode, int pass)
4572{
4573 struct btrfs_trans_handle *trans;
4574 struct btrfs_root *found_root;
4575 struct btrfs_ref_path *ref_path = NULL;
4576 struct disk_extent *new_extents = NULL;
4577 int nr_extents = 0;
4578 int loops;
4579 int ret;
4580 int level;
4581 struct btrfs_key first_key;
4582 u64 prev_block = 0;
4583
4584 mutex_unlock(&extent_root->fs_info->alloc_mutex);
4585
4586 trans = btrfs_start_transaction(extent_root, 1);
4587 BUG_ON(!trans);
4588
4589 if (extent_key->objectid == 0) {
4590 ret = del_extent_zero(trans, extent_root, path, extent_key);
4591 goto out;
4592 }
4593
4594 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
4595 if (!ref_path) {
4596 ret = -ENOMEM;
4597 goto out;
4598 }
4599
4600 for (loops = 0; ; loops++) {
4601 if (loops == 0) {
4602 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
4603 extent_key->objectid);
4604 } else {
4605 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
4606 }
4607 if (ret < 0)
4608 goto out;
4609 if (ret > 0)
4610 break;
4611
4612 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4613 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
4614 continue;
4615
4616 found_root = read_ref_root(extent_root->fs_info, ref_path);
4617 BUG_ON(!found_root);
4618 /*
4619		 * for a reference counted tree, only process reference paths
4620 * rooted at the latest committed root.
4621 */
4622 if (found_root->ref_cows &&
4623 ref_path->root_generation != found_root->root_key.offset)
4624 continue;
4625
4626 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
4627 if (pass == 0) {
4628 /*
4629 * copy data extents to new locations
4630 */
4631 u64 group_start = group->key.objectid;
4632 ret = relocate_data_extent(reloc_inode,
4633 extent_key,
4634 group_start);
4635 if (ret < 0)
4636 goto out;
4637 break;
4638 }
4639 level = 0;
4640 } else {
4641 level = ref_path->owner_objectid;
4642 }
4643
4644 if (prev_block != ref_path->nodes[level]) {
4645 struct extent_buffer *eb;
4646 u64 block_start = ref_path->nodes[level];
4647 u64 block_size = btrfs_level_size(found_root, level);
4648
4649 eb = read_tree_block(found_root, block_start,
4650 block_size, 0);
4651 btrfs_tree_lock(eb);
4652 BUG_ON(level != btrfs_header_level(eb));
4653
4654 if (level == 0)
4655 btrfs_item_key_to_cpu(eb, &first_key, 0);
4656 else
4657 btrfs_node_key_to_cpu(eb, &first_key, 0);
4658
4659 btrfs_tree_unlock(eb);
4660 free_extent_buffer(eb);
4661 prev_block = block_start;
4662 }
4663
4664 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
4665 pass >= 2) {
4666 /*
4667			 * use the fallback method to process the remaining
4668 * references.
4669 */
4670 if (!new_extents) {
4671 u64 group_start = group->key.objectid;
4672 ret = get_new_locations(reloc_inode,
4673 extent_key,
4674 group_start, 0,
4675 &new_extents,
4676 &nr_extents);
4677 if (ret < 0)
4678 goto out;
4679 }
4680 btrfs_record_root_in_trans(found_root);
4681 ret = replace_one_extent(trans, found_root,
4682 path, extent_key,
4683 &first_key, ref_path,
4684 new_extents, nr_extents);
4685 if (ret < 0)
4686 goto out;
4687 continue;
4688 }
4689
4690 btrfs_record_root_in_trans(found_root);
4691 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4692 ret = relocate_tree_block(trans, found_root, path,
4693 &first_key, ref_path);
4694 } else {
4695 /*
4696 * try to update data extent references while
4697 * keeping metadata shared between snapshots.
4698 */
4699 ret = relocate_one_path(trans, found_root, path,
4700 &first_key, ref_path,
4701 group, reloc_inode);
4702 }
4703 if (ret < 0)
4704 goto out;
4705 }
4706 ret = 0;
4707out:
4708 btrfs_end_transaction(trans, extent_root);
4709 kfree(new_extents);
4710 kfree(ref_path);
4711 mutex_lock(&extent_root->fs_info->alloc_mutex);
4712 return ret;
4713}
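
/*
 * Note on the pass argument (descriptive only): on pass 0 data extents
 * are only copied into the reloc inode via relocate_data_extent();
 * later passes update the references through the reloc trees, and from
 * pass 2 on any remaining data references are rewritten one by one via
 * the fallback replace_one_extent() path.
 */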
4714
4715static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
4716{
4717 u64 num_devices;
4718 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
4719 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
4720
4721 num_devices = root->fs_info->fs_devices->num_devices;
4722 if (num_devices == 1) {
4723 stripped |= BTRFS_BLOCK_GROUP_DUP;
4724 stripped = flags & ~stripped;
4725
4726 /* turn raid0 into single device chunks */
4727 if (flags & BTRFS_BLOCK_GROUP_RAID0)
4728 return stripped;
4729
4730 /* turn mirroring into duplication */
4731 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
4732 BTRFS_BLOCK_GROUP_RAID10))
4733 return stripped | BTRFS_BLOCK_GROUP_DUP;
4734 return flags;
4735 } else {
4736 /* they already had raid on here, just return */
4737 if (flags & stripped)
4738 return flags;
4739
4740 stripped |= BTRFS_BLOCK_GROUP_DUP;
4741 stripped = flags & ~stripped;
4742
4743 /* switch duplicated blocks with raid1 */
4744 if (flags & BTRFS_BLOCK_GROUP_DUP)
4745 return stripped | BTRFS_BLOCK_GROUP_RAID1;
4746
4747 /* turn single device chunks into raid0 */
4748 return stripped | BTRFS_BLOCK_GROUP_RAID0;
4749 }
4750 return flags;
4751}
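
/*
 * Worked example (descriptive only): on a filesystem reduced to a
 * single device, update_block_group_flags() maps RAID1 and RAID10
 * chunks to DUP and RAID0 chunks to single device chunks; with more
 * than one device it maps DUP back to RAID1 and single device chunks
 * to RAID0, while chunks that already have a striping flag keep their
 * current profile.
 */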
4752
4753int __alloc_chunk_for_shrink(struct btrfs_root *root,
4754 struct btrfs_block_group_cache *shrink_block_group,
4755 int force)
4756{
4757 struct btrfs_trans_handle *trans;
4758 u64 new_alloc_flags;
4759 u64 calc;
4760
4761 spin_lock(&shrink_block_group->lock);
4762 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
4763 spin_unlock(&shrink_block_group->lock);
4764 mutex_unlock(&root->fs_info->alloc_mutex);
4765
4766 trans = btrfs_start_transaction(root, 1);
4767 mutex_lock(&root->fs_info->alloc_mutex);
4768 spin_lock(&shrink_block_group->lock);
4769
4770 new_alloc_flags = update_block_group_flags(root,
4771 shrink_block_group->flags);
4772 if (new_alloc_flags != shrink_block_group->flags) {
4773 calc =
4774 btrfs_block_group_used(&shrink_block_group->item);
4775 } else {
4776 calc = shrink_block_group->key.offset;
4777 }
4778 spin_unlock(&shrink_block_group->lock);
4779
4780 do_chunk_alloc(trans, root->fs_info->extent_root,
4781 calc + 2 * 1024 * 1024, new_alloc_flags, force);
4782
4783 mutex_unlock(&root->fs_info->alloc_mutex);
4784 btrfs_end_transaction(trans, root);
4785 mutex_lock(&root->fs_info->alloc_mutex);
4786 } else
4787 spin_unlock(&shrink_block_group->lock);
4788 return 0;
4789}
4790
4791static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4792 struct btrfs_root *root,
4793 u64 objectid, u64 size)
4794{
4795 struct btrfs_path *path;
4796 struct btrfs_inode_item *item;
4797 struct extent_buffer *leaf;
4798 int ret;
4799
4800 path = btrfs_alloc_path();
4801 if (!path)
4802 return -ENOMEM;
4803
4804 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4805 if (ret)
4806 goto out;
4807
4808 leaf = path->nodes[0];
4809 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4810 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4811 btrfs_set_inode_generation(leaf, item, 1);
4812 btrfs_set_inode_size(leaf, item, size);
4813 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4814 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM);
4815 btrfs_mark_buffer_dirty(leaf);
4816 btrfs_release_path(root, path);
4817out:
4818 btrfs_free_path(path);
4819 return ret;
4820}
4821
4822static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
4823 struct btrfs_block_group_cache *group)
4824{
4825 struct inode *inode = NULL;
4826 struct btrfs_trans_handle *trans;
4827 struct btrfs_root *root;
4828 struct btrfs_key root_key;
4829 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4830 int err = 0;
4831
4832 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
4833 root_key.type = BTRFS_ROOT_ITEM_KEY;
4834 root_key.offset = (u64)-1;
4835 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
4836 if (IS_ERR(root))
4837 return ERR_CAST(root);
4838
4839 trans = btrfs_start_transaction(root, 1);
4840 BUG_ON(!trans);
4841
4842 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
4843 if (err)
4844 goto out;
4845
4846 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
4847 BUG_ON(err);
4848
4849 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
4850 group->key.offset, 0);
4851 BUG_ON(err);
4852
4853 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
4854 if (inode->i_state & I_NEW) {
4855 BTRFS_I(inode)->root = root;
4856 BTRFS_I(inode)->location.objectid = objectid;
4857 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
4858 BTRFS_I(inode)->location.offset = 0;
4859 btrfs_read_locked_inode(inode);
4860 unlock_new_inode(inode);
4861 BUG_ON(is_bad_inode(inode));
4862 } else {
4863 BUG_ON(1);
4864 }
4865
4866 err = btrfs_orphan_add(trans, inode);
4867out:
4868 btrfs_end_transaction(trans, root);
4869 if (err) {
4870 if (inode)
4871 iput(inode);
4872 inode = ERR_PTR(err);
4873 }
4874 return inode;
4875}
4876
4877int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
4878{
4879 struct btrfs_trans_handle *trans;
4880 struct btrfs_path *path;
4881 struct btrfs_fs_info *info = root->fs_info;
4882 struct extent_buffer *leaf;
4883 struct inode *reloc_inode;
4884 struct btrfs_block_group_cache *block_group;
4885 struct btrfs_key key;
4886 u64 cur_byte;
4887 u64 total_found;
4888 u32 nritems;
4889 int ret;
4890 int progress;
4891 int pass = 0;
4892
4893 root = root->fs_info->extent_root;
4894
4895 block_group = btrfs_lookup_block_group(info, group_start);
4896 BUG_ON(!block_group);
4897
4898	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
4899 (unsigned long long)block_group->key.objectid,
4900 (unsigned long long)block_group->flags);
4901
4902 path = btrfs_alloc_path();
4903 BUG_ON(!path);
4904
4905 reloc_inode = create_reloc_inode(info, block_group);
4906 BUG_ON(IS_ERR(reloc_inode));
4907
4908 mutex_lock(&root->fs_info->alloc_mutex);
4909
4910 __alloc_chunk_for_shrink(root, block_group, 1);
4911 block_group->ro = 1;
4912 block_group->space_info->total_bytes -= block_group->key.offset;
4913
4914 mutex_unlock(&root->fs_info->alloc_mutex);
4915
4916 btrfs_start_delalloc_inodes(info->tree_root);
4917 btrfs_wait_ordered_extents(info->tree_root, 0);
4918again:
4919 total_found = 0;
4920 progress = 0;
4921 key.objectid = block_group->key.objectid;
4922 key.offset = 0;
4923 key.type = 0;
4924 cur_byte = key.objectid;
4925
4926 trans = btrfs_start_transaction(info->tree_root, 1);
4927 btrfs_commit_transaction(trans, info->tree_root);
4928
4929 mutex_lock(&root->fs_info->cleaner_mutex);
4930 btrfs_clean_old_snapshots(info->tree_root);
4931 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
4932 mutex_unlock(&root->fs_info->cleaner_mutex);
4933
4934 mutex_lock(&root->fs_info->alloc_mutex);
4935
4936 while(1) {
4937 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4938 if (ret < 0)
4939 goto out;
4940next:
4941 leaf = path->nodes[0];
4942 nritems = btrfs_header_nritems(leaf);
4943 if (path->slots[0] >= nritems) {
4944 ret = btrfs_next_leaf(root, path);
4945 if (ret < 0)
4946 goto out;
4947 if (ret == 1) {
4948 ret = 0;
4949 break;
4950 }
4951 leaf = path->nodes[0];
4952 nritems = btrfs_header_nritems(leaf);
4953 }
4954
4955 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4956
4957 if (key.objectid >= block_group->key.objectid +
4958 block_group->key.offset)
4959 break;
4960
4961 if (progress && need_resched()) {
4962 btrfs_release_path(root, path);
4963 mutex_unlock(&root->fs_info->alloc_mutex);
4964 cond_resched();
4965 mutex_lock(&root->fs_info->alloc_mutex);
4966 progress = 0;
4967 continue;
4968 }
4969 progress = 1;
4970
4971 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
4972 key.objectid + key.offset <= cur_byte) {
4973 path->slots[0]++;
4974 goto next;
4975 }
4976
4977 total_found++;
4978 cur_byte = key.objectid + key.offset;
4979 btrfs_release_path(root, path);
4980
4981 __alloc_chunk_for_shrink(root, block_group, 0);
4982 ret = relocate_one_extent(root, path, &key, block_group,
4983 reloc_inode, pass);
4984 BUG_ON(ret < 0);
4985
4986 key.objectid = cur_byte;
4987 key.type = 0;
4988 key.offset = 0;
4989 }
4990
4991 btrfs_release_path(root, path);
4992 mutex_unlock(&root->fs_info->alloc_mutex);
4993
4994 if (pass == 0) {
4995 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
4996 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
4997 WARN_ON(reloc_inode->i_mapping->nrpages);
4998 }
4999
5000 if (total_found > 0) {
5001		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5002 (unsigned long long)total_found, pass);
5003 pass++;
5004 goto again;
5005 }
5006
5007 /* delete reloc_inode */
5008 iput(reloc_inode);
5009
5010 /* unpin extents in this range */
5011 trans = btrfs_start_transaction(info->tree_root, 1);
5012 btrfs_commit_transaction(trans, info->tree_root);
5013
5014 mutex_lock(&root->fs_info->alloc_mutex);
5015
5016 spin_lock(&block_group->lock);
5017 WARN_ON(block_group->pinned > 0);
5018 WARN_ON(block_group->reserved > 0);
5019 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5020 spin_unlock(&block_group->lock);
5021 ret = 0;
5022out:
5023 mutex_unlock(&root->fs_info->alloc_mutex);
5024 btrfs_free_path(path);
5025 return ret;
5026}
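
The again: loop is the heart of relocation: rescan the block group until a whole pass finds nothing left to move, then drop the relocation inode. A toy sketch of only that control flow, with a fabricated relocate_pass() standing in for the search-and-relocate work:

    #include <stdio.h>

    /* fabricated results: extents still found in each successive pass */
    static int pass_extents[] = { 3, 1, 0 };

    static unsigned long long relocate_pass(int pass)
    {
        return pass_extents[pass];
    }

    int main(void)
    {
        int pass = 0;
        unsigned long long total_found;

        /* same shape as the 'again:' loop above: keep re-scanning the
         * block group until a full pass moves nothing */
        do {
            total_found = relocate_pass(pass);
            if (total_found)
                printf("found %llu extents in pass %d\n",
                       total_found, pass++);
        } while (total_found > 0);
        return 0;
    }
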
5027
5028int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
5029 struct btrfs_key *key)
5030{
5031 int ret = 0;
5032 struct btrfs_key found_key;
5033 struct extent_buffer *leaf;
5034 int slot;
5035
5036 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5037 if (ret < 0)
5038 goto out;
5039
5040 while(1) {
5041 slot = path->slots[0];
5042 leaf = path->nodes[0];
5043 if (slot >= btrfs_header_nritems(leaf)) {
5044 ret = btrfs_next_leaf(root, path);
5045 if (ret == 0)
5046 continue;
5047 if (ret < 0)
5048 goto out;
5049 break;
5050 }
5051 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5052
5053 if (found_key.objectid >= key->objectid &&
5054 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5055 ret = 0;
5056 goto out;
5057 }
5058 path->slots[0]++;
5059 }
5060 ret = -ENOENT;
5061out:
5062 return ret;
5063}
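
The btrfs_search_slot()/btrfs_next_leaf() pattern above is the standard btrfs idiom for walking items across leaf boundaries. A toy model, with fabricated leaves and keys, that keeps only the loop shape:

    #include <stdio.h>

    /* toy leaves; only the iteration shape matches the kernel code */
    struct leaf { int nritems; int keys[4]; };

    static struct leaf leaves[] = {
        { 3, { 10, 20, 30 } },
        { 2, { 40, 50 } },
    };

    int main(void)
    {
        int leaf = 0, slot = 0;

        while (1) {
            if (slot >= leaves[leaf].nritems) {
                if (++leaf >= 2)        /* btrfs_next_leaf() > 0 */
                    break;              /* ran out of leaves: ENOENT */
                slot = 0;
                continue;
            }
            printf("key %d\n", leaves[leaf].keys[slot]);
            slot++;
        }
        return 0;
    }
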
5064
5065int btrfs_free_block_groups(struct btrfs_fs_info *info)
5066{
5067 struct btrfs_block_group_cache *block_group;
5068 struct rb_node *n;
5069
5070 mutex_lock(&info->alloc_mutex);
5071 spin_lock(&info->block_group_cache_lock);
5072 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5073 block_group = rb_entry(n, struct btrfs_block_group_cache,
5074 cache_node);
5075
5076 spin_unlock(&info->block_group_cache_lock);
5077 btrfs_remove_free_space_cache(block_group);
5078 spin_lock(&info->block_group_cache_lock);
5079
5080 rb_erase(&block_group->cache_node,
5081 &info->block_group_cache_tree);
5082 spin_lock(&block_group->space_info->lock);
5083 list_del(&block_group->list);
5084 spin_unlock(&block_group->space_info->lock);
5085 kfree(block_group);
5086 }
5087 spin_unlock(&info->block_group_cache_lock);
5088 mutex_unlock(&info->alloc_mutex);
5089 return 0;
5090}
5091
5092int btrfs_read_block_groups(struct btrfs_root *root)
5093{
5094 struct btrfs_path *path;
5095 int ret;
5096 struct btrfs_block_group_cache *cache;
5097 struct btrfs_fs_info *info = root->fs_info;
5098 struct btrfs_space_info *space_info;
5099 struct btrfs_key key;
5100 struct btrfs_key found_key;
5101 struct extent_buffer *leaf;
5102
5103 root = info->extent_root;
5104 key.objectid = 0;
5105 key.offset = 0;
5106 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5107 path = btrfs_alloc_path();
5108 if (!path)
5109 return -ENOMEM;
5110
5111 mutex_lock(&root->fs_info->alloc_mutex);
5112 while(1) {
5113 ret = find_first_block_group(root, path, &key);
5114 if (ret > 0) {
5115 ret = 0;
5116 goto error;
5117 }
5118 if (ret != 0)
5119 goto error;
5120
5121 leaf = path->nodes[0];
5122 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5123 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5124 if (!cache) {
5125 ret = -ENOMEM;
5126			goto error;
5127 }
5128
5129 spin_lock_init(&cache->lock);
5130 INIT_LIST_HEAD(&cache->list);
5131 read_extent_buffer(leaf, &cache->item,
5132 btrfs_item_ptr_offset(leaf, path->slots[0]),
5133 sizeof(cache->item));
5134 memcpy(&cache->key, &found_key, sizeof(found_key));
5135
5136 key.objectid = found_key.objectid + found_key.offset;
5137 btrfs_release_path(root, path);
5138 cache->flags = btrfs_block_group_flags(&cache->item);
5139
5140 ret = update_space_info(info, cache->flags, found_key.offset,
5141 btrfs_block_group_used(&cache->item),
5142 &space_info);
5143 BUG_ON(ret);
5144 cache->space_info = space_info;
5145 spin_lock(&space_info->lock);
5146 list_add(&cache->list, &space_info->block_groups);
5147 spin_unlock(&space_info->lock);
5148
5149 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5150 BUG_ON(ret);
5151
5152 set_avail_alloc_bits(root->fs_info, cache->flags);
5153 }
5154 ret = 0;
5155error:
5156 btrfs_free_path(path);
5157 mutex_unlock(&root->fs_info->alloc_mutex);
5158 return ret;
5159}
5160
5161int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5162 struct btrfs_root *root, u64 bytes_used,
5163 u64 type, u64 chunk_objectid, u64 chunk_offset,
5164 u64 size)
5165{
5166 int ret;
5167 struct btrfs_root *extent_root;
5168 struct btrfs_block_group_cache *cache;
5169
5170 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
5171 extent_root = root->fs_info->extent_root;
5172
5173 root->fs_info->last_trans_new_blockgroup = trans->transid;
5174
5175 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5176 if (!cache)
5177 return -ENOMEM;
5178
5179 cache->key.objectid = chunk_offset;
5180 cache->key.offset = size;
5181 spin_lock_init(&cache->lock);
5182 INIT_LIST_HEAD(&cache->list);
5183 btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5184
5185 btrfs_set_block_group_used(&cache->item, bytes_used);
5186 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5187 cache->flags = type;
5188 btrfs_set_block_group_flags(&cache->item, type);
5189
5190 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5191 &cache->space_info);
5192 BUG_ON(ret);
5193 spin_lock(&cache->space_info->lock);
5194 list_add(&cache->list, &cache->space_info->block_groups);
5195 spin_unlock(&cache->space_info->lock);
5196
5197 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5198 BUG_ON(ret);
5199
5200 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5201 sizeof(cache->item));
5202 BUG_ON(ret);
5203
5204 finish_current_insert(trans, extent_root);
5205 ret = del_pending_extents(trans, extent_root);
5206 BUG_ON(ret);
5207 set_avail_alloc_bits(extent_root->fs_info, type);
5208
5209 return 0;
5210}
5211
5212int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5213 struct btrfs_root *root, u64 group_start)
5214{
5215 struct btrfs_path *path;
5216 struct btrfs_block_group_cache *block_group;
5217 struct btrfs_key key;
5218 int ret;
5219
5220 BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
5221 root = root->fs_info->extent_root;
5222
5223 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5224 BUG_ON(!block_group);
5225
5226 memcpy(&key, &block_group->key, sizeof(key));
5227
5228 path = btrfs_alloc_path();
5229 BUG_ON(!path);
5230
5231 btrfs_remove_free_space_cache(block_group);
5232 rb_erase(&block_group->cache_node,
5233 &root->fs_info->block_group_cache_tree);
5234 spin_lock(&block_group->space_info->lock);
5235 list_del(&block_group->list);
5236 spin_unlock(&block_group->space_info->lock);
5237
5238 /*
5239 memset(shrink_block_group, 0, sizeof(*shrink_block_group));
5240 kfree(shrink_block_group);
5241 */
5242
5243 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5244 if (ret > 0)
5245 ret = -EIO;
5246 if (ret < 0)
5247 goto out;
5248
5249 ret = btrfs_del_item(trans, root, path);
5250out:
5251 btrfs_free_path(path);
5252 return ret;
5253}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..563b2d12f4f2
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3416 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
33#ifdef LEAK_DEBUG
34static DEFINE_SPINLOCK(leak_lock);
35#endif
36
37#define BUFFER_LRU_MAX 64
38
39struct tree_entry {
40 u64 start;
41 u64 end;
42 struct rb_node rb_node;
43};
44
45struct extent_page_data {
46 struct bio *bio;
47 struct extent_io_tree *tree;
48 get_extent_t *get_extent;
49};
50
51int __init extent_io_init(void)
52{
53 extent_state_cache = btrfs_cache_create("extent_state",
54 sizeof(struct extent_state), 0,
55 NULL);
56 if (!extent_state_cache)
57 return -ENOMEM;
58
59 extent_buffer_cache = btrfs_cache_create("extent_buffers",
60 sizeof(struct extent_buffer), 0,
61 NULL);
62 if (!extent_buffer_cache)
63 goto free_state_cache;
64 return 0;
65
66free_state_cache:
67 kmem_cache_destroy(extent_state_cache);
68 return -ENOMEM;
69}
70
71void extent_io_exit(void)
72{
73 struct extent_state *state;
74 struct extent_buffer *eb;
75
76 while (!list_empty(&states)) {
77 state = list_entry(states.next, struct extent_state, leak_list);
78 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
79 list_del(&state->leak_list);
80 kmem_cache_free(extent_state_cache, state);
81
82 }
83
84 while (!list_empty(&buffers)) {
85 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
86 printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
87 list_del(&eb->leak_list);
88 kmem_cache_free(extent_buffer_cache, eb);
89 }
90 if (extent_state_cache)
91 kmem_cache_destroy(extent_state_cache);
92 if (extent_buffer_cache)
93 kmem_cache_destroy(extent_buffer_cache);
94}
95
96void extent_io_tree_init(struct extent_io_tree *tree,
97 struct address_space *mapping, gfp_t mask)
98{
99 tree->state.rb_node = NULL;
100 tree->buffer.rb_node = NULL;
101 tree->ops = NULL;
102 tree->dirty_bytes = 0;
103 spin_lock_init(&tree->lock);
104 spin_lock_init(&tree->buffer_lock);
105 tree->mapping = mapping;
106}
107EXPORT_SYMBOL(extent_io_tree_init);
108
109struct extent_state *alloc_extent_state(gfp_t mask)
110{
111 struct extent_state *state;
112#ifdef LEAK_DEBUG
113 unsigned long flags;
114#endif
115
116 state = kmem_cache_alloc(extent_state_cache, mask);
117 if (!state)
118 return state;
119 state->state = 0;
120 state->private = 0;
121 state->tree = NULL;
122#ifdef LEAK_DEBUG
123 spin_lock_irqsave(&leak_lock, flags);
124 list_add(&state->leak_list, &states);
125 spin_unlock_irqrestore(&leak_lock, flags);
126#endif
127 atomic_set(&state->refs, 1);
128 init_waitqueue_head(&state->wq);
129 return state;
130}
131EXPORT_SYMBOL(alloc_extent_state);
132
133void free_extent_state(struct extent_state *state)
134{
135 if (!state)
136 return;
137 if (atomic_dec_and_test(&state->refs)) {
138#ifdef LEAK_DEBUG
139 unsigned long flags;
140#endif
141 WARN_ON(state->tree);
142#ifdef LEAK_DEBUG
143 spin_lock_irqsave(&leak_lock, flags);
144 list_del(&state->leak_list);
145 spin_unlock_irqrestore(&leak_lock, flags);
146#endif
147 kmem_cache_free(extent_state_cache, state);
148 }
149}
150EXPORT_SYMBOL(free_extent_state);
151
152static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
153 struct rb_node *node)
154{
155	struct rb_node **p = &root->rb_node;
156	struct rb_node *parent = NULL;
157 struct tree_entry *entry;
158
159 while(*p) {
160 parent = *p;
161 entry = rb_entry(parent, struct tree_entry, rb_node);
162
163 if (offset < entry->start)
164 p = &(*p)->rb_left;
165 else if (offset > entry->end)
166 p = &(*p)->rb_right;
167 else
168 return parent;
169 }
170
171 entry = rb_entry(node, struct tree_entry, rb_node);
172 rb_link_node(node, parent, p);
173 rb_insert_color(node, root);
174 return NULL;
175}
176
177static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
178 struct rb_node **prev_ret,
179 struct rb_node **next_ret)
180{
181 struct rb_root *root = &tree->state;
182	struct rb_node *n = root->rb_node;
183 struct rb_node *prev = NULL;
184 struct rb_node *orig_prev = NULL;
185 struct tree_entry *entry;
186 struct tree_entry *prev_entry = NULL;
187
188 while(n) {
189 entry = rb_entry(n, struct tree_entry, rb_node);
190 prev = n;
191 prev_entry = entry;
192
193 if (offset < entry->start)
194 n = n->rb_left;
195 else if (offset > entry->end)
196 n = n->rb_right;
197 else {
198 return n;
199 }
200 }
201
202 if (prev_ret) {
203 orig_prev = prev;
204 while(prev && offset > prev_entry->end) {
205 prev = rb_next(prev);
206 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
207 }
208 *prev_ret = prev;
209 prev = orig_prev;
210 }
211
212 if (next_ret) {
213 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214 while(prev && offset < prev_entry->start) {
215 prev = rb_prev(prev);
216 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217 }
218 *next_ret = prev;
219 }
220 return NULL;
221}
222
223static inline struct rb_node *tree_search(struct extent_io_tree *tree,
224 u64 offset)
225{
226 struct rb_node *prev = NULL;
227 struct rb_node *ret;
228
229 ret = __etree_search(tree, offset, &prev, NULL);
230 if (!ret) {
231 return prev;
232 }
233 return ret;
234}
235
236static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
237 u64 offset, struct rb_node *node)
238{
239 struct rb_root *root = &tree->buffer;
240	struct rb_node **p = &root->rb_node;
241	struct rb_node *parent = NULL;
242 struct extent_buffer *eb;
243
244 while(*p) {
245 parent = *p;
246 eb = rb_entry(parent, struct extent_buffer, rb_node);
247
248 if (offset < eb->start)
249 p = &(*p)->rb_left;
250 else if (offset > eb->start)
251 p = &(*p)->rb_right;
252 else
253 return eb;
254 }
255
256 rb_link_node(node, parent, p);
257 rb_insert_color(node, root);
258 return NULL;
259}
260
261static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
262 u64 offset)
263{
264 struct rb_root *root = &tree->buffer;
265	struct rb_node *n = root->rb_node;
266 struct extent_buffer *eb;
267
268 while(n) {
269 eb = rb_entry(n, struct extent_buffer, rb_node);
270 if (offset < eb->start)
271 n = n->rb_left;
272 else if (offset > eb->start)
273 n = n->rb_right;
274 else
275 return eb;
276 }
277 return NULL;
278}
279
280/*
281 * utility function to look for merge candidates inside a given range.
282 * Any extents with matching state are merged together into a single
283 * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
284 * are not merged because the end_io handlers need to be able to do
285 * operations on them without sleeping (or doing allocations/splits).
286 *
287 * This should be called with the tree lock held.
288 */
289static int merge_state(struct extent_io_tree *tree,
290 struct extent_state *state)
291{
292 struct extent_state *other;
293 struct rb_node *other_node;
294
295 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
296 return 0;
297
298 other_node = rb_prev(&state->rb_node);
299 if (other_node) {
300 other = rb_entry(other_node, struct extent_state, rb_node);
301 if (other->end == state->start - 1 &&
302 other->state == state->state) {
303 state->start = other->start;
304 other->tree = NULL;
305 rb_erase(&other->rb_node, &tree->state);
306 free_extent_state(other);
307 }
308 }
309 other_node = rb_next(&state->rb_node);
310 if (other_node) {
311 other = rb_entry(other_node, struct extent_state, rb_node);
312 if (other->start == state->end + 1 &&
313 other->state == state->state) {
314 other->start = state->start;
315 state->tree = NULL;
316 rb_erase(&state->rb_node, &tree->state);
317 free_extent_state(state);
318 }
319 }
320 return 0;
321}
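
merge_state() collapses a state struct into its neighbours when they touch and carry exactly the same bits. A toy userspace version of the two neighbour checks, using a plain struct in place of the rbtree node:

    #include <stdio.h>
    #include <stdint.h>

    /* toy stand-in for extent_state */
    struct range { uint64_t start, end; unsigned long state; };

    /* merge 'prev' into 'cur' when they touch and their bits match,
     * the same two conditions tested above */
    static int try_merge(struct range *prev, struct range *cur)
    {
        if (prev->end == cur->start - 1 && prev->state == cur->state) {
            cur->start = prev->start;   /* absorb the left neighbour */
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct range a = { 0, 4095, 0x1 };
        struct range b = { 4096, 8191, 0x1 };

        if (try_merge(&a, &b))
            printf("merged: [%llu, %llu]\n",
                   (unsigned long long)b.start,
                   (unsigned long long)b.end);
        return 0;
    }
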
322
323static void set_state_cb(struct extent_io_tree *tree,
324 struct extent_state *state,
325 unsigned long bits)
326{
327 if (tree->ops && tree->ops->set_bit_hook) {
328 tree->ops->set_bit_hook(tree->mapping->host, state->start,
329 state->end, state->state, bits);
330 }
331}
332
333static void clear_state_cb(struct extent_io_tree *tree,
334 struct extent_state *state,
335 unsigned long bits)
336{
337	if (tree->ops && tree->ops->clear_bit_hook) {
338 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
339 state->end, state->state, bits);
340 }
341}
342
343/*
344 * insert an extent_state struct into the tree. 'bits' are set on the
345 * struct before it is inserted.
346 *
347 * This may return -EEXIST if the extent is already there, in which case the
348 * state struct is freed.
349 *
350 * The tree lock is not taken internally. This is a utility function and
351 * probably isn't what you want to call (see set/clear_extent_bit).
352 */
353static int insert_state(struct extent_io_tree *tree,
354 struct extent_state *state, u64 start, u64 end,
355 int bits)
356{
357 struct rb_node *node;
358
359 if (end < start) {
360 printk("end < start %Lu %Lu\n", end, start);
361 WARN_ON(1);
362 }
363 if (bits & EXTENT_DIRTY)
364 tree->dirty_bytes += end - start + 1;
365 set_state_cb(tree, state, bits);
366 state->state |= bits;
367 state->start = start;
368 state->end = end;
369 node = tree_insert(&tree->state, end, &state->rb_node);
370 if (node) {
371 struct extent_state *found;
372 found = rb_entry(node, struct extent_state, rb_node);
373 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
374 free_extent_state(state);
375 return -EEXIST;
376 }
377 state->tree = tree;
378 merge_state(tree, state);
379 return 0;
380}
381
382/*
383 * split a given extent state struct in two, inserting the preallocated
384 * struct 'prealloc' as the newly created second half. 'split' indicates an
385 * offset inside 'orig' where it should be split.
386 *
387 * Before calling,
388 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
389 * are two extent state structs in the tree:
390 * prealloc: [orig->start, split - 1]
391 * orig: [ split, orig->end ]
392 *
393 * The tree locks are not taken by this function. They need to be held
394 * by the caller.
395 */
396static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
397 struct extent_state *prealloc, u64 split)
398{
399 struct rb_node *node;
400 prealloc->start = orig->start;
401 prealloc->end = split - 1;
402 prealloc->state = orig->state;
403 orig->start = split;
404
405 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
406 if (node) {
407 struct extent_state *found;
408 found = rb_entry(node, struct extent_state, rb_node);
409 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
410 free_extent_state(prealloc);
411 return -EEXIST;
412 }
413 prealloc->tree = tree;
414 return 0;
415}
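
The split invariant is easiest to see with concrete numbers: prealloc takes [orig->start, split - 1] and orig keeps [split, orig->end]. A small sketch, assuming a plain range struct instead of extent_state:

    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    struct range { uint64_t start, end; };

    /* same arithmetic as split_state() above */
    static void split(struct range *orig, struct range *prealloc, uint64_t at)
    {
        prealloc->start = orig->start;
        prealloc->end = at - 1;
        orig->start = at;
    }

    int main(void)
    {
        struct range orig = { 0, 8191 }, pre;

        split(&orig, &pre, 4096);
        assert(pre.start == 0 && pre.end == 4095);
        assert(orig.start == 4096 && orig.end == 8191);
        printf("[%llu,%llu] [%llu,%llu]\n",
               (unsigned long long)pre.start, (unsigned long long)pre.end,
               (unsigned long long)orig.start, (unsigned long long)orig.end);
        return 0;
    }
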
416
417/*
418 * utility function to clear some bits in an extent state struct.
419 * it will optionally wake up any one waiting on this state (wake == 1), or
420 * forcibly remove the state from the tree (delete == 1).
421 *
422 * If no bits are set on the state struct after clearing things, the
423 * struct is freed and removed from the tree
424 */
425static int clear_state_bit(struct extent_io_tree *tree,
426 struct extent_state *state, int bits, int wake,
427 int delete)
428{
429 int ret = state->state & bits;
430
431 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
432 u64 range = state->end - state->start + 1;
433 WARN_ON(range > tree->dirty_bytes);
434 tree->dirty_bytes -= range;
435 }
436 clear_state_cb(tree, state, bits);
437 state->state &= ~bits;
438 if (wake)
439 wake_up(&state->wq);
440 if (delete || state->state == 0) {
441 if (state->tree) {
442 clear_state_cb(tree, state, state->state);
443 rb_erase(&state->rb_node, &tree->state);
444 state->tree = NULL;
445 free_extent_state(state);
446 } else {
447 WARN_ON(1);
448 }
449 } else {
450 merge_state(tree, state);
451 }
452 return ret;
453}
454
455/*
456 * clear some bits on a range in the tree. This may require splitting
457 * or inserting elements in the tree, so the gfp mask is used to
458 * indicate which allocations or sleeping are allowed.
459 *
460 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
461 * the given range from the tree regardless of state (ie for truncate).
462 *
463 * the range [start, end] is inclusive.
464 *
465 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
466 * bits were already set, or zero if none of the bits were already set.
467 */
468int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
469 int bits, int wake, int delete, gfp_t mask)
470{
471 struct extent_state *state;
472 struct extent_state *prealloc = NULL;
473 struct rb_node *node;
474 unsigned long flags;
475 int err;
476 int set = 0;
477
478again:
479 if (!prealloc && (mask & __GFP_WAIT)) {
480 prealloc = alloc_extent_state(mask);
481 if (!prealloc)
482 return -ENOMEM;
483 }
484
485 spin_lock_irqsave(&tree->lock, flags);
486 /*
487 * this search will find the extents that end after
488 * our range starts
489 */
490 node = tree_search(tree, start);
491 if (!node)
492 goto out;
493 state = rb_entry(node, struct extent_state, rb_node);
494 if (state->start > end)
495 goto out;
496 WARN_ON(state->end < start);
497
498 /*
499 * | ---- desired range ---- |
500 * | state | or
501 * | ------------- state -------------- |
502 *
503 * We need to split the extent we found, and may flip
504 * bits on second half.
505 *
506 * If the extent we found extends past our range, we
507 * just split and search again. It'll get split again
508 * the next time though.
509 *
510 * If the extent we found is inside our range, we clear
511 * the desired bit on it.
512 */
513
514 if (state->start < start) {
515 if (!prealloc)
516 prealloc = alloc_extent_state(GFP_ATOMIC);
517 err = split_state(tree, state, prealloc, start);
518 BUG_ON(err == -EEXIST);
519 prealloc = NULL;
520 if (err)
521 goto out;
522 if (state->end <= end) {
523 start = state->end + 1;
524 set |= clear_state_bit(tree, state, bits,
525 wake, delete);
526 } else {
527 start = state->start;
528 }
529 goto search_again;
530 }
531 /*
532 * | ---- desired range ---- |
533 * | state |
534 * We need to split the extent, and clear the bit
535 * on the first half
536 */
537 if (state->start <= end && state->end > end) {
538 if (!prealloc)
539 prealloc = alloc_extent_state(GFP_ATOMIC);
540 err = split_state(tree, state, prealloc, end + 1);
541 BUG_ON(err == -EEXIST);
542
543 if (wake)
544 wake_up(&state->wq);
545 set |= clear_state_bit(tree, prealloc, bits,
546 wake, delete);
547 prealloc = NULL;
548 goto out;
549 }
550
551 start = state->end + 1;
552 set |= clear_state_bit(tree, state, bits, wake, delete);
553 goto search_again;
554
555out:
556 spin_unlock_irqrestore(&tree->lock, flags);
557 if (prealloc)
558 free_extent_state(prealloc);
559
560 return set;
561
562search_again:
563 if (start > end)
564 goto out;
565 spin_unlock_irqrestore(&tree->lock, flags);
566 if (mask & __GFP_WAIT)
567 cond_resched();
568 goto again;
569}
570EXPORT_SYMBOL(clear_extent_bit);
571
572static int wait_on_state(struct extent_io_tree *tree,
573 struct extent_state *state)
574{
575 DEFINE_WAIT(wait);
576 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
577 spin_unlock_irq(&tree->lock);
578 schedule();
579 spin_lock_irq(&tree->lock);
580 finish_wait(&state->wq, &wait);
581 return 0;
582}
583
584/*
585 * waits for one or more bits to clear on a range in the state tree.
586 * The range [start, end] is inclusive.
587 * The tree lock is taken by this function
588 */
589int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
590{
591 struct extent_state *state;
592 struct rb_node *node;
593
594 spin_lock_irq(&tree->lock);
595again:
596 while (1) {
597 /*
598 * this search will find all the extents that end after
599 * our range starts
600 */
601 node = tree_search(tree, start);
602 if (!node)
603 break;
604
605 state = rb_entry(node, struct extent_state, rb_node);
606
607 if (state->start > end)
608 goto out;
609
610 if (state->state & bits) {
611 start = state->start;
612 atomic_inc(&state->refs);
613 wait_on_state(tree, state);
614 free_extent_state(state);
615 goto again;
616 }
617 start = state->end + 1;
618
619 if (start > end)
620 break;
621
622 if (need_resched()) {
623 spin_unlock_irq(&tree->lock);
624 cond_resched();
625 spin_lock_irq(&tree->lock);
626 }
627 }
628out:
629 spin_unlock_irq(&tree->lock);
630 return 0;
631}
632EXPORT_SYMBOL(wait_extent_bit);
633
634static void set_state_bits(struct extent_io_tree *tree,
635 struct extent_state *state,
636 int bits)
637{
638 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
639 u64 range = state->end - state->start + 1;
640 tree->dirty_bytes += range;
641 }
642 set_state_cb(tree, state, bits);
643 state->state |= bits;
644}
645
646/*
647 * set some bits on a range in the tree. This may require allocations
648 * or sleeping, so the gfp mask is used to indicate what is allowed.
649 *
650 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
651 * range already has the desired bits set. The start of the existing
652 * range is returned in failed_start in this case.
653 *
654 * [start, end] is inclusive
655 * This takes the tree lock.
656 */
657int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
658 int exclusive, u64 *failed_start, gfp_t mask)
659{
660 struct extent_state *state;
661 struct extent_state *prealloc = NULL;
662 struct rb_node *node;
663 unsigned long flags;
664 int err = 0;
665 int set;
666 u64 last_start;
667 u64 last_end;
668again:
669 if (!prealloc && (mask & __GFP_WAIT)) {
670 prealloc = alloc_extent_state(mask);
671 if (!prealloc)
672 return -ENOMEM;
673 }
674
675 spin_lock_irqsave(&tree->lock, flags);
676 /*
677 * this search will find all the extents that end after
678 * our range starts.
679 */
680 node = tree_search(tree, start);
681 if (!node) {
682 err = insert_state(tree, prealloc, start, end, bits);
683 prealloc = NULL;
684 BUG_ON(err == -EEXIST);
685 goto out;
686 }
687
688 state = rb_entry(node, struct extent_state, rb_node);
689 last_start = state->start;
690 last_end = state->end;
691
692 /*
693 * | ---- desired range ---- |
694 * | state |
695 *
696 * Just lock what we found and keep going
697 */
698 if (state->start == start && state->end <= end) {
699 set = state->state & bits;
700 if (set && exclusive) {
701 *failed_start = state->start;
702 err = -EEXIST;
703 goto out;
704 }
705 set_state_bits(tree, state, bits);
706 start = state->end + 1;
707 merge_state(tree, state);
708 goto search_again;
709 }
710
711 /*
712 * | ---- desired range ---- |
713 * | state |
714 * or
715 * | ------------- state -------------- |
716 *
717 * We need to split the extent we found, and may flip bits on
718 * second half.
719 *
720 * If the extent we found extends past our
721 * range, we just split and search again. It'll get split
722 * again the next time though.
723 *
724 * If the extent we found is inside our range, we set the
725 * desired bit on it.
726 */
727 if (state->start < start) {
728 set = state->state & bits;
729 if (exclusive && set) {
730 *failed_start = start;
731 err = -EEXIST;
732 goto out;
733 }
734 err = split_state(tree, state, prealloc, start);
735 BUG_ON(err == -EEXIST);
736 prealloc = NULL;
737 if (err)
738 goto out;
739 if (state->end <= end) {
740 set_state_bits(tree, state, bits);
741 start = state->end + 1;
742 merge_state(tree, state);
743 } else {
744 start = state->start;
745 }
746 goto search_again;
747 }
748 /*
749 * | ---- desired range ---- |
750 * | state | or | state |
751 *
752 * There's a hole, we need to insert something in it and
753 * ignore the extent we found.
754 */
755 if (state->start > start) {
756 u64 this_end;
757 if (end < last_start)
758 this_end = end;
759 else
760			this_end = last_start - 1;
761 err = insert_state(tree, prealloc, start, this_end,
762 bits);
763 prealloc = NULL;
764 BUG_ON(err == -EEXIST);
765 if (err)
766 goto out;
767 start = this_end + 1;
768 goto search_again;
769 }
770 /*
771 * | ---- desired range ---- |
772 * | state |
773 * We need to split the extent, and set the bit
774 * on the first half
775 */
776 if (state->start <= end && state->end > end) {
777 set = state->state & bits;
778 if (exclusive && set) {
779 *failed_start = start;
780 err = -EEXIST;
781 goto out;
782 }
783 err = split_state(tree, state, prealloc, end + 1);
784 BUG_ON(err == -EEXIST);
785
786 set_state_bits(tree, prealloc, bits);
787 merge_state(tree, prealloc);
788 prealloc = NULL;
789 goto out;
790 }
791
792 goto search_again;
793
794out:
795 spin_unlock_irqrestore(&tree->lock, flags);
796 if (prealloc)
797 free_extent_state(prealloc);
798
799 return err;
800
801search_again:
802 if (start > end)
803 goto out;
804 spin_unlock_irqrestore(&tree->lock, flags);
805 if (mask & __GFP_WAIT)
806 cond_resched();
807 goto again;
808}
809EXPORT_SYMBOL(set_extent_bit);
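
Abstracting away the rbtree, splitting, and locking, set_extent_bit()/clear_extent_bit() behave like OR and AND-NOT over per-byte state, with the exclusive case failing on any overlap. A deliberately naive model (one state word per byte; the first conflicting offset stands in for the conflicting extent's start):

    #include <errno.h>
    #include <stdio.h>

    #define N 16
    static int state[N];            /* one state word per byte offset */

    /* naive set_extent_bit(): -EEXIST with *failed_start set when
     * 'exclusive' and any byte in [start, end] already has a bit */
    static int set_bits(int start, int end, int bits, int exclusive,
                        int *failed_start)
    {
        int i;

        for (i = start; i <= end; i++) {
            if (exclusive && (state[i] & bits)) {
                *failed_start = i;
                return -EEXIST;
            }
        }
        for (i = start; i <= end; i++)
            state[i] |= bits;
        return 0;
    }

    static void clear_bits(int start, int end, int bits)
    {
        int i;

        for (i = start; i <= end; i++)
            state[i] &= ~bits;
    }

    int main(void)
    {
        int failed;

        set_bits(0, 7, 0x1, 0, &failed);
        if (set_bits(4, 11, 0x1, 1, &failed) == -EEXIST)
            printf("exclusive set failed at %d\n", failed);  /* 4 */
        clear_bits(0, 7, 0x1);
        return 0;
    }
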
810
811/* wrappers around set/clear extent bit */
812int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
813 gfp_t mask)
814{
815 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
816 mask);
817}
818EXPORT_SYMBOL(set_extent_dirty);
819
820int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
821 gfp_t mask)
822{
823 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
824}
825EXPORT_SYMBOL(set_extent_ordered);
826
827int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
828 int bits, gfp_t mask)
829{
830 return set_extent_bit(tree, start, end, bits, 0, NULL,
831 mask);
832}
833EXPORT_SYMBOL(set_extent_bits);
834
835int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
836 int bits, gfp_t mask)
837{
838 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
839}
840EXPORT_SYMBOL(clear_extent_bits);
841
842int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
843 gfp_t mask)
844{
845 return set_extent_bit(tree, start, end,
846 EXTENT_DELALLOC | EXTENT_DIRTY,
847 0, NULL, mask);
848}
849EXPORT_SYMBOL(set_extent_delalloc);
850
851int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
852 gfp_t mask)
853{
854 return clear_extent_bit(tree, start, end,
855 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
856}
857EXPORT_SYMBOL(clear_extent_dirty);
858
859int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
860 gfp_t mask)
861{
862 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
863}
864EXPORT_SYMBOL(clear_extent_ordered);
865
866int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
867 gfp_t mask)
868{
869 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
870 mask);
871}
872EXPORT_SYMBOL(set_extent_new);
873
874int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
875 gfp_t mask)
876{
877 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
878}
879EXPORT_SYMBOL(clear_extent_new);
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887EXPORT_SYMBOL(set_extent_uptodate);
888
889int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
890 gfp_t mask)
891{
892 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
893}
894EXPORT_SYMBOL(clear_extent_uptodate);
895
896int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
897 gfp_t mask)
898{
899 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
900 0, NULL, mask);
901}
902EXPORT_SYMBOL(set_extent_writeback);
903
904int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
905 gfp_t mask)
906{
907 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
908}
909EXPORT_SYMBOL(clear_extent_writeback);
910
911int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
912{
913 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
914}
915EXPORT_SYMBOL(wait_on_extent_writeback);
916
917/*
918 * either insert or lock the state struct between start and end. Use mask to
919 * tell us whether waiting is desired.
920 */
921int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
922{
923 int err;
924 u64 failed_start;
925 while (1) {
926 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
927 &failed_start, mask);
928 if (err == -EEXIST && (mask & __GFP_WAIT)) {
929 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
930 start = failed_start;
931 } else {
932 break;
933 }
934 WARN_ON(start > end);
935 }
936 return err;
937}
938EXPORT_SYMBOL(lock_extent);
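
On -EEXIST, lock_extent() waits out whoever holds the conflicting range and retries from failed_start rather than from the original start. The retry shape in isolation, with a fabricated try_lock() that conflicts twice before succeeding:

    #include <errno.h>
    #include <stdio.h>

    /* fabricated conflict: offset 4 is "locked" for the first two tries */
    static int busy = 2;

    static int try_lock(int start, int end, int *failed_start)
    {
        if (busy && start <= 4 && 4 <= end) {
            busy--;
            *failed_start = 4;
            return -EEXIST;
        }
        return 0;
    }

    static void wait_unlocked(int start, int end)
    {
        (void)start; (void)end;     /* the kernel sleeps here */
    }

    int main(void)
    {
        int start = 0, end = 9, failed, err;

        /* same shape as lock_extent(): wait, then retry from the
         * start of the conflicting range */
        while (1) {
            err = try_lock(start, end, &failed);
            if (err == -EEXIST) {
                wait_unlocked(failed, end);
                start = failed;
            } else {
                break;
            }
        }
        printf("locked [%d, %d], err %d\n", start, end, err);
        return 0;
    }
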
939
940int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
941 gfp_t mask)
942{
943 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
944}
945EXPORT_SYMBOL(unlock_extent);
946
947/*
948 * helper function to set pages and extents in the tree dirty
949 */
950int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
951{
952 unsigned long index = start >> PAGE_CACHE_SHIFT;
953 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
954 struct page *page;
955
956 while (index <= end_index) {
957 page = find_get_page(tree->mapping, index);
958 BUG_ON(!page);
959 __set_page_dirty_nobuffers(page);
960 page_cache_release(page);
961 index++;
962 }
963 set_extent_dirty(tree, start, end, GFP_NOFS);
964 return 0;
965}
966EXPORT_SYMBOL(set_range_dirty);
967
968/*
969 * helper function to set both pages and extents in the tree writeback
970 */
971int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
972{
973 unsigned long index = start >> PAGE_CACHE_SHIFT;
974 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
975 struct page *page;
976
977 while (index <= end_index) {
978 page = find_get_page(tree->mapping, index);
979 BUG_ON(!page);
980 set_page_writeback(page);
981 page_cache_release(page);
982 index++;
983 }
984 set_extent_writeback(tree, start, end, GFP_NOFS);
985 return 0;
986}
987EXPORT_SYMBOL(set_range_writeback);
988
989/*
990 * find the first offset in the io tree with 'bits' set. zero is
991 * returned if we find something, and *start_ret and *end_ret are
992 * set to reflect the state struct that was found.
993 *
994 * If nothing was found, 1 is returned, < 0 on error
995 */
996int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
997 u64 *start_ret, u64 *end_ret, int bits)
998{
999 struct rb_node *node;
1000 struct extent_state *state;
1001 int ret = 1;
1002
1003 spin_lock_irq(&tree->lock);
1004 /*
1005 * this search will find all the extents that end after
1006 * our range starts.
1007 */
1008 node = tree_search(tree, start);
1009 if (!node) {
1010 goto out;
1011 }
1012
1013 while(1) {
1014 state = rb_entry(node, struct extent_state, rb_node);
1015 if (state->end >= start && (state->state & bits)) {
1016 *start_ret = state->start;
1017 *end_ret = state->end;
1018 ret = 0;
1019 break;
1020 }
1021 node = rb_next(node);
1022 if (!node)
1023 break;
1024 }
1025out:
1026 spin_unlock_irq(&tree->lock);
1027 return ret;
1028}
1029EXPORT_SYMBOL(find_first_extent_bit);
1030
1031/* find the first state struct with 'bits' set after 'start', and
1032 * return it. tree->lock must be held. NULL will be returned if
1033 * nothing was found after 'start'
1034 */
1035struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1036 u64 start, int bits)
1037{
1038 struct rb_node *node;
1039 struct extent_state *state;
1040
1041 /*
1042 * this search will find all the extents that end after
1043 * our range starts.
1044 */
1045 node = tree_search(tree, start);
1046 if (!node) {
1047 goto out;
1048 }
1049
1050 while(1) {
1051 state = rb_entry(node, struct extent_state, rb_node);
1052 if (state->end >= start && (state->state & bits)) {
1053 return state;
1054 }
1055 node = rb_next(node);
1056 if (!node)
1057 break;
1058 }
1059out:
1060 return NULL;
1061}
1062EXPORT_SYMBOL(find_first_extent_bit_state);
1063
1064/*
1065 * find a contiguous range of bytes in the file marked as delalloc, not
1066 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1067 *
1068 * 1 is returned if we find something, 0 if nothing was in the tree
1069 */
1070static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1071 u64 *start, u64 *end, u64 max_bytes)
1072{
1073 struct rb_node *node;
1074 struct extent_state *state;
1075 u64 cur_start = *start;
1076 u64 found = 0;
1077 u64 total_bytes = 0;
1078
1079 spin_lock_irq(&tree->lock);
1080 /*
1081 * this search will find all the extents that end after
1082 * our range starts.
1083 */
1084search_again:
1085 node = tree_search(tree, cur_start);
1086 if (!node) {
1087 if (!found)
1088 *end = (u64)-1;
1089 goto out;
1090 }
1091
1092 while(1) {
1093 state = rb_entry(node, struct extent_state, rb_node);
1094 if (found && (state->start != cur_start ||
1095 (state->state & EXTENT_BOUNDARY))) {
1096 goto out;
1097 }
1098 if (!(state->state & EXTENT_DELALLOC)) {
1099 if (!found)
1100 *end = state->end;
1101 goto out;
1102 }
1103 if (!found && !(state->state & EXTENT_BOUNDARY)) {
1104 struct extent_state *prev_state;
1105 struct rb_node *prev_node = node;
1106 while(1) {
1107 prev_node = rb_prev(prev_node);
1108 if (!prev_node)
1109 break;
1110 prev_state = rb_entry(prev_node,
1111 struct extent_state,
1112 rb_node);
1113 if ((prev_state->end + 1 != state->start) ||
1114 !(prev_state->state & EXTENT_DELALLOC))
1115 break;
1116 if ((cur_start - prev_state->start) * 2 >
1117 max_bytes)
1118 break;
1119 state = prev_state;
1120 node = prev_node;
1121 }
1122 }
1123 if (state->state & EXTENT_LOCKED) {
1124 DEFINE_WAIT(wait);
1125 atomic_inc(&state->refs);
1126 prepare_to_wait(&state->wq, &wait,
1127 TASK_UNINTERRUPTIBLE);
1128 spin_unlock_irq(&tree->lock);
1129 schedule();
1130 spin_lock_irq(&tree->lock);
1131 finish_wait(&state->wq, &wait);
1132 free_extent_state(state);
1133 goto search_again;
1134 }
1135 set_state_cb(tree, state, EXTENT_LOCKED);
1136 state->state |= EXTENT_LOCKED;
1137 if (!found)
1138 *start = state->start;
1139 found++;
1140 *end = state->end;
1141 cur_start = state->end + 1;
1142 node = rb_next(node);
1143 if (!node)
1144 break;
1145 total_bytes += state->end - state->start + 1;
1146 if (total_bytes >= max_bytes)
1147 break;
1148 }
1149out:
1150 spin_unlock_irq(&tree->lock);
1151 return found;
1152}
1153
1154/*
1155 * count the number of bytes in the tree that have a given bit(s)
1156 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1157 * cached. The total number found is returned.
1158 */
1159u64 count_range_bits(struct extent_io_tree *tree,
1160 u64 *start, u64 search_end, u64 max_bytes,
1161 unsigned long bits)
1162{
1163 struct rb_node *node;
1164 struct extent_state *state;
1165 u64 cur_start = *start;
1166 u64 total_bytes = 0;
1167 int found = 0;
1168
1169 if (search_end <= cur_start) {
1170 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1171 WARN_ON(1);
1172 return 0;
1173 }
1174
1175 spin_lock_irq(&tree->lock);
1176 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1177 total_bytes = tree->dirty_bytes;
1178 goto out;
1179 }
1180 /*
1181 * this search will find all the extents that end after
1182 * our range starts.
1183 */
1184 node = tree_search(tree, cur_start);
1185 if (!node) {
1186 goto out;
1187 }
1188
1189 while(1) {
1190 state = rb_entry(node, struct extent_state, rb_node);
1191 if (state->start > search_end)
1192 break;
1193 if (state->end >= cur_start && (state->state & bits)) {
1194 total_bytes += min(search_end, state->end) + 1 -
1195 max(cur_start, state->start);
1196 if (total_bytes >= max_bytes)
1197 break;
1198 if (!found) {
1199 *start = state->start;
1200 found = 1;
1201 }
1202 }
1203 node = rb_next(node);
1204 if (!node)
1205 break;
1206 }
1207out:
1208 spin_unlock_irq(&tree->lock);
1209 return total_bytes;
1210}
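
The byte count each state contributes comes from clamping the state's range to the query window, both intervals inclusive. The arithmetic in isolation:

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* bytes a state [start, end] contributes to a query window
     * [cur_start, search_end], the expression used above */
    static uint64_t overlap(uint64_t start, uint64_t end,
                            uint64_t cur_start, uint64_t search_end)
    {
        return MIN(search_end, end) + 1 - MAX(cur_start, start);
    }

    int main(void)
    {
        /* state [0, 8191] clipped to window [4096, 12287] -> 4096 bytes */
        printf("%llu\n", (unsigned long long)overlap(0, 8191, 4096, 12287));
        return 0;
    }
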
1211/*
1212 * helper function to lock both pages and extents in the tree.
1213 * pages must be locked first.
1214 */
1215int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1216{
1217 unsigned long index = start >> PAGE_CACHE_SHIFT;
1218 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1219 struct page *page;
1220 int err;
1221
1222 while (index <= end_index) {
1223 page = grab_cache_page(tree->mapping, index);
1224 if (!page) {
1225 err = -ENOMEM;
1226 goto failed;
1227 }
1228 if (IS_ERR(page)) {
1229 err = PTR_ERR(page);
1230 goto failed;
1231 }
1232 index++;
1233 }
1234 lock_extent(tree, start, end, GFP_NOFS);
1235 return 0;
1236
1237failed:
1238 /*
1239 * we failed above in getting the page at 'index', so we undo here
1240 * up to but not including the page at 'index'
1241 */
1242 end_index = index;
1243 index = start >> PAGE_CACHE_SHIFT;
1244 while (index < end_index) {
1245 page = find_get_page(tree->mapping, index);
1246 unlock_page(page);
1247 page_cache_release(page);
1248 index++;
1249 }
1250 return err;
1251}
1252EXPORT_SYMBOL(lock_range);
1253
1254/*
1255 * helper function to unlock both pages and extents in the tree.
1256 */
1257int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1258{
1259 unsigned long index = start >> PAGE_CACHE_SHIFT;
1260 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1261 struct page *page;
1262
1263 while (index <= end_index) {
1264 page = find_get_page(tree->mapping, index);
1265 unlock_page(page);
1266 page_cache_release(page);
1267 index++;
1268 }
1269 unlock_extent(tree, start, end, GFP_NOFS);
1270 return 0;
1271}
1272EXPORT_SYMBOL(unlock_range);
1273
1274/*
1275 * set the private field for a given byte offset in the tree. If there isn't
1276 * an extent_state that starts at 'start', -ENOENT is returned.
1277 */
1278int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1279{
1280 struct rb_node *node;
1281 struct extent_state *state;
1282 int ret = 0;
1283
1284 spin_lock_irq(&tree->lock);
1285 /*
1286 * this search will find all the extents that end after
1287 * our range starts.
1288 */
1289 node = tree_search(tree, start);
1290 if (!node) {
1291 ret = -ENOENT;
1292 goto out;
1293 }
1294 state = rb_entry(node, struct extent_state, rb_node);
1295 if (state->start != start) {
1296 ret = -ENOENT;
1297 goto out;
1298 }
1299 state->private = private;
1300out:
1301 spin_unlock_irq(&tree->lock);
1302 return ret;
1303}
1304
1305int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1306{
1307 struct rb_node *node;
1308 struct extent_state *state;
1309 int ret = 0;
1310
1311 spin_lock_irq(&tree->lock);
1312 /*
1313 * this search will find all the extents that end after
1314 * our range starts.
1315 */
1316 node = tree_search(tree, start);
1317 if (!node) {
1318 ret = -ENOENT;
1319 goto out;
1320 }
1321 state = rb_entry(node, struct extent_state, rb_node);
1322 if (state->start != start) {
1323 ret = -ENOENT;
1324 goto out;
1325 }
1326 *private = state->private;
1327out:
1328 spin_unlock_irq(&tree->lock);
1329 return ret;
1330}
1331
1332/*
1333 * searches a range in the state tree for a given mask.
1334 * If 'filled' == 1, this returns 1 only if every extent in the range
1335 * has the bits set. Otherwise, 1 is returned if any bit in the
1336 * range is found set.
1337 */
1338int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1339 int bits, int filled)
1340{
1341 struct extent_state *state = NULL;
1342 struct rb_node *node;
1343 int bitset = 0;
1344 unsigned long flags;
1345
1346 spin_lock_irqsave(&tree->lock, flags);
1347 node = tree_search(tree, start);
1348 while (node && start <= end) {
1349 state = rb_entry(node, struct extent_state, rb_node);
1350
1351 if (filled && state->start > start) {
1352 bitset = 0;
1353 break;
1354 }
1355
1356 if (state->start > end)
1357 break;
1358
1359 if (state->state & bits) {
1360 bitset = 1;
1361 if (!filled)
1362 break;
1363 } else if (filled) {
1364 bitset = 0;
1365 break;
1366 }
1367 start = state->end + 1;
1368 if (start > end)
1369 break;
1370 node = rb_next(node);
1371 if (!node) {
1372 if (filled)
1373 bitset = 0;
1374 break;
1375 }
1376 }
1377 spin_unlock_irqrestore(&tree->lock, flags);
1378 return bitset;
1379}
1380EXPORT_SYMBOL(test_range_bit);
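
test_range_bit()'s 'filled' flag switches between for-all and there-exists semantics, and a hole in the range counts as a miss in the filled case. A per-byte model of the same rule:

    #include <stdio.h>

    #define N 16
    static int state[N];    /* one state word per byte, as a stand-in tree */

    /* with filled == 1, every byte must have the bits;
     * with filled == 0, any one byte is enough */
    static int test_range(int start, int end, int bits, int filled)
    {
        int i;

        for (i = start; i <= end; i++) {
            if (filled && !(state[i] & bits))
                return 0;
            if (!filled && (state[i] & bits))
                return 1;
        }
        return filled;
    }

    int main(void)
    {
        state[2] = 0x1;
        printf("any: %d filled: %d\n",
               test_range(0, 7, 0x1, 0),   /* 1: offset 2 has the bit */
               test_range(0, 7, 0x1, 1));  /* 0: the others do not */
        return 0;
    }
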
1381
1382/*
1383 * helper function to set a given page up to date if all the
1384 * extents in the tree for that page are up to date
1385 */
1386static int check_page_uptodate(struct extent_io_tree *tree,
1387 struct page *page)
1388{
1389 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1390 u64 end = start + PAGE_CACHE_SIZE - 1;
1391 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1392 SetPageUptodate(page);
1393 return 0;
1394}
1395
1396/*
1397 * helper function to unlock a page if all the extents in the tree
1398 * for that page are unlocked
1399 */
1400static int check_page_locked(struct extent_io_tree *tree,
1401 struct page *page)
1402{
1403 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1404 u64 end = start + PAGE_CACHE_SIZE - 1;
1405 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1406 unlock_page(page);
1407 return 0;
1408}
1409
1410/*
1411 * helper function to end page writeback if all the extents
1412 * in the tree for that page are done with writeback
1413 */
1414static int check_page_writeback(struct extent_io_tree *tree,
1415 struct page *page)
1416{
1417 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1418 u64 end = start + PAGE_CACHE_SIZE - 1;
1419 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1420 end_page_writeback(page);
1421 return 0;
1422}
1423
1424/* lots and lots of room for performance fixes in the end_bio funcs */
1425
1426/*
1427 * after a writepage IO is done, we need to:
1428 * clear the uptodate bits on error
1429 * clear the writeback bits in the extent tree for this IO
1430 * end_page_writeback if the page has no more pending IO
1431 *
1432 * Scheduling is not allowed, so the extent state tree is expected
1433 * to have one and only one object corresponding to this IO.
1434 */
1435static void end_bio_extent_writepage(struct bio *bio, int err)
1436{
1437 int uptodate = err == 0;
1438 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1439 struct extent_io_tree *tree;
1440 u64 start;
1441 u64 end;
1442 int whole_page;
1443 int ret;
1444
1445 do {
1446 struct page *page = bvec->bv_page;
1447 tree = &BTRFS_I(page->mapping->host)->io_tree;
1448
1449 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1450 bvec->bv_offset;
1451 end = start + bvec->bv_len - 1;
1452
1453 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1454 whole_page = 1;
1455 else
1456 whole_page = 0;
1457
1458 if (--bvec >= bio->bi_io_vec)
1459 prefetchw(&bvec->bv_page->flags);
1460 if (tree->ops && tree->ops->writepage_end_io_hook) {
1461 ret = tree->ops->writepage_end_io_hook(page, start,
1462 end, NULL, uptodate);
1463 if (ret)
1464 uptodate = 0;
1465 }
1466
1467 if (!uptodate && tree->ops &&
1468 tree->ops->writepage_io_failed_hook) {
1469 ret = tree->ops->writepage_io_failed_hook(bio, page,
1470 start, end, NULL);
1471 if (ret == 0) {
1472 uptodate = (err == 0);
1473 continue;
1474 }
1475 }
1476
1477 if (!uptodate) {
1478 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1479 ClearPageUptodate(page);
1480 SetPageError(page);
1481 }
1482
1483 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1484
1485 if (whole_page)
1486 end_page_writeback(page);
1487 else
1488 check_page_writeback(tree, page);
1489 } while (bvec >= bio->bi_io_vec);
1490
1491 bio_put(bio);
1492}
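
The completion loop walks the bio_vec array backwards from the last entry, prefetching the next victim's page flags before the current entry is processed. A userspace model of just that iteration idiom:

    #include <stdio.h>

    struct vec { int page; };   /* stand-in for struct bio_vec */

    int main(void)
    {
        struct vec io_vec[3] = { { 1 }, { 2 }, { 3 } };
        struct vec *bvec = io_vec + 3 - 1;   /* bi_io_vec + bi_vcnt - 1 */

        /* same shape as the do/while above: grab the current entry,
         * step backwards, then complete the one we grabbed */
        do {
            struct vec *cur = bvec;

            if (--bvec >= io_vec)
                ;   /* kernel: prefetchw(&bvec->bv_page->flags) */
            printf("completing page %d\n", cur->page);
        } while (bvec >= io_vec);
        return 0;
    }
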
1493
1494/*
1495 * after a readpage IO is done, we need to:
1496 * clear the uptodate bits on error
1497 * set the uptodate bits if things worked
1498 * set the page up to date if all extents in the tree are uptodate
1499 * clear the lock bit in the extent tree
1500 * unlock the page if there are no other extents locked for it
1501 *
1502 * Scheduling is not allowed, so the extent state tree is expected
1503 * to have one and only one object corresponding to this IO.
1504 */
1505static void end_bio_extent_readpage(struct bio *bio, int err)
1506{
1507 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1508 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1509 struct extent_io_tree *tree;
1510 u64 start;
1511 u64 end;
1512 int whole_page;
1513 int ret;
1514
1515 do {
1516 struct page *page = bvec->bv_page;
1517 tree = &BTRFS_I(page->mapping->host)->io_tree;
1518
1519 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1520 bvec->bv_offset;
1521 end = start + bvec->bv_len - 1;
1522
1523 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1524 whole_page = 1;
1525 else
1526 whole_page = 0;
1527
1528 if (--bvec >= bio->bi_io_vec)
1529 prefetchw(&bvec->bv_page->flags);
1530
1531 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1532 ret = tree->ops->readpage_end_io_hook(page, start, end,
1533 NULL);
1534 if (ret)
1535 uptodate = 0;
1536 }
1537 if (!uptodate && tree->ops &&
1538 tree->ops->readpage_io_failed_hook) {
1539 ret = tree->ops->readpage_io_failed_hook(bio, page,
1540 start, end, NULL);
1541 if (ret == 0) {
1542 uptodate =
1543 test_bit(BIO_UPTODATE, &bio->bi_flags);
1544 continue;
1545 }
1546 }
1547
1548 if (uptodate)
1549 set_extent_uptodate(tree, start, end,
1550 GFP_ATOMIC);
1551 unlock_extent(tree, start, end, GFP_ATOMIC);
1552
1553 if (whole_page) {
1554 if (uptodate) {
1555 SetPageUptodate(page);
1556 } else {
1557 ClearPageUptodate(page);
1558 SetPageError(page);
1559 }
1560 unlock_page(page);
1561 } else {
1562 if (uptodate) {
1563 check_page_uptodate(tree, page);
1564 } else {
1565 ClearPageUptodate(page);
1566 SetPageError(page);
1567 }
1568 check_page_locked(tree, page);
1569 }
1570 } while (bvec >= bio->bi_io_vec);
1571
1572 bio_put(bio);
1573}
1574
1575/*
1576 * IO done from prepare_write is pretty simple: we just unlock
1577 * the structs in the extent tree when done, and set the uptodate bits
1578 * as appropriate.
1579 */
1580static void end_bio_extent_preparewrite(struct bio *bio, int err)
1581{
1582 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1583 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1584 struct extent_io_tree *tree;
1585 u64 start;
1586 u64 end;
1587
1588 do {
1589 struct page *page = bvec->bv_page;
1590 tree = &BTRFS_I(page->mapping->host)->io_tree;
1591
1592 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1593 bvec->bv_offset;
1594 end = start + bvec->bv_len - 1;
1595
1596 if (--bvec >= bio->bi_io_vec)
1597 prefetchw(&bvec->bv_page->flags);
1598
1599 if (uptodate) {
1600 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1601 } else {
1602 ClearPageUptodate(page);
1603 SetPageError(page);
1604 }
1605
1606 unlock_extent(tree, start, end, GFP_ATOMIC);
1607
1608 } while (bvec >= bio->bi_io_vec);
1609
1610 bio_put(bio);
1611}
1612
1613static struct bio *
1614extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1615 gfp_t gfp_flags)
1616{
1617 struct bio *bio;
1618
1619 bio = bio_alloc(gfp_flags, nr_vecs);
1620
1621 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1622 while (!bio && (nr_vecs /= 2))
1623 bio = bio_alloc(gfp_flags, nr_vecs);
1624 }
1625
1626 if (bio) {
1627 bio->bi_size = 0;
1628 bio->bi_bdev = bdev;
1629 bio->bi_sector = first_sector;
1630 }
1631 return bio;
1632}
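
Under memory pressure, extent_bio_alloc() degrades gracefully by halving the vector count until bio_alloc() succeeds or the count reaches zero. The same fallback against a fake allocator that rejects large requests:

    #include <stdio.h>
    #include <stdlib.h>

    /* fake allocator that fails above 4 units, to exercise the fallback */
    static void *try_alloc(int nr_vecs)
    {
        return nr_vecs > 4 ? NULL : malloc((size_t)nr_vecs * 64);
    }

    int main(void)
    {
        int nr_vecs = 32;
        void *bio = try_alloc(nr_vecs);

        /* same shape as extent_bio_alloc(): halve the vector count
         * until the allocation fits */
        if (!bio) {
            while (!bio && (nr_vecs /= 2))
                bio = try_alloc(nr_vecs);
        }
        printf("allocated %d vecs (%p)\n", bio ? nr_vecs : 0, bio);
        free(bio);
        return 0;
    }
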
1633
1634static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
1635{
1636 int ret = 0;
1637 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1638 struct page *page = bvec->bv_page;
1639 struct extent_io_tree *tree = bio->bi_private;
1640 struct rb_node *node;
1641 struct extent_state *state;
1642 u64 start;
1643 u64 end;
1644
1645 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1646 end = start + bvec->bv_len - 1;
1647
1648 spin_lock_irq(&tree->lock);
1649 node = __etree_search(tree, start, NULL, NULL);
1650 BUG_ON(!node);
1651 state = rb_entry(node, struct extent_state, rb_node);
1652 while(state->end < end) {
1653 node = rb_next(node);
1654 state = rb_entry(node, struct extent_state, rb_node);
1655 }
1656 BUG_ON(state->end != end);
1657 spin_unlock_irq(&tree->lock);
1658
1659 bio->bi_private = NULL;
1660
1661 bio_get(bio);
1662
1663 if (tree->ops && tree->ops->submit_bio_hook)
1664 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1665 mirror_num);
1666 else
1667 submit_bio(rw, bio);
1668 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1669 ret = -EOPNOTSUPP;
1670 bio_put(bio);
1671 return ret;
1672}
1673
1674static int submit_extent_page(int rw, struct extent_io_tree *tree,
1675 struct page *page, sector_t sector,
1676 size_t size, unsigned long offset,
1677 struct block_device *bdev,
1678 struct bio **bio_ret,
1679 unsigned long max_pages,
1680 bio_end_io_t end_io_func,
1681 int mirror_num)
1682{
1683 int ret = 0;
1684 struct bio *bio;
1685 int nr;
1686
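	/*
	 * try to stuff this page into the bio the caller is building.  We
	 * fall back to submitting the old bio when the new page isn't
	 * physically contiguous with it, when the fs merge_bio_hook
	 * rejects the merge, or when bio_add_page finds the bio full.
	 */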
1687 if (bio_ret && *bio_ret) {
1688 bio = *bio_ret;
1689 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1690 (tree->ops && tree->ops->merge_bio_hook &&
1691 tree->ops->merge_bio_hook(page, offset, size, bio)) ||
1692 bio_add_page(bio, page, size, offset) < size) {
1693 ret = submit_one_bio(rw, bio, mirror_num);
1694 bio = NULL;
1695 } else {
1696 return 0;
1697 }
1698 }
1699 nr = bio_get_nr_vecs(bdev);
1700 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1701	if (!bio) {
1702		printk(KERN_ERR "failed to allocate bio nr %d\n", nr);
		return -ENOMEM;
1703	}
1704
1706 bio_add_page(bio, page, size, offset);
1707 bio->bi_end_io = end_io_func;
1708 bio->bi_private = tree;
1709
1710 if (bio_ret) {
1711 *bio_ret = bio;
1712 } else {
1713 ret = submit_one_bio(rw, bio, mirror_num);
1714 }
1715
1716 return ret;
1717}
1718
1719void set_page_extent_mapped(struct page *page)
1720{
1721 if (!PagePrivate(page)) {
1722 SetPagePrivate(page);
1723 page_cache_get(page);
1724 set_page_private(page, EXTENT_PAGE_PRIVATE);
1725 }
1726}
1727
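/*
 * the head page of an extent buffer keeps the buffer length in the high
 * bits of page->private; the low two bits mark it as an extent page and
 * as the first page (see EXTENT_PAGE_PRIVATE_FIRST_PAGE)
 */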
1728void set_page_extent_head(struct page *page, unsigned long len)
1729{
1730	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | (len << 2));
1731}
1732
1733/*
1734 * basic readpage implementation.  Locked extent state structs are
1735 * inserted into the tree and removed when the IO is done (by the
1736 * end_io handlers)
1737 */
1738static int __extent_read_full_page(struct extent_io_tree *tree,
1739 struct page *page,
1740 get_extent_t *get_extent,
1741 struct bio **bio, int mirror_num)
1742{
1743 struct inode *inode = page->mapping->host;
1744 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1745 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1746 u64 end;
1747 u64 cur = start;
1748 u64 extent_offset;
1749 u64 last_byte = i_size_read(inode);
1750 u64 block_start;
1751 u64 cur_end;
1752 sector_t sector;
1753 struct extent_map *em;
1754 struct block_device *bdev;
1755 int ret;
1756 int nr = 0;
1757 size_t page_offset = 0;
1758 size_t iosize;
1759 size_t blocksize = inode->i_sb->s_blocksize;
1760
1761 set_page_extent_mapped(page);
1762
1763 end = page_end;
1764 lock_extent(tree, start, end, GFP_NOFS);
1765
1766 while (cur <= end) {
1767 if (cur >= last_byte) {
1768 char *userpage;
1769 iosize = PAGE_CACHE_SIZE - page_offset;
1770 userpage = kmap_atomic(page, KM_USER0);
1771 memset(userpage + page_offset, 0, iosize);
1772 flush_dcache_page(page);
1773 kunmap_atomic(userpage, KM_USER0);
1774 set_extent_uptodate(tree, cur, cur + iosize - 1,
1775 GFP_NOFS);
1776 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1777 break;
1778 }
1779 em = get_extent(inode, page, page_offset, cur,
1780 end - cur + 1, 0);
1781 if (IS_ERR(em) || !em) {
1782 SetPageError(page);
1783 unlock_extent(tree, cur, end, GFP_NOFS);
1784 break;
1785 }
1786 extent_offset = cur - em->start;
1787		if (extent_map_end(em) <= cur) {
1788			printk(KERN_ERR "bad mapping em [%Lu %Lu] cur %Lu\n",
			       em->start, extent_map_end(em), cur);
1789		}
1790		BUG_ON(extent_map_end(em) <= cur);
1791		if (end < cur) {
1792			printk(KERN_ERR "bad mapping end %Lu cur %Lu\n",
			       end, cur);
1793		}
1794		BUG_ON(end < cur);
1795
1796 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1797 cur_end = min(extent_map_end(em) - 1, end);
1798 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1799 sector = (em->block_start + extent_offset) >> 9;
1800 bdev = em->bdev;
1801 block_start = em->block_start;
1802 free_extent_map(em);
1803 em = NULL;
1804
1805 /* we've found a hole, just zero and go on */
1806 if (block_start == EXTENT_MAP_HOLE) {
1807 char *userpage;
1808 userpage = kmap_atomic(page, KM_USER0);
1809 memset(userpage + page_offset, 0, iosize);
1810 flush_dcache_page(page);
1811 kunmap_atomic(userpage, KM_USER0);
1812
1813 set_extent_uptodate(tree, cur, cur + iosize - 1,
1814 GFP_NOFS);
1815 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1816 cur = cur + iosize;
1817 page_offset += iosize;
1818 continue;
1819 }
1820 /* the get_extent function already copied into the page */
1821 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1822 check_page_uptodate(tree, page);
1823 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1824 cur = cur + iosize;
1825 page_offset += iosize;
1826 continue;
1827 }
1828		/* we have an inline extent but it didn't get marked
1829		 * uptodate.  Error out
1830		 */
1831 if (block_start == EXTENT_MAP_INLINE) {
1832 SetPageError(page);
1833 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1834 cur = cur + iosize;
1835 page_offset += iosize;
1836 continue;
1837 }
1838
1839 ret = 0;
1840 if (tree->ops && tree->ops->readpage_io_hook) {
1841 ret = tree->ops->readpage_io_hook(page, cur,
1842 cur + iosize - 1);
1843 }
1844 if (!ret) {
1845 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1846 pnr -= page->index;
1847 ret = submit_extent_page(READ, tree, page,
1848 sector, iosize, page_offset,
1849 bdev, bio, pnr,
1850 end_bio_extent_readpage, mirror_num);
1851 nr++;
1852 }
1853 if (ret)
1854 SetPageError(page);
1855 cur = cur + iosize;
1856 page_offset += iosize;
1857 }
1858 if (!nr) {
1859 if (!PageError(page))
1860 SetPageUptodate(page);
1861 unlock_page(page);
1862 }
1863 return 0;
1864}
1865
1866int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1867 get_extent_t *get_extent)
1868{
1869 struct bio *bio = NULL;
1870 int ret;
1871
1872 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
1873 if (bio)
1874 submit_one_bio(READ, bio, 0);
1875 return ret;
1876}
1877EXPORT_SYMBOL(extent_read_full_page);
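
/*
 * a minimal sketch of the caller side: a filesystem's ->readpage just
 * finds its io_tree and passes down its get_extent callback.  The
 * btrfs_readpage/btrfs_get_extent names below are illustrative:
 *
 *	static int btrfs_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent);
 *	}
 */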
1878
1879/*
1880 * the writepage semantics are similar to regular writepage. extent
1881 * records are inserted to lock ranges in the tree, and as dirty areas
1882 * are found, they are marked writeback. Then the lock bits are removed
1883 * and the end_io handler clears the writeback ranges
1884 */
1885static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1886 void *data)
1887{
1888 struct inode *inode = page->mapping->host;
1889 struct extent_page_data *epd = data;
1890 struct extent_io_tree *tree = epd->tree;
1891 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1892 u64 delalloc_start;
1893 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1894 u64 end;
1895 u64 cur = start;
1896 u64 extent_offset;
1897 u64 last_byte = i_size_read(inode);
1898 u64 block_start;
1899 u64 iosize;
1900 u64 unlock_start;
1901 sector_t sector;
1902 struct extent_map *em;
1903 struct block_device *bdev;
1904 int ret;
1905 int nr = 0;
1906 size_t pg_offset = 0;
1907 size_t blocksize;
1908 loff_t i_size = i_size_read(inode);
1909 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1910 u64 nr_delalloc;
1911 u64 delalloc_end;
1912
1913 WARN_ON(!PageLocked(page));
1914 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
1915 if (page->index > end_index ||
1916 (page->index == end_index && !pg_offset)) {
1917 page->mapping->a_ops->invalidatepage(page, 0);
1918 unlock_page(page);
1919 return 0;
1920 }
1921
1922 if (page->index == end_index) {
1923 char *userpage;
1924
1925 userpage = kmap_atomic(page, KM_USER0);
1926 memset(userpage + pg_offset, 0,
1927 PAGE_CACHE_SIZE - pg_offset);
1928 kunmap_atomic(userpage, KM_USER0);
1929 flush_dcache_page(page);
1930 }
1931 pg_offset = 0;
1932
1933 set_page_extent_mapped(page);
1934
1935 delalloc_start = start;
1936 delalloc_end = 0;
1937	while (delalloc_end < page_end) {
1938 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1939 &delalloc_end,
1940 128 * 1024 * 1024);
1941 if (nr_delalloc == 0) {
1942 delalloc_start = delalloc_end + 1;
1943 continue;
1944 }
1945 tree->ops->fill_delalloc(inode, delalloc_start,
1946 delalloc_end);
1947 clear_extent_bit(tree, delalloc_start,
1948 delalloc_end,
1949 EXTENT_LOCKED | EXTENT_DELALLOC,
1950 1, 0, GFP_NOFS);
1951 delalloc_start = delalloc_end + 1;
1952 }
1953 lock_extent(tree, start, page_end, GFP_NOFS);
1954 unlock_start = start;
1955
1956 if (tree->ops && tree->ops->writepage_start_hook) {
1957 ret = tree->ops->writepage_start_hook(page, start, page_end);
1958 if (ret == -EAGAIN) {
1959 unlock_extent(tree, start, page_end, GFP_NOFS);
1960 redirty_page_for_writepage(wbc, page);
1961 unlock_page(page);
1962 return 0;
1963 }
1964 }
1965
1966 end = page_end;
1967	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1968		printk(KERN_ERR "found delalloc bits after lock_extent\n");
1969	}
1970
1971 if (last_byte <= start) {
1972 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1973 unlock_extent(tree, start, page_end, GFP_NOFS);
1974 if (tree->ops && tree->ops->writepage_end_io_hook)
1975 tree->ops->writepage_end_io_hook(page, start,
1976 page_end, NULL, 1);
1977 unlock_start = page_end + 1;
1978 goto done;
1979 }
1980
1981 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1982 blocksize = inode->i_sb->s_blocksize;
1983
1984 while (cur <= end) {
1985 if (cur >= last_byte) {
1986 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1987 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
1988 if (tree->ops && tree->ops->writepage_end_io_hook)
1989 tree->ops->writepage_end_io_hook(page, cur,
1990 page_end, NULL, 1);
1991 unlock_start = page_end + 1;
1992 break;
1993 }
1994 em = epd->get_extent(inode, page, pg_offset, cur,
1995 end - cur + 1, 1);
1996 if (IS_ERR(em) || !em) {
1997 SetPageError(page);
1998 break;
1999 }
2000
2001 extent_offset = cur - em->start;
2002 BUG_ON(extent_map_end(em) <= cur);
2003 BUG_ON(end < cur);
2004 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2005 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2006 sector = (em->block_start + extent_offset) >> 9;
2007 bdev = em->bdev;
2008 block_start = em->block_start;
2009 free_extent_map(em);
2010 em = NULL;
2011
2012 if (block_start == EXTENT_MAP_HOLE ||
2013 block_start == EXTENT_MAP_INLINE) {
2014 clear_extent_dirty(tree, cur,
2015 cur + iosize - 1, GFP_NOFS);
2016
2017			unlock_extent(tree, unlock_start, cur + iosize - 1,
2018 GFP_NOFS);
2019
2020 if (tree->ops && tree->ops->writepage_end_io_hook)
2021 tree->ops->writepage_end_io_hook(page, cur,
2022 cur + iosize - 1,
2023 NULL, 1);
2024 cur = cur + iosize;
2025 pg_offset += iosize;
2026 unlock_start = cur;
2027 continue;
2028 }
2029
2030 /* leave this out until we have a page_mkwrite call */
2031 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2032 EXTENT_DIRTY, 0)) {
2033 cur = cur + iosize;
2034 pg_offset += iosize;
2035 continue;
2036 }
2037 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2038 if (tree->ops && tree->ops->writepage_io_hook) {
2039 ret = tree->ops->writepage_io_hook(page, cur,
2040 cur + iosize - 1);
2041 } else {
2042 ret = 0;
2043 }
2044 if (ret) {
2045 SetPageError(page);
2046 } else {
2047 unsigned long max_nr = end_index + 1;
2048
2049 set_range_writeback(tree, cur, cur + iosize - 1);
2050 if (!PageWriteback(page)) {
2051				printk(KERN_WARNING "page %lu not writeback, "
2052 "cur %llu end %llu\n", page->index,
2053 (unsigned long long)cur,
2054 (unsigned long long)end);
2055 }
2056
2057 ret = submit_extent_page(WRITE, tree, page, sector,
2058 iosize, pg_offset, bdev,
2059 &epd->bio, max_nr,
2060 end_bio_extent_writepage, 0);
2061 if (ret)
2062 SetPageError(page);
2063 }
2064 cur = cur + iosize;
2065 pg_offset += iosize;
2066 nr++;
2067 }
2068done:
2069 if (nr == 0) {
2070 /* make sure the mapping tag for page dirty gets cleared */
2071 set_page_writeback(page);
2072 end_page_writeback(page);
2073 }
2074 if (unlock_start <= page_end)
2075 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2076 unlock_page(page);
2077 return 0;
2078}
2079
2080/**
2081 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2082 * @mapping: address space structure to write
2083 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2084 * @writepage: function called for each page
2085 * @data: data passed to writepage function
2086 *
2087 * If a page is already under I/O, write_cache_pages() skips it, even
2088 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2089 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2090 * and msync() need to guarantee that all the data which was dirty at the time
2091 * the call was made get new I/O started against them. If wbc->sync_mode is
2092 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2093 * existing IO to complete.
2094 */
2095int extent_write_cache_pages(struct extent_io_tree *tree,
2096 struct address_space *mapping,
2097 struct writeback_control *wbc,
2098 writepage_t writepage, void *data)
2099{
2100 struct backing_dev_info *bdi = mapping->backing_dev_info;
2101 int ret = 0;
2102 int done = 0;
2103 struct pagevec pvec;
2104 int nr_pages;
2105 pgoff_t index;
2106 pgoff_t end; /* Inclusive */
2107 int scanned = 0;
2108 int range_whole = 0;
2109
2110 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2111 wbc->encountered_congestion = 1;
2112 return 0;
2113 }
2114
2115 pagevec_init(&pvec, 0);
2116 if (wbc->range_cyclic) {
2117 index = mapping->writeback_index; /* Start from prev offset */
2118 end = -1;
2119 } else {
2120 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2121 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2122 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2123 range_whole = 1;
2124 scanned = 1;
2125 }
2126retry:
2127 while (!done && (index <= end) &&
2128 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2129 PAGECACHE_TAG_DIRTY,
2130 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2131 unsigned i;
2132
2133 scanned = 1;
2134 for (i = 0; i < nr_pages; i++) {
2135 struct page *page = pvec.pages[i];
2136
2137 /*
2138 * At this point we hold neither mapping->tree_lock nor
2139 * lock on the page itself: the page may be truncated or
2140 * invalidated (changing page->mapping to NULL), or even
2141 * swizzled back from swapper_space to tmpfs file
2142 * mapping
2143 */
2144 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2145 tree->ops->write_cache_pages_lock_hook(page);
2146 else
2147 lock_page(page);
2148
2149 if (unlikely(page->mapping != mapping)) {
2150 unlock_page(page);
2151 continue;
2152 }
2153
2154 if (!wbc->range_cyclic && page->index > end) {
2155 done = 1;
2156 unlock_page(page);
2157 continue;
2158 }
2159
2160 if (wbc->sync_mode != WB_SYNC_NONE)
2161 wait_on_page_writeback(page);
2162
2163 if (PageWriteback(page) ||
2164 !clear_page_dirty_for_io(page)) {
2165 unlock_page(page);
2166 continue;
2167 }
2168
2169 ret = (*writepage)(page, wbc, data);
2170
2171 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2172 unlock_page(page);
2173 ret = 0;
2174 }
2175 if (ret || (--(wbc->nr_to_write) <= 0))
2176 done = 1;
2177 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2178 wbc->encountered_congestion = 1;
2179 done = 1;
2180 }
2181 }
2182 pagevec_release(&pvec);
2183 cond_resched();
2184 }
2185 if (!scanned && !done) {
2186 /*
2187 * We hit the last page and there is more work to be done: wrap
2188 * back to the start of the file
2189 */
2190 scanned = 1;
2191 index = 0;
2192 goto retry;
2193 }
2194 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2195 mapping->writeback_index = index;
2196
2197 if (wbc->range_cont)
2198		wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
2199 return ret;
2200}
2201EXPORT_SYMBOL(extent_write_cache_pages);
2202
2203int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2204 get_extent_t *get_extent,
2205 struct writeback_control *wbc)
2206{
2207 int ret;
2208 struct address_space *mapping = page->mapping;
2209 struct extent_page_data epd = {
2210 .bio = NULL,
2211 .tree = tree,
2212 .get_extent = get_extent,
2213 };
2214 struct writeback_control wbc_writepages = {
2215 .bdi = wbc->bdi,
2216 .sync_mode = WB_SYNC_NONE,
2217 .older_than_this = NULL,
2218 .nr_to_write = 64,
2219 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2220 .range_end = (loff_t)-1,
2221 };
2222
2223
2224 ret = __extent_writepage(page, wbc, &epd);
2225
2226 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2227 __extent_writepage, &epd);
2228 if (epd.bio) {
2229 submit_one_bio(WRITE, epd.bio, 0);
2230 }
2231 return ret;
2232}
2233EXPORT_SYMBOL(extent_write_full_page);
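
/*
 * the matching ->writepage wrapper is just as thin; sketched here with
 * illustrative names:
 *
 *	static int btrfs_writepage(struct page *page,
 *				   struct writeback_control *wbc)
 *	{
 *		struct extent_io_tree *tree;
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_write_full_page(tree, page, btrfs_get_extent,
 *					      wbc);
 *	}
 */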
2234
2235
2236int extent_writepages(struct extent_io_tree *tree,
2237 struct address_space *mapping,
2238 get_extent_t *get_extent,
2239 struct writeback_control *wbc)
2240{
2241 int ret = 0;
2242 struct extent_page_data epd = {
2243 .bio = NULL,
2244 .tree = tree,
2245 .get_extent = get_extent,
2246 };
2247
2248 ret = extent_write_cache_pages(tree, mapping, wbc,
2249 __extent_writepage, &epd);
2250 if (epd.bio) {
2251 submit_one_bio(WRITE, epd.bio, 0);
2252 }
2253 return ret;
2254}
2255EXPORT_SYMBOL(extent_writepages);
2256
2257int extent_readpages(struct extent_io_tree *tree,
2258 struct address_space *mapping,
2259 struct list_head *pages, unsigned nr_pages,
2260 get_extent_t get_extent)
2261{
2262 struct bio *bio = NULL;
2263 unsigned page_idx;
2264 struct pagevec pvec;
2265
2266 pagevec_init(&pvec, 0);
2267 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2268 struct page *page = list_entry(pages->prev, struct page, lru);
2269
2270 prefetchw(&page->flags);
2271 list_del(&page->lru);
2272 /*
2273 * what we want to do here is call add_to_page_cache_lru,
2274 * but that isn't exported, so we reproduce it here
2275 */
2276 if (!add_to_page_cache(page, mapping,
2277 page->index, GFP_KERNEL)) {
2278
2279 /* open coding of lru_cache_add, also not exported */
2280 page_cache_get(page);
2281 if (!pagevec_add(&pvec, page))
2282 __pagevec_lru_add(&pvec);
2283 __extent_read_full_page(tree, page, get_extent,
2284 &bio, 0);
2285 }
2286 page_cache_release(page);
2287 }
2288 if (pagevec_count(&pvec))
2289 __pagevec_lru_add(&pvec);
2290 BUG_ON(!list_empty(pages));
2291 if (bio)
2292 submit_one_bio(READ, bio, 0);
2293 return 0;
2294}
2295EXPORT_SYMBOL(extent_readpages);
2296
2297/*
2298 * basic invalidatepage code, this waits on any locked or writeback
2299 * ranges corresponding to the page, and then deletes any extent state
2300 * records from the tree
2301 */
2302int extent_invalidatepage(struct extent_io_tree *tree,
2303 struct page *page, unsigned long offset)
2304{
2305 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2306 u64 end = start + PAGE_CACHE_SIZE - 1;
2307 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2308
2309	start += (offset + blocksize - 1) & ~(blocksize - 1);
2310 if (start > end)
2311 return 0;
2312
2313 lock_extent(tree, start, end, GFP_NOFS);
2314 wait_on_extent_writeback(tree, start, end);
2315 clear_extent_bit(tree, start, end,
2316 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2317 1, 1, GFP_NOFS);
2318 return 0;
2319}
2320EXPORT_SYMBOL(extent_invalidatepage);
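
/*
 * a minimal ->invalidatepage built on this helper might look like the
 * sketch below (illustrative; a real fs may also need to clean up its
 * own ordered-data state first):
 *
 *	static void btrfs_invalidatepage(struct page *page,
 *					 unsigned long offset)
 *	{
 *		struct extent_io_tree *tree;
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		extent_invalidatepage(tree, page, offset);
 *	}
 */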
2321
2322/*
2323 * simple commit_write call, the page is marked dirty and the inode
2324 * size is updated when the write extends past i_size
2325 */
2326int extent_commit_write(struct extent_io_tree *tree,
2327 struct inode *inode, struct page *page,
2328 unsigned from, unsigned to)
2329{
2330 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2331
2332 set_page_extent_mapped(page);
2333 set_page_dirty(page);
2334
2335 if (pos > inode->i_size) {
2336 i_size_write(inode, pos);
2337 mark_inode_dirty(inode);
2338 }
2339 return 0;
2340}
2341EXPORT_SYMBOL(extent_commit_write);
2342
2343int extent_prepare_write(struct extent_io_tree *tree,
2344 struct inode *inode, struct page *page,
2345 unsigned from, unsigned to, get_extent_t *get_extent)
2346{
2347 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2348 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2349 u64 block_start;
2350 u64 orig_block_start;
2351 u64 block_end;
2352 u64 cur_end;
2353 struct extent_map *em;
2354 unsigned blocksize = 1 << inode->i_blkbits;
2355 size_t page_offset = 0;
2356 size_t block_off_start;
2357 size_t block_off_end;
2358 int err = 0;
2359 int iocount = 0;
2360 int ret = 0;
2361 int isnew;
2362
2363 set_page_extent_mapped(page);
2364
2365 block_start = (page_start + from) & ~((u64)blocksize - 1);
2366 block_end = (page_start + to - 1) | (blocksize - 1);
2367 orig_block_start = block_start;
2368
2369 lock_extent(tree, page_start, page_end, GFP_NOFS);
2370	while (block_start <= block_end) {
2371 em = get_extent(inode, page, page_offset, block_start,
2372 block_end - block_start + 1, 1);
2373 if (IS_ERR(em) || !em) {
2374 goto err;
2375 }
2376 cur_end = min(block_end, extent_map_end(em) - 1);
2377 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2378 block_off_end = block_off_start + blocksize;
2379 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2380
2381 if (!PageUptodate(page) && isnew &&
2382 (block_off_end > to || block_off_start < from)) {
2383 void *kaddr;
2384
2385 kaddr = kmap_atomic(page, KM_USER0);
2386 if (block_off_end > to)
2387 memset(kaddr + to, 0, block_off_end - to);
2388 if (block_off_start < from)
2389 memset(kaddr + block_off_start, 0,
2390 from - block_off_start);
2391 flush_dcache_page(page);
2392 kunmap_atomic(kaddr, KM_USER0);
2393 }
2394 if ((em->block_start != EXTENT_MAP_HOLE &&
2395 em->block_start != EXTENT_MAP_INLINE) &&
2396 !isnew && !PageUptodate(page) &&
2397 (block_off_end > to || block_off_start < from) &&
2398 !test_range_bit(tree, block_start, cur_end,
2399 EXTENT_UPTODATE, 1)) {
2400 u64 sector;
2401 u64 extent_offset = block_start - em->start;
2402 size_t iosize;
2403 sector = (em->block_start + extent_offset) >> 9;
2404 iosize = (cur_end - block_start + blocksize) &
2405 ~((u64)blocksize - 1);
2406 /*
2407 * we've already got the extent locked, but we
2408 * need to split the state such that our end_bio
2409 * handler can clear the lock.
2410 */
2411 set_extent_bit(tree, block_start,
2412 block_start + iosize - 1,
2413 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2414 ret = submit_extent_page(READ, tree, page,
2415 sector, iosize, page_offset, em->bdev,
2416 NULL, 1,
2417 end_bio_extent_preparewrite, 0);
2418 iocount++;
2419 block_start = block_start + iosize;
2420 } else {
2421 set_extent_uptodate(tree, block_start, cur_end,
2422 GFP_NOFS);
2423 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2424 block_start = cur_end + 1;
2425 }
2426 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2427 free_extent_map(em);
2428 }
2429 if (iocount) {
2430 wait_extent_bit(tree, orig_block_start,
2431 block_end, EXTENT_LOCKED);
2432 }
2433 check_page_uptodate(tree, page);
2434err:
2435 /* FIXME, zero out newly allocated blocks on error */
2436 return err;
2437}
2438EXPORT_SYMBOL(extent_prepare_write);
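
/*
 * prepare_write and commit_write are used as a pair around the copy
 * from userland; error handling omitted in this sketch:
 *
 *	extent_prepare_write(tree, inode, page, from, to, get_extent);
 *	... copy bytes [from, to) into the page ...
 *	extent_commit_write(tree, inode, page, from, to);
 */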
2439
2440/*
2441 * a helper for releasepage, this tests for areas of the page that
2442 * are locked or under IO and drops the related state bits if it is safe
2443 * to drop the page.
2444 */
2445int try_release_extent_state(struct extent_map_tree *map,
2446 struct extent_io_tree *tree, struct page *page,
2447 gfp_t mask)
2448{
2449 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2450 u64 end = start + PAGE_CACHE_SIZE - 1;
2451 int ret = 1;
2452
2453 if (test_range_bit(tree, start, end,
2454 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2455 ret = 0;
2456 else {
2457 if ((mask & GFP_NOFS) == GFP_NOFS)
2458 mask = GFP_NOFS;
2459 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2460 1, 1, mask);
2461 }
2462 return ret;
2463}
2464EXPORT_SYMBOL(try_release_extent_state);
2465
2466/*
2467 * a helper for releasepage. As long as there are no locked extents
2468 * in the range corresponding to the page, both state records and extent
2469 * map records are removed
2470 */
2471int try_release_extent_mapping(struct extent_map_tree *map,
2472 struct extent_io_tree *tree, struct page *page,
2473 gfp_t mask)
2474{
2475 struct extent_map *em;
2476 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2477 u64 end = start + PAGE_CACHE_SIZE - 1;
2478
2479 if ((mask & __GFP_WAIT) &&
2480 page->mapping->host->i_size > 16 * 1024 * 1024) {
2481 u64 len;
2482 while (start <= end) {
2483 len = end - start + 1;
2484 spin_lock(&map->lock);
2485 em = lookup_extent_mapping(map, start, len);
2486 if (!em || IS_ERR(em)) {
2487 spin_unlock(&map->lock);
2488 break;
2489 }
2490 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2491 em->start != start) {
2492 spin_unlock(&map->lock);
2493 free_extent_map(em);
2494 break;
2495 }
2496 if (!test_range_bit(tree, em->start,
2497 extent_map_end(em) - 1,
2498 EXTENT_LOCKED, 0)) {
2499 remove_extent_mapping(map, em);
2500 /* once for the rb tree */
2501 free_extent_map(em);
2502 }
2503 start = extent_map_end(em);
2504 spin_unlock(&map->lock);
2505
2506 /* once for us */
2507 free_extent_map(em);
2508 }
2509 }
2510 return try_release_extent_state(map, tree, page, mask);
2511}
2512EXPORT_SYMBOL(try_release_extent_mapping);
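
/*
 * an illustrative ->releasepage wrapper; the extent_map_tree here is
 * assumed to live next to the io_tree in the fs inode:
 *
 *	static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 *	{
 *		struct extent_io_tree *tree;
 *		struct extent_map_tree *map;
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		map = &BTRFS_I(page->mapping->host)->extent_tree;
 *		return try_release_extent_mapping(map, tree, page, gfp_flags);
 *	}
 */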
2513
2514sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2515 get_extent_t *get_extent)
2516{
2517 struct inode *inode = mapping->host;
2518	u64 start = (u64)iblock << inode->i_blkbits;
2519 sector_t sector = 0;
2520 struct extent_map *em;
2521
2522 em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2523 if (!em || IS_ERR(em))
2524 return 0;
2525
2526 if (em->block_start == EXTENT_MAP_INLINE ||
2527 em->block_start == EXTENT_MAP_HOLE)
2528 goto out;
2529
2530 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2531out:
2532 free_extent_map(em);
2533 return sector;
2534}
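
/*
 * extent_bmap slots straight into ->bmap for FIBMAP-style lookups,
 * e.g. (illustrative):
 *
 *	static sector_t btrfs_bmap(struct address_space *mapping,
 *				   sector_t block)
 *	{
 *		return extent_bmap(mapping, block, btrfs_get_extent);
 *	}
 */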
2535
2536static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2537 unsigned long i)
2538{
2539 struct page *p;
2540 struct address_space *mapping;
2541
2542 if (i == 0)
2543 return eb->first_page;
2544 i += eb->start >> PAGE_CACHE_SHIFT;
2545 mapping = eb->first_page->mapping;
2546 if (!mapping)
2547 return NULL;
2548
2549 /*
2550 * extent_buffer_page is only called after pinning the page
2551 * by increasing the reference count. So we know the page must
2552 * be in the radix tree.
2553 */
2554 rcu_read_lock();
2555 p = radix_tree_lookup(&mapping->page_tree, i);
2556 rcu_read_unlock();
2557
2558 return p;
2559}
2560
2561static inline unsigned long num_extent_pages(u64 start, u64 len)
2562{
2563 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2564 (start >> PAGE_CACHE_SHIFT);
2565}
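
/*
 * worked example with 4K pages: a 4K buffer starting 2K into a page
 * still spans two pages:
 *	((6144 + 4096 + 4095) >> 12) - (6144 >> 12) == 3 - 1 == 2
 */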
2566
2567static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2568 u64 start,
2569 unsigned long len,
2570 gfp_t mask)
2571{
2572 struct extent_buffer *eb = NULL;
2573#ifdef LEAK_DEBUG
2574 unsigned long flags;
2575#endif
2576
2577	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2578	eb->start = start;
2579	eb->len = len;
2580 mutex_init(&eb->mutex);
2581#ifdef LEAK_DEBUG
2582 spin_lock_irqsave(&leak_lock, flags);
2583 list_add(&eb->leak_list, &buffers);
2584 spin_unlock_irqrestore(&leak_lock, flags);
2585#endif
2586 atomic_set(&eb->refs, 1);
2587
2588 return eb;
2589}
2590
2591static void __free_extent_buffer(struct extent_buffer *eb)
2592{
2593#ifdef LEAK_DEBUG
2594 unsigned long flags;
2595 spin_lock_irqsave(&leak_lock, flags);
2596 list_del(&eb->leak_list);
2597 spin_unlock_irqrestore(&leak_lock, flags);
2598#endif
2599 kmem_cache_free(extent_buffer_cache, eb);
2600}
2601
2602struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2603 u64 start, unsigned long len,
2604 struct page *page0,
2605 gfp_t mask)
2606{
2607 unsigned long num_pages = num_extent_pages(start, len);
2608 unsigned long i;
2609 unsigned long index = start >> PAGE_CACHE_SHIFT;
2610 struct extent_buffer *eb;
2611 struct extent_buffer *exists = NULL;
2612 struct page *p;
2613 struct address_space *mapping = tree->mapping;
2614 int uptodate = 1;
2615
2616 spin_lock(&tree->buffer_lock);
2617 eb = buffer_search(tree, start);
2618 if (eb) {
2619 atomic_inc(&eb->refs);
2620 spin_unlock(&tree->buffer_lock);
2621 mark_page_accessed(eb->first_page);
2622 return eb;
2623 }
2624 spin_unlock(&tree->buffer_lock);
2625
2626 eb = __alloc_extent_buffer(tree, start, len, mask);
2627 if (!eb)
2628 return NULL;
2629
2630 if (page0) {
2631 eb->first_page = page0;
2632 i = 1;
2633 index++;
2634 page_cache_get(page0);
2635 mark_page_accessed(page0);
2636 set_page_extent_mapped(page0);
2637 set_page_extent_head(page0, len);
2638 uptodate = PageUptodate(page0);
2639 } else {
2640 i = 0;
2641 }
2642 for (; i < num_pages; i++, index++) {
2643 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2644 if (!p) {
2645 WARN_ON(1);
2646 goto free_eb;
2647 }
2648 set_page_extent_mapped(p);
2649 mark_page_accessed(p);
2650 if (i == 0) {
2651 eb->first_page = p;
2652 set_page_extent_head(p, len);
2653 } else {
2654 set_page_private(p, EXTENT_PAGE_PRIVATE);
2655 }
2656 if (!PageUptodate(p))
2657 uptodate = 0;
2658 unlock_page(p);
2659 }
2660 if (uptodate)
2661 eb->flags |= EXTENT_UPTODATE;
2662 eb->flags |= EXTENT_BUFFER_FILLED;
2663
2664 spin_lock(&tree->buffer_lock);
2665 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2666 if (exists) {
2667 /* add one reference for the caller */
2668 atomic_inc(&exists->refs);
2669 spin_unlock(&tree->buffer_lock);
2670 goto free_eb;
2671 }
2672 spin_unlock(&tree->buffer_lock);
2673
2674 /* add one reference for the tree */
2675 atomic_inc(&eb->refs);
2676 return eb;
2677
2678free_eb:
2679 if (!atomic_dec_and_test(&eb->refs))
2680 return exists;
2681 for (index = 1; index < i; index++)
2682 page_cache_release(extent_buffer_page(eb, index));
2683 page_cache_release(extent_buffer_page(eb, 0));
2684 __free_extent_buffer(eb);
2685 return exists;
2686}
2687EXPORT_SYMBOL(alloc_extent_buffer);
2688
2689struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2690 u64 start, unsigned long len,
2691 gfp_t mask)
2692{
2693 struct extent_buffer *eb;
2694
2695 spin_lock(&tree->buffer_lock);
2696 eb = buffer_search(tree, start);
2697 if (eb)
2698 atomic_inc(&eb->refs);
2699 spin_unlock(&tree->buffer_lock);
2700
2701 if (eb)
2702 mark_page_accessed(eb->first_page);
2703
2704 return eb;
2705}
2706EXPORT_SYMBOL(find_extent_buffer);
2707
2708void free_extent_buffer(struct extent_buffer *eb)
2709{
2710 if (!eb)
2711 return;
2712
2713 if (!atomic_dec_and_test(&eb->refs))
2714 return;
2715
2716 WARN_ON(1);
2717}
2718EXPORT_SYMBOL(free_extent_buffer);
2719
2720int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2721 struct extent_buffer *eb)
2722{
2723 int set;
2724 unsigned long i;
2725 unsigned long num_pages;
2726 struct page *page;
2727
2728 u64 start = eb->start;
2729 u64 end = start + eb->len - 1;
2730
2731 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2732 num_pages = num_extent_pages(eb->start, eb->len);
2733
2734 for (i = 0; i < num_pages; i++) {
2735 page = extent_buffer_page(eb, i);
2736 lock_page(page);
2737 if (i == 0)
2738 set_page_extent_head(page, eb->len);
2739 else
2740 set_page_private(page, EXTENT_PAGE_PRIVATE);
2741
2742 /*
2743 * if we're on the last page or the first page and the
2744 * block isn't aligned on a page boundary, do extra checks
2745		 * to make sure we don't clean a page that is partially dirty
2746 */
2747 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2748 ((i == num_pages - 1) &&
2749 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2750 start = (u64)page->index << PAGE_CACHE_SHIFT;
2751 end = start + PAGE_CACHE_SIZE - 1;
2752 if (test_range_bit(tree, start, end,
2753 EXTENT_DIRTY, 0)) {
2754 unlock_page(page);
2755 continue;
2756 }
2757 }
2758 clear_page_dirty_for_io(page);
2759 spin_lock_irq(&page->mapping->tree_lock);
2760 if (!PageDirty(page)) {
2761 radix_tree_tag_clear(&page->mapping->page_tree,
2762 page_index(page),
2763 PAGECACHE_TAG_DIRTY);
2764 }
2765 spin_unlock_irq(&page->mapping->tree_lock);
2766 unlock_page(page);
2767 }
2768 return 0;
2769}
2770EXPORT_SYMBOL(clear_extent_buffer_dirty);
2771
2772int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2773 struct extent_buffer *eb)
2774{
2775 return wait_on_extent_writeback(tree, eb->start,
2776 eb->start + eb->len - 1);
2777}
2778EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2779
2780int set_extent_buffer_dirty(struct extent_io_tree *tree,
2781 struct extent_buffer *eb)
2782{
2783 unsigned long i;
2784 unsigned long num_pages;
2785
2786 num_pages = num_extent_pages(eb->start, eb->len);
2787 for (i = 0; i < num_pages; i++) {
2788 struct page *page = extent_buffer_page(eb, i);
2789 /* writepage may need to do something special for the
2790 * first page, we have to make sure page->private is
2791 * properly set. releasepage may drop page->private
2792 * on us if the page isn't already dirty.
2793 */
2794 lock_page(page);
2795 if (i == 0) {
2796 set_page_extent_head(page, eb->len);
2797 } else if (PagePrivate(page) &&
2798 page->private != EXTENT_PAGE_PRIVATE) {
2799 set_page_extent_mapped(page);
2800 }
2801 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2802 set_extent_dirty(tree, page_offset(page),
2803				 page_offset(page) + PAGE_CACHE_SIZE - 1,
2804 GFP_NOFS);
2805 unlock_page(page);
2806 }
2807 return 0;
2808}
2809EXPORT_SYMBOL(set_extent_buffer_dirty);
2810
2811int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2812 struct extent_buffer *eb)
2813{
2814 unsigned long i;
2815 struct page *page;
2816 unsigned long num_pages;
2817
2818 num_pages = num_extent_pages(eb->start, eb->len);
2819 eb->flags &= ~EXTENT_UPTODATE;
2820
2821 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2822 GFP_NOFS);
2823 for (i = 0; i < num_pages; i++) {
2824 page = extent_buffer_page(eb, i);
2825 if (page)
2826 ClearPageUptodate(page);
2827 }
2828 return 0;
2829}
2830
2831int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2832 struct extent_buffer *eb)
2833{
2834 unsigned long i;
2835 struct page *page;
2836 unsigned long num_pages;
2837
2838 num_pages = num_extent_pages(eb->start, eb->len);
2839
2840 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2841 GFP_NOFS);
2842 for (i = 0; i < num_pages; i++) {
2843 page = extent_buffer_page(eb, i);
2844 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2845 ((i == num_pages - 1) &&
2846 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2847 check_page_uptodate(tree, page);
2848 continue;
2849 }
2850 SetPageUptodate(page);
2851 }
2852 return 0;
2853}
2854EXPORT_SYMBOL(set_extent_buffer_uptodate);
2855
2856int extent_range_uptodate(struct extent_io_tree *tree,
2857 u64 start, u64 end)
2858{
2859 struct page *page;
2860 int ret;
2861 int pg_uptodate = 1;
2862 int uptodate;
2863 unsigned long index;
2864
2865 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
2866 if (ret)
2867 return 1;
2868	while (start <= end) {
2869 index = start >> PAGE_CACHE_SHIFT;
2870		page = find_get_page(tree->mapping, index);
2871		uptodate = page && PageUptodate(page);
2872		if (page)
			page_cache_release(page);
2873 if (!uptodate) {
2874 pg_uptodate = 0;
2875 break;
2876 }
2877 start += PAGE_CACHE_SIZE;
2878 }
2879 return pg_uptodate;
2880}
2881
2882int extent_buffer_uptodate(struct extent_io_tree *tree,
2883 struct extent_buffer *eb)
2884{
2885 int ret = 0;
2886 unsigned long num_pages;
2887 unsigned long i;
2888 struct page *page;
2889 int pg_uptodate = 1;
2890
2891 if (eb->flags & EXTENT_UPTODATE)
2892 return 1;
2893
2894 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2895 EXTENT_UPTODATE, 1);
2896 if (ret)
2897 return ret;
2898
2899 num_pages = num_extent_pages(eb->start, eb->len);
2900 for (i = 0; i < num_pages; i++) {
2901 page = extent_buffer_page(eb, i);
2902 if (!PageUptodate(page)) {
2903 pg_uptodate = 0;
2904 break;
2905 }
2906 }
2907 return pg_uptodate;
2908}
2909EXPORT_SYMBOL(extent_buffer_uptodate);
2910
2911int read_extent_buffer_pages(struct extent_io_tree *tree,
2912 struct extent_buffer *eb,
2913 u64 start, int wait,
2914 get_extent_t *get_extent, int mirror_num)
2915{
2916 unsigned long i;
2917 unsigned long start_i;
2918 struct page *page;
2919 int err;
2920 int ret = 0;
2921 int locked_pages = 0;
2922 int all_uptodate = 1;
2923 int inc_all_pages = 0;
2924 unsigned long num_pages;
2925 struct bio *bio = NULL;
2926
2927 if (eb->flags & EXTENT_UPTODATE)
2928 return 0;
2929
2930 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2931 EXTENT_UPTODATE, 1)) {
2932 return 0;
2933 }
2934
2935 if (start) {
2936 WARN_ON(start < eb->start);
2937 start_i = (start >> PAGE_CACHE_SHIFT) -
2938 (eb->start >> PAGE_CACHE_SHIFT);
2939 } else {
2940 start_i = 0;
2941 }
2942
2943 num_pages = num_extent_pages(eb->start, eb->len);
2944 for (i = start_i; i < num_pages; i++) {
2945 page = extent_buffer_page(eb, i);
2946 if (!wait) {
2947 if (!trylock_page(page))
2948 goto unlock_exit;
2949 } else {
2950 lock_page(page);
2951 }
2952 locked_pages++;
2953 if (!PageUptodate(page)) {
2954 all_uptodate = 0;
2955 }
2956 }
2957 if (all_uptodate) {
2958 if (start_i == 0)
2959 eb->flags |= EXTENT_UPTODATE;
2960		if (ret) {
2961			printk(KERN_ERR "all up to date but ret is %d\n", ret);
2962		}
2963 goto unlock_exit;
2964 }
2965
2966 for (i = start_i; i < num_pages; i++) {
2967 page = extent_buffer_page(eb, i);
2968 if (inc_all_pages)
2969 page_cache_get(page);
2970 if (!PageUptodate(page)) {
2971 if (start_i == 0)
2972 inc_all_pages = 1;
2973 ClearPageError(page);
2974 err = __extent_read_full_page(tree, page,
2975 get_extent, &bio,
2976 mirror_num);
2977 if (err) {
2978 ret = err;
2979				printk(KERN_ERR "err %d from __extent_read_full_page\n", ret);
2980 }
2981 } else {
2982 unlock_page(page);
2983 }
2984 }
2985
2986 if (bio)
2987 submit_one_bio(READ, bio, mirror_num);
2988
2989	if (ret || !wait) {
2990		if (ret)
2991			printk(KERN_ERR "ret %d wait %d returning\n", ret, wait);
2992		return ret;
2993	}
2994 for (i = start_i; i < num_pages; i++) {
2995 page = extent_buffer_page(eb, i);
2996 wait_on_page_locked(page);
2997 if (!PageUptodate(page)) {
2998			printk(KERN_ERR "page not uptodate after wait_on_page_locked\n");
2999 ret = -EIO;
3000 }
3001 }
3002 if (!ret)
3003 eb->flags |= EXTENT_UPTODATE;
3004 return ret;
3005
3006unlock_exit:
3007 i = start_i;
3008	while (locked_pages > 0) {
3009 page = extent_buffer_page(eb, i);
3010 i++;
3011 unlock_page(page);
3012 locked_pages--;
3013 }
3014 return ret;
3015}
3016EXPORT_SYMBOL(read_extent_buffer_pages);
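
/*
 * typical metadata read, sketched assuming the caller already holds a
 * reference on eb and wants to block until the IO completes:
 *
 *	err = read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0);
 *	if (!err)
 *		... eb contents are now safe to read ...
 */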
3017
3018void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3019 unsigned long start,
3020 unsigned long len)
3021{
3022 size_t cur;
3023 size_t offset;
3024 struct page *page;
3025 char *kaddr;
3026 char *dst = (char *)dstv;
3027 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3028 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3029
3030 WARN_ON(start > eb->len);
3031 WARN_ON(start + len > eb->start + eb->len);
3032
3033 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3034
3035	while (len > 0) {
3036 page = extent_buffer_page(eb, i);
3037
3038 cur = min(len, (PAGE_CACHE_SIZE - offset));
3039 kaddr = kmap_atomic(page, KM_USER1);
3040 memcpy(dst, kaddr + offset, cur);
3041 kunmap_atomic(kaddr, KM_USER1);
3042
3043 dst += cur;
3044 len -= cur;
3045 offset = 0;
3046 i++;
3047 }
3048}
3049EXPORT_SYMBOL(read_extent_buffer);
3050
3051int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3052 unsigned long min_len, char **token, char **map,
3053 unsigned long *map_start,
3054 unsigned long *map_len, int km)
3055{
3056 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3057 char *kaddr;
3058 struct page *p;
3059 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3060 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3061 unsigned long end_i = (start_offset + start + min_len - 1) >>
3062 PAGE_CACHE_SHIFT;
3063
3064 if (i != end_i)
3065 return -EINVAL;
3066
3067 if (i == 0) {
3068 offset = start_offset;
3069 *map_start = 0;
3070 } else {
3071 offset = 0;
3072 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3073 }
3074	if (start + min_len > eb->len) {
3075		printk(KERN_ERR "bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
		       eb->start, eb->len, start, min_len);
3076		WARN_ON(1);
3077	}
3078
3079 p = extent_buffer_page(eb, i);
3080 kaddr = kmap_atomic(p, km);
3081 *token = kaddr;
3082 *map = kaddr + offset;
3083 *map_len = PAGE_CACHE_SIZE - offset;
3084 return 0;
3085}
3086EXPORT_SYMBOL(map_private_extent_buffer);
3087
3088int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3089 unsigned long min_len,
3090 char **token, char **map,
3091 unsigned long *map_start,
3092 unsigned long *map_len, int km)
3093{
3094 int err;
3095 int save = 0;
3096 if (eb->map_token) {
3097 unmap_extent_buffer(eb, eb->map_token, km);
3098 eb->map_token = NULL;
3099 save = 1;
3100 }
3101 err = map_private_extent_buffer(eb, start, min_len, token, map,
3102 map_start, map_len, km);
3103 if (!err && save) {
3104 eb->map_token = *token;
3105 eb->kaddr = *map;
3106 eb->map_start = *map_start;
3107 eb->map_len = *map_len;
3108 }
3109 return err;
3110}
3111EXPORT_SYMBOL(map_extent_buffer);
3112
3113void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3114{
3115 kunmap_atomic(token, km);
3116}
3117EXPORT_SYMBOL(unmap_extent_buffer);
3118
3119int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3120 unsigned long start,
3121 unsigned long len)
3122{
3123 size_t cur;
3124 size_t offset;
3125 struct page *page;
3126 char *kaddr;
3127 char *ptr = (char *)ptrv;
3128 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3129 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3130 int ret = 0;
3131
3132 WARN_ON(start > eb->len);
3133 WARN_ON(start + len > eb->start + eb->len);
3134
3135 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3136
3137	while (len > 0) {
3138 page = extent_buffer_page(eb, i);
3139
3140 cur = min(len, (PAGE_CACHE_SIZE - offset));
3141
3142 kaddr = kmap_atomic(page, KM_USER0);
3143 ret = memcmp(ptr, kaddr + offset, cur);
3144 kunmap_atomic(kaddr, KM_USER0);
3145 if (ret)
3146 break;
3147
3148 ptr += cur;
3149 len -= cur;
3150 offset = 0;
3151 i++;
3152 }
3153 return ret;
3154}
3155EXPORT_SYMBOL(memcmp_extent_buffer);
3156
3157void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3158 unsigned long start, unsigned long len)
3159{
3160 size_t cur;
3161 size_t offset;
3162 struct page *page;
3163 char *kaddr;
3164 char *src = (char *)srcv;
3165 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3166 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3167
3168 WARN_ON(start > eb->len);
3169 WARN_ON(start + len > eb->start + eb->len);
3170
3171 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3172
3173	while (len > 0) {
3174 page = extent_buffer_page(eb, i);
3175 WARN_ON(!PageUptodate(page));
3176
3177 cur = min(len, PAGE_CACHE_SIZE - offset);
3178 kaddr = kmap_atomic(page, KM_USER1);
3179 memcpy(kaddr + offset, src, cur);
3180 kunmap_atomic(kaddr, KM_USER1);
3181
3182 src += cur;
3183 len -= cur;
3184 offset = 0;
3185 i++;
3186 }
3187}
3188EXPORT_SYMBOL(write_extent_buffer);
3189
3190void memset_extent_buffer(struct extent_buffer *eb, char c,
3191 unsigned long start, unsigned long len)
3192{
3193 size_t cur;
3194 size_t offset;
3195 struct page *page;
3196 char *kaddr;
3197 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3198 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3199
3200 WARN_ON(start > eb->len);
3201 WARN_ON(start + len > eb->start + eb->len);
3202
3203 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3204
3205	while (len > 0) {
3206 page = extent_buffer_page(eb, i);
3207 WARN_ON(!PageUptodate(page));
3208
3209 cur = min(len, PAGE_CACHE_SIZE - offset);
3210 kaddr = kmap_atomic(page, KM_USER0);
3211 memset(kaddr + offset, c, cur);
3212 kunmap_atomic(kaddr, KM_USER0);
3213
3214 len -= cur;
3215 offset = 0;
3216 i++;
3217 }
3218}
3219EXPORT_SYMBOL(memset_extent_buffer);
3220
3221void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3222 unsigned long dst_offset, unsigned long src_offset,
3223 unsigned long len)
3224{
3225 u64 dst_len = dst->len;
3226 size_t cur;
3227 size_t offset;
3228 struct page *page;
3229 char *kaddr;
3230 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3231 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3232
3233 WARN_ON(src->len != dst_len);
3234
3235 offset = (start_offset + dst_offset) &
3236 ((unsigned long)PAGE_CACHE_SIZE - 1);
3237
3238	while (len > 0) {
3239 page = extent_buffer_page(dst, i);
3240 WARN_ON(!PageUptodate(page));
3241
3242 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3243
3244 kaddr = kmap_atomic(page, KM_USER0);
3245 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3246 kunmap_atomic(kaddr, KM_USER0);
3247
3248 src_offset += cur;
3249 len -= cur;
3250 offset = 0;
3251 i++;
3252 }
3253}
3254EXPORT_SYMBOL(copy_extent_buffer);
3255
3256static void move_pages(struct page *dst_page, struct page *src_page,
3257 unsigned long dst_off, unsigned long src_off,
3258 unsigned long len)
3259{
3260 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3261 if (dst_page == src_page) {
3262 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3263 } else {
3264 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3265 char *p = dst_kaddr + dst_off + len;
3266 char *s = src_kaddr + src_off + len;
3267
3268 while (len--)
3269 *--p = *--s;
3270
3271 kunmap_atomic(src_kaddr, KM_USER1);
3272 }
3273 kunmap_atomic(dst_kaddr, KM_USER0);
3274}
3275
3276static void copy_pages(struct page *dst_page, struct page *src_page,
3277 unsigned long dst_off, unsigned long src_off,
3278 unsigned long len)
3279{
3280 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3281 char *src_kaddr;
3282
3283 if (dst_page != src_page)
3284 src_kaddr = kmap_atomic(src_page, KM_USER1);
3285 else
3286 src_kaddr = dst_kaddr;
3287
3288 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3289 kunmap_atomic(dst_kaddr, KM_USER0);
3290 if (dst_page != src_page)
3291 kunmap_atomic(src_kaddr, KM_USER1);
3292}
3293
3294void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3295 unsigned long src_offset, unsigned long len)
3296{
3297 size_t cur;
3298 size_t dst_off_in_page;
3299 size_t src_off_in_page;
3300 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3301 unsigned long dst_i;
3302 unsigned long src_i;
3303
3304	if (src_offset + len > dst->len) {
3305		printk(KERN_ERR "memcpy bogus src_offset %lu move len %lu len %lu\n",
3306		       src_offset, len, dst->len);
3307		BUG_ON(1);
3308	}
3309	if (dst_offset + len > dst->len) {
3310		printk(KERN_ERR "memcpy bogus dst_offset %lu move len %lu len %lu\n",
3311		       dst_offset, len, dst->len);
3312		BUG_ON(1);
3313	}
3314
3315	while (len > 0) {
3316 dst_off_in_page = (start_offset + dst_offset) &
3317 ((unsigned long)PAGE_CACHE_SIZE - 1);
3318 src_off_in_page = (start_offset + src_offset) &
3319 ((unsigned long)PAGE_CACHE_SIZE - 1);
3320
3321 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3322 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3323
3324 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3325 src_off_in_page));
3326 cur = min_t(unsigned long, cur,
3327 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3328
3329 copy_pages(extent_buffer_page(dst, dst_i),
3330 extent_buffer_page(dst, src_i),
3331 dst_off_in_page, src_off_in_page, cur);
3332
3333 src_offset += cur;
3334 dst_offset += cur;
3335 len -= cur;
3336 }
3337}
3338EXPORT_SYMBOL(memcpy_extent_buffer);
3339
3340void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3341 unsigned long src_offset, unsigned long len)
3342{
3343 size_t cur;
3344 size_t dst_off_in_page;
3345 size_t src_off_in_page;
3346 unsigned long dst_end = dst_offset + len - 1;
3347 unsigned long src_end = src_offset + len - 1;
3348 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3349 unsigned long dst_i;
3350 unsigned long src_i;
3351
3352	if (src_offset + len > dst->len) {
3353		printk(KERN_ERR "memmove bogus src_offset %lu move len %lu len %lu\n",
3354		       src_offset, len, dst->len);
3355		BUG_ON(1);
3356	}
3357	if (dst_offset + len > dst->len) {
3358		printk(KERN_ERR "memmove bogus dst_offset %lu move len %lu len %lu\n",
3359		       dst_offset, len, dst->len);
3360		BUG_ON(1);
3361	}
3362 if (dst_offset < src_offset) {
3363 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3364 return;
3365 }
3366	while (len > 0) {
3367 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3368 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3369
3370 dst_off_in_page = (start_offset + dst_end) &
3371 ((unsigned long)PAGE_CACHE_SIZE - 1);
3372 src_off_in_page = (start_offset + src_end) &
3373 ((unsigned long)PAGE_CACHE_SIZE - 1);
3374
3375 cur = min_t(unsigned long, len, src_off_in_page + 1);
3376 cur = min(cur, dst_off_in_page + 1);
3377 move_pages(extent_buffer_page(dst, dst_i),
3378 extent_buffer_page(dst, src_i),
3379 dst_off_in_page - cur + 1,
3380 src_off_in_page - cur + 1, cur);
3381
3382 dst_end -= cur;
3383 src_end -= cur;
3384 len -= cur;
3385 }
3386}
3387EXPORT_SYMBOL(memmove_extent_buffer);
3388
3389int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3390{
3391 u64 start = page_offset(page);
3392 struct extent_buffer *eb;
3393 int ret = 1;
3394 unsigned long i;
3395 unsigned long num_pages;
3396
3397 spin_lock(&tree->buffer_lock);
3398 eb = buffer_search(tree, start);
3399 if (!eb)
3400 goto out;
3401
3402 if (atomic_read(&eb->refs) > 1) {
3403 ret = 0;
3404 goto out;
3405 }
3406 /* at this point we can safely release the extent buffer */
3407 num_pages = num_extent_pages(eb->start, eb->len);
3408 for (i = 0; i < num_pages; i++)
3409 page_cache_release(extent_buffer_page(eb, i));
3410 rb_erase(&eb->rb_node, &tree->buffer);
3411 __free_extent_buffer(eb);
3412out:
3413 spin_unlock(&tree->buffer_lock);
3414 return ret;
3415}
3416EXPORT_SYMBOL(try_release_extent_buffer);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c9d1908a1ae3
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,248 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
20
21/*
22 * page->private values.  Every page controlled by the extent map has
23 * page->private set to EXTENT_PAGE_PRIVATE (or the head-page encoding).
24 */
25#define EXTENT_PAGE_PRIVATE 1
26#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
27
28struct extent_state;
29
30typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
31 struct bio *bio, int mirror_num);
32struct extent_io_ops {
33 int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
34 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
35 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
36 extent_submit_bio_hook_t *submit_bio_hook;
37 int (*merge_bio_hook)(struct page *page, unsigned long offset,
38 size_t size, struct bio *bio);
39 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
40 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
41 u64 start, u64 end,
42 struct extent_state *state);
43 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
44 u64 start, u64 end,
45 struct extent_state *state);
46 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
47 struct extent_state *state);
48 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
49 struct extent_state *state, int uptodate);
50 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
51 unsigned long old, unsigned long bits);
52 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
53 unsigned long old, unsigned long bits);
54 int (*write_cache_pages_lock_hook)(struct page *page);
55};
56
57struct extent_io_tree {
58 struct rb_root state;
59 struct rb_root buffer;
60 struct address_space *mapping;
61 u64 dirty_bytes;
62 spinlock_t lock;
63 spinlock_t buffer_lock;
64 struct extent_io_ops *ops;
65};
66
67struct extent_state {
68 u64 start;
69 u64 end; /* inclusive */
70 struct rb_node rb_node;
71 struct extent_io_tree *tree;
72 wait_queue_head_t wq;
73 atomic_t refs;
74 unsigned long state;
75
76 /* for use by the FS */
77 u64 private;
78
79 struct list_head leak_list;
80};
81
82struct extent_buffer {
83 u64 start;
84 unsigned long len;
85 char *map_token;
86 char *kaddr;
87 unsigned long map_start;
88 unsigned long map_len;
89 struct page *first_page;
90 atomic_t refs;
91 int flags;
92 struct list_head leak_list;
93 struct rb_node rb_node;
94 struct mutex mutex;
95};
96
97struct extent_map_tree;
98
99static inline struct extent_state *extent_state_next(struct extent_state *state)
100{
101 struct rb_node *node;
102 node = rb_next(&state->rb_node);
103 if (!node)
104 return NULL;
105 return rb_entry(node, struct extent_state, rb_node);
106}
107
108typedef struct extent_map *(get_extent_t)(struct inode *inode,
109 struct page *page,
110 size_t page_offset,
111 u64 start, u64 len,
112 int create);
113
114void extent_io_tree_init(struct extent_io_tree *tree,
115 struct address_space *mapping, gfp_t mask);
116int try_release_extent_mapping(struct extent_map_tree *map,
117 struct extent_io_tree *tree, struct page *page,
118 gfp_t mask);
119int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
120int try_release_extent_state(struct extent_map_tree *map,
121 struct extent_io_tree *tree, struct page *page,
122 gfp_t mask);
123int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
124int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
125int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
126 get_extent_t *get_extent);
127int __init extent_io_init(void);
128void extent_io_exit(void);
129
130u64 count_range_bits(struct extent_io_tree *tree,
131 u64 *start, u64 search_end,
132 u64 max_bytes, unsigned long bits);
133
134int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
135 int bits, int filled);
136int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
137 int bits, gfp_t mask);
138int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
139 int bits, int wake, int delete, gfp_t mask);
140int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
141 int bits, gfp_t mask);
142int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
143 gfp_t mask);
144int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
145 gfp_t mask);
146int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
147 gfp_t mask);
148int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
149 gfp_t mask);
150int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
151 gfp_t mask);
152int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
153 u64 end, gfp_t mask);
154int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
159 u64 *start_ret, u64 *end_ret, int bits);
160struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
161 u64 start, int bits);
162int extent_invalidatepage(struct extent_io_tree *tree,
163 struct page *page, unsigned long offset);
164int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
165 get_extent_t *get_extent,
166 struct writeback_control *wbc);
167int extent_writepages(struct extent_io_tree *tree,
168 struct address_space *mapping,
169 get_extent_t *get_extent,
170 struct writeback_control *wbc);
171int extent_readpages(struct extent_io_tree *tree,
172 struct address_space *mapping,
173 struct list_head *pages, unsigned nr_pages,
174 get_extent_t get_extent);
175int extent_prepare_write(struct extent_io_tree *tree,
176 struct inode *inode, struct page *page,
177 unsigned from, unsigned to, get_extent_t *get_extent);
178int extent_commit_write(struct extent_io_tree *tree,
179 struct inode *inode, struct page *page,
180 unsigned from, unsigned to);
181sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
182 get_extent_t *get_extent);
183int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
184int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
185int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
186void set_page_extent_mapped(struct page *page);
187
188struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
189 u64 start, unsigned long len,
190 struct page *page0,
191 gfp_t mask);
192struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
193 u64 start, unsigned long len,
194 gfp_t mask);
195void free_extent_buffer(struct extent_buffer *eb);
196int read_extent_buffer_pages(struct extent_io_tree *tree,
197 struct extent_buffer *eb, u64 start, int wait,
198 get_extent_t *get_extent, int mirror_num);
199
200static inline void extent_buffer_get(struct extent_buffer *eb)
201{
202 atomic_inc(&eb->refs);
203}
204
205int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
206 unsigned long start,
207 unsigned long len);
208void read_extent_buffer(struct extent_buffer *eb, void *dst,
209 unsigned long start,
210 unsigned long len);
211void write_extent_buffer(struct extent_buffer *eb, const void *src,
212 unsigned long start, unsigned long len);
213void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
214 unsigned long dst_offset, unsigned long src_offset,
215 unsigned long len);
216void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
217 unsigned long src_offset, unsigned long len);
218void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
219 unsigned long src_offset, unsigned long len);
220void memset_extent_buffer(struct extent_buffer *eb, char c,
221 unsigned long start, unsigned long len);
222int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
223 struct extent_buffer *eb);
224int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
225int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
226int clear_extent_buffer_dirty(struct extent_io_tree *tree,
227 struct extent_buffer *eb);
228int set_extent_buffer_dirty(struct extent_io_tree *tree,
229 struct extent_buffer *eb);
230int set_extent_buffer_uptodate(struct extent_io_tree *tree,
231 struct extent_buffer *eb);
232int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
233 struct extent_buffer *eb);
234int extent_buffer_uptodate(struct extent_io_tree *tree,
235 struct extent_buffer *eb);
236int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
237 unsigned long min_len, char **token, char **map,
238 unsigned long *map_start,
239 unsigned long *map_len, int km);
240int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
241 unsigned long min_len, char **token, char **map,
242 unsigned long *map_start,
243 unsigned long *map_len, int km);
244void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
245int release_extent_buffer_tail_pages(struct extent_buffer *eb);
246int extent_range_uptodate(struct extent_io_tree *tree,
247 u64 start, u64 end);
248#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..74b2a29880d3
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,342 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent map tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map being released
73 *
74 * Drops the reference count on @em by one and frees the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
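A minimal sketch of the allocation and refcount rules described above (caller side, GFP_NOFS assumed, error handling elided):

        struct extent_map *em;

        em = alloc_extent_map(GFP_NOFS);        /* refs == 1 */
        if (!em)
                return -ENOMEM;
        em->start = 0;                          /* caller fills in the mapping */
        em->len = 4096;
        em->block_start = EXTENT_MAP_HOLE;
        em->bdev = NULL;
        /* ... hand it to add_extent_mapping() or use it directly ... */
        free_extent_map(em);                    /* refs -> 0, struct is freed */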
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while(*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while(n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while(prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while(prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first extent that ends after 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 if (extent_map_end(prev) == next->start &&
188 prev->flags == next->flags &&
189 prev->bdev == next->bdev &&
190 ((next->block_start == EXTENT_MAP_HOLE &&
191 prev->block_start == EXTENT_MAP_HOLE) ||
192 (next->block_start == EXTENT_MAP_INLINE &&
193 prev->block_start == EXTENT_MAP_INLINE) ||
194 (next->block_start == EXTENT_MAP_DELALLOC &&
195 prev->block_start == EXTENT_MAP_DELALLOC) ||
196 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
197 next->block_start == extent_map_block_end(prev)))) {
198 return 1;
199 }
200 return 0;
201}
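As a concrete example of the test above (values hypothetical): a map covering [0, 4096) with block_start B and a map covering [4096, 8192) with block_start B + 4096, same flags and same bdev, are mergable, and add_extent_mapping() below will collapse them into one [0, 8192) mapping. Two touching holes (both block_start == EXTENT_MAP_HOLE) merge the same way, while a pinned map never merges.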
202
203/**
204 * add_extent_mapping - add new extent map to the extent tree
205 * @tree: tree to insert new map in
206 * @em: map to insert
207 *
208 * Insert @em into @tree or perform a simple forward/backward merge with
209 * existing mappings. The extent_map struct passed in will be inserted
210 * into the tree directly, with an additional reference taken, or a
211 * reference dropped if the merge attempt was successful.
212 */
213int add_extent_mapping(struct extent_map_tree *tree,
214 struct extent_map *em)
215{
216 int ret = 0;
217 struct extent_map *merge = NULL;
218 struct rb_node *rb;
219 struct extent_map *exist;
220
221 exist = lookup_extent_mapping(tree, em->start, em->len);
222 if (exist) {
223 free_extent_map(exist);
224 ret = -EEXIST;
225 goto out;
226 }
227 assert_spin_locked(&tree->lock);
228 rb = tree_insert(&tree->map, em->start, &em->rb_node);
229 if (rb) {
230 ret = -EEXIST;
231 free_extent_map(merge);
232 goto out;
233 }
234 atomic_inc(&em->refs);
235 if (em->start != 0) {
236 rb = rb_prev(&em->rb_node);
237 if (rb)
238 merge = rb_entry(rb, struct extent_map, rb_node);
239 if (rb && mergable_maps(merge, em)) {
240 em->start = merge->start;
241 em->len += merge->len;
242 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 rb_erase(&merge->rb_node, &tree->map);
245 free_extent_map(merge);
246 }
247 }
248 rb = rb_next(&em->rb_node);
249 if (rb)
250 merge = rb_entry(rb, struct extent_map, rb_node);
251 if (rb && mergable_maps(em, merge)) {
252 em->len += merge->len;
253 rb_erase(&merge->rb_node, &tree->map);
254 merge->in_tree = 0;
255 free_extent_map(merge);
256 }
257out:
258 return ret;
259}
260EXPORT_SYMBOL(add_extent_mapping);
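A caller-side sketch honoring the locking rule the function asserts (tree and em are assumed to be set up as in the allocation example above):

        spin_lock(&tree->lock);
        ret = add_extent_mapping(tree, em);     /* tree takes its own ref */
        spin_unlock(&tree->lock);
        /* whether em merged, was inserted, or -EEXIST came back, the
         * caller still owns its original reference and must drop it */
        free_extent_map(em);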
261
262/* simple helper to do math around the end of an extent, handling wrap */
263static u64 range_end(u64 start, u64 len)
264{
265 if (start + len < start)
266 return (u64)-1;
267 return start + len;
268}
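For example, with start = (u64)-4 and len = 8 the sum would wrap, so range_end() clamps to (u64)-1; lookup_extent_mapping() below relies on this, which is why callers such as btrfs_drop_extent_cache() can pass a len of (u64)-1 to mean "through the end of the address space".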
269
270/**
271 * lookup_extent_mapping - lookup extent_map
272 * @tree: tree to lookup in
273 * @start: byte offset to start the search
274 * @len: length of the lookup range
275 *
276 * Find and return the first extent_map struct in @tree that intersects the
277 * [start, start + len) range. There may be additional objects in the tree that
278 * intersect, so check the object returned carefully to make sure that no
279 * additional lookups are needed.
280 */
281struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
282 u64 start, u64 len)
283{
284 struct extent_map *em;
285 struct rb_node *rb_node;
286 struct rb_node *prev = NULL;
287 struct rb_node *next = NULL;
288 u64 end = range_end(start, len);
289
290 assert_spin_locked(&tree->lock);
291 rb_node = __tree_search(&tree->map, start, &prev, &next);
292 if (!rb_node && prev) {
293 em = rb_entry(prev, struct extent_map, rb_node);
294 if (end > em->start && start < extent_map_end(em))
295 goto found;
296 }
297 if (!rb_node && next) {
298 em = rb_entry(next, struct extent_map, rb_node);
299 if (end > em->start && start < extent_map_end(em))
300 goto found;
301 }
302 if (!rb_node) {
303 em = NULL;
304 goto out;
305 }
306 if (IS_ERR(rb_node)) {
307 em = ERR_PTR(PTR_ERR(rb_node));
308 goto out;
309 }
310 em = rb_entry(rb_node, struct extent_map, rb_node);
311 if (end > em->start && start < extent_map_end(em))
312 goto found;
313
314 em = NULL;
315 goto out;
316
317found:
318 atomic_inc(&em->refs);
319out:
320 return em;
321}
322EXPORT_SYMBOL(lookup_extent_mapping);
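A usage sketch, under the same locking rule as add_extent_mapping():

        spin_lock(&tree->lock);
        em = lookup_extent_mapping(tree, start, len);
        spin_unlock(&tree->lock);
        if (em) {
                /* [em->start, extent_map_end(em)) intersects the range;
                 * a reference was taken for us, drop it when finished */
                free_extent_map(em);
        }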
323
324/**
325 * remove_extent_mapping - removes an extent_map from the extent tree
326 * @tree: extent tree to remove from
327 * @em: extent map being removed
328 *
329 * Removes @em from @tree. No reference counts are dropped, and no checks
330 * are done to see if the range is in use.
331 */
332int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
333{
334 int ret = 0;
335
336 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
337 assert_spin_locked(&tree->lock);
338 rb_erase(&em->rb_node, &tree->map);
339 em->in_tree = 0;
340 return ret;
341}
342EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..26ac6fe0b268
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,57 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
6#define EXTENT_MAP_LAST_BYTE (u64)-4
7#define EXTENT_MAP_HOLE (u64)-3
8#define EXTENT_MAP_INLINE (u64)-2
9#define EXTENT_MAP_DELALLOC (u64)-1
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13
14struct extent_map {
15 struct rb_node rb_node;
16
17 /* all of these are in bytes */
18 u64 start;
19 u64 len;
20 u64 block_start;
21 unsigned long flags;
22 struct block_device *bdev;
23 atomic_t refs;
24 int in_tree;
25};
26
27struct extent_map_tree {
28 struct rb_root map;
29 spinlock_t lock;
30};
31
32static inline u64 extent_map_end(struct extent_map *em)
33{
34 if (em->start + em->len < em->start)
35 return (u64)-1;
36 return em->start + em->len;
37}
38
39static inline u64 extent_map_block_end(struct extent_map *em)
40{
41 if (em->block_start + em->len < em->block_start)
42 return (u64)-1;
43 return em->block_start + em->len;
44}
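The sentinel values above sit at the very top of the u64 space, so one comparison separates them from real disk byte numbers. A rough sketch of how callers interpret block_start:

        if (em->block_start == EXTENT_MAP_HOLE) {
                /* no bytes on disk back this range */
        } else if (em->block_start == EXTENT_MAP_INLINE) {
                /* the data lives inside a btree leaf */
        } else if (em->block_start == EXTENT_MAP_DELALLOC) {
                /* dirty in memory, allocation deferred to writeback */
        } else {
                /* block_start < EXTENT_MAP_LAST_BYTE: a real disk byte
                 * number, ending at extent_map_block_end(em) */
        }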
45
46void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
47struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
48 u64 start, u64 len);
49int add_extent_mapping(struct extent_map_tree *tree,
50 struct extent_map *em);
51int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
52
53struct extent_map *alloc_extent_map(gfp_t mask);
54void free_extent_map(struct extent_map *em);
55int __init extent_map_init(void);
56void extent_map_exit(void);
57#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..6dbe88b9d7d4
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,512 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
27#define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
28 sizeof(struct btrfs_item) * 2) / \
29 BTRFS_CRC32_SIZE) - 1))
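In rough numbers (assuming 4KiB leaves, so BTRFS_LEAF_DATA_SIZE(r) is a bit under 4096 bytes after the block header, and a BTRFS_CRC32_SIZE of 4): (BTRFS_LEAF_DATA_SIZE(r) - 2 * sizeof(struct btrfs_item)) / 4 - 1 works out to a little under a thousand crc32s, so one csum item can cover close to 4MB of file data at one checksum per 4KiB sector.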
30int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root,
32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset)
35{
36 int ret = 0;
37 struct btrfs_file_extent_item *item;
38 struct btrfs_key file_key;
39 struct btrfs_path *path;
40 struct extent_buffer *leaf;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44 file_key.objectid = objectid;
45 file_key.offset = pos;
46 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
47
48 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
49 sizeof(*item));
50 if (ret < 0)
51 goto out;
52 BUG_ON(ret);
53 leaf = path->nodes[0];
54 item = btrfs_item_ptr(leaf, path->slots[0],
55 struct btrfs_file_extent_item);
56 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
57 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
58 btrfs_set_file_extent_offset(leaf, item, offset);
59 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
60 btrfs_set_file_extent_generation(leaf, item, trans->transid);
61 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
62 btrfs_mark_buffer_dirty(leaf);
63out:
64 btrfs_free_path(path);
65 return ret;
66}
67
68struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
69 struct btrfs_root *root,
70 struct btrfs_path *path,
71 u64 objectid, u64 offset,
72 int cow)
73{
74 int ret;
75 struct btrfs_key file_key;
76 struct btrfs_key found_key;
77 struct btrfs_csum_item *item;
78 struct extent_buffer *leaf;
79 u64 csum_offset = 0;
80 int csums_in_item;
81
82 file_key.objectid = objectid;
83 file_key.offset = offset;
84 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
85 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
86 if (ret < 0)
87 goto fail;
88 leaf = path->nodes[0];
89 if (ret > 0) {
90 ret = 1;
91 if (path->slots[0] == 0)
92 goto fail;
93 path->slots[0]--;
94 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
95 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
96 found_key.objectid != objectid) {
97 goto fail;
98 }
99 csum_offset = (offset - found_key.offset) >>
100 root->fs_info->sb->s_blocksize_bits;
101 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
102 csums_in_item /= BTRFS_CRC32_SIZE;
103
104 if (csum_offset >= csums_in_item) {
105 ret = -EFBIG;
106 goto fail;
107 }
108 }
109 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
110 item = (struct btrfs_csum_item *)((unsigned char *)item +
111 csum_offset * BTRFS_CRC32_SIZE);
112 return item;
113fail:
114 if (ret > 0)
115 ret = -ENOENT;
116 return ERR_PTR(ret);
117}
118
119
120int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
121 struct btrfs_root *root,
122 struct btrfs_path *path, u64 objectid,
123 u64 offset, int mod)
124{
125 int ret;
126 struct btrfs_key file_key;
127 int ins_len = mod < 0 ? -1 : 0;
128 int cow = mod != 0;
129
130 file_key.objectid = objectid;
131 file_key.offset = offset;
132 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
133 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
134 return ret;
135}
136
137int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
138 struct bio *bio)
139{
140 u32 sum;
141 struct bio_vec *bvec = bio->bi_io_vec;
142 int bio_index = 0;
143 u64 offset;
144 u64 item_start_offset = 0;
145 u64 item_last_offset = 0;
146 u32 diff;
147 int ret;
148 struct btrfs_path *path;
149 struct btrfs_csum_item *item = NULL;
150 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
151
152 path = btrfs_alloc_path();
153 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
154 path->reada = 2;
155
156 WARN_ON(bio->bi_vcnt <= 0);
157
158 while(bio_index < bio->bi_vcnt) {
159 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
160 ret = btrfs_find_ordered_sum(inode, offset, &sum);
161 if (ret == 0)
162 goto found;
163
164 if (!item || offset < item_start_offset ||
165 offset >= item_last_offset) {
166 struct btrfs_key found_key;
167 u32 item_size;
168
169 if (item)
170 btrfs_release_path(root, path);
171 item = btrfs_lookup_csum(NULL, root, path,
172 inode->i_ino, offset, 0);
173 if (IS_ERR(item)) {
174 ret = PTR_ERR(item);
175 if (ret == -ENOENT || ret == -EFBIG)
176 ret = 0;
177 sum = 0;
178 printk("no csum found for inode %lu start "
179 "%llu\n", inode->i_ino,
180 (unsigned long long)offset);
181 item = NULL;
182 goto found;
183 }
184 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
185 path->slots[0]);
186
187 item_start_offset = found_key.offset;
188 item_size = btrfs_item_size_nr(path->nodes[0],
189 path->slots[0]);
190 item_last_offset = item_start_offset +
191 (item_size / BTRFS_CRC32_SIZE) *
192 root->sectorsize;
193 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
194 struct btrfs_csum_item);
195 }
196 /*
197 * this byte range must be able to fit inside
198 * a single leaf so it will also fit inside a u32
199 */
200 diff = offset - item_start_offset;
201 diff = diff / root->sectorsize;
202 diff = diff * BTRFS_CRC32_SIZE;
203
204 read_extent_buffer(path->nodes[0], &sum,
205 ((unsigned long)item) + diff,
206 BTRFS_CRC32_SIZE);
207found:
208 set_state_private(io_tree, offset, sum);
209 bio_index++;
210 bvec++;
211 }
212 btrfs_free_path(path);
213 return 0;
214}
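To make the slot arithmetic above concrete (hypothetical numbers, 4KiB sectors assumed): if the csum item starts at file offset item_start_offset = 0 and the current bvec sits at offset = 12288, then diff = (12288 / 4096) * BTRFS_CRC32_SIZE = 12 bytes into the item, i.e. the fourth crc32 stored there (index 3); that value is stashed in the io_tree's per-range private field for the read end_io path to verify against.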
215
216int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
217 struct bio *bio)
218{
219 struct btrfs_ordered_sum *sums;
220 struct btrfs_sector_sum *sector_sum;
221 struct btrfs_ordered_extent *ordered;
222 char *data;
223 struct bio_vec *bvec = bio->bi_io_vec;
224 int bio_index = 0;
225 unsigned long total_bytes = 0;
226 unsigned long this_sum_bytes = 0;
227 u64 offset;
228
229 WARN_ON(bio->bi_vcnt <= 0);
230 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
231 if (!sums)
232 return -ENOMEM;
233
234 sector_sum = sums->sums;
235 sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
236 sums->len = bio->bi_size;
237 INIT_LIST_HEAD(&sums->list);
238 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
239 BUG_ON(!ordered);
240
241 while(bio_index < bio->bi_vcnt) {
242 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
243 if (offset >= ordered->file_offset + ordered->len ||
244 offset < ordered->file_offset) {
245 unsigned long bytes_left;
246 sums->len = this_sum_bytes;
247 this_sum_bytes = 0;
248 btrfs_add_ordered_sum(inode, ordered, sums);
249 btrfs_put_ordered_extent(ordered);
250
251 bytes_left = bio->bi_size - total_bytes;
252
253 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
254 GFP_NOFS);
255 BUG_ON(!sums);
256 sector_sum = sums->sums;
257 sums->len = bytes_left;
258 sums->file_offset = offset;
259 ordered = btrfs_lookup_ordered_extent(inode,
260 sums->file_offset);
261 BUG_ON(!ordered);
262 }
263
264 data = kmap_atomic(bvec->bv_page, KM_USER0);
265 sector_sum->sum = ~(u32)0;
266 sector_sum->sum = btrfs_csum_data(root,
267 data + bvec->bv_offset,
268 sector_sum->sum,
269 bvec->bv_len);
270 kunmap_atomic(data, KM_USER0);
271 btrfs_csum_final(sector_sum->sum,
272 (char *)&sector_sum->sum);
273 sector_sum->offset = page_offset(bvec->bv_page) +
274 bvec->bv_offset;
275
276 sector_sum++;
277 bio_index++;
278 total_bytes += bvec->bv_len;
279 this_sum_bytes += bvec->bv_len;
280 bvec++;
281 }
282 this_sum_bytes = 0;
283 btrfs_add_ordered_sum(inode, ordered, sums);
284 btrfs_put_ordered_extent(ordered);
285 return 0;
286}
287
288int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
289 struct btrfs_root *root, struct inode *inode,
290 struct btrfs_ordered_sum *sums)
291{
292 u64 objectid = inode->i_ino;
293 u64 offset;
294 int ret;
295 struct btrfs_key file_key;
296 struct btrfs_key found_key;
297 u64 next_offset;
298 u64 total_bytes = 0;
299 int found_next;
300 struct btrfs_path *path;
301 struct btrfs_csum_item *item;
302 struct btrfs_csum_item *item_end;
303 struct extent_buffer *leaf = NULL;
304 u64 csum_offset;
305 struct btrfs_sector_sum *sector_sum;
306 u32 nritems;
307 u32 ins_size;
308 char *eb_map;
309 char *eb_token;
310 unsigned long map_len;
311 unsigned long map_start;
312
313 path = btrfs_alloc_path();
314 BUG_ON(!path);
315 sector_sum = sums->sums;
316again:
317 next_offset = (u64)-1;
318 found_next = 0;
319 offset = sector_sum->offset;
320 file_key.objectid = objectid;
321 file_key.offset = offset;
322 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
323
324 mutex_lock(&BTRFS_I(inode)->csum_mutex);
325 item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
326 if (!IS_ERR(item)) {
327 leaf = path->nodes[0];
328 ret = 0;
329 goto found;
330 }
331 ret = PTR_ERR(item);
332 if (ret == -EFBIG) {
333 u32 item_size;
334 /* we found one, but it isn't big enough yet */
335 leaf = path->nodes[0];
336 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
337 if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
338 /* already at max size, make a new one */
339 goto insert;
340 }
341 } else {
342 int slot = path->slots[0] + 1;
343 /* we didn't find a csum item, insert one */
344 nritems = btrfs_header_nritems(path->nodes[0]);
345 if (path->slots[0] >= nritems - 1) {
346 ret = btrfs_next_leaf(root, path);
347 if (ret == 1)
348 found_next = 1;
349 if (ret != 0)
350 goto insert;
351 slot = 0;
352 }
353 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
354 if (found_key.objectid != objectid ||
355 found_key.type != BTRFS_CSUM_ITEM_KEY) {
356 found_next = 1;
357 goto insert;
358 }
359 next_offset = found_key.offset;
360 found_next = 1;
361 goto insert;
362 }
363
364 /*
365 * at this point, we know the tree has an item, but it isn't big
366 * enough yet to put our csum in. Grow it
367 */
368 btrfs_release_path(root, path);
369 ret = btrfs_search_slot(trans, root, &file_key, path,
370 BTRFS_CRC32_SIZE, 1);
371 if (ret < 0)
372 goto fail_unlock;
373 if (ret == 0) {
374 BUG();
375 }
376 if (path->slots[0] == 0) {
377 goto insert;
378 }
379 path->slots[0]--;
380 leaf = path->nodes[0];
381 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
382 csum_offset = (offset - found_key.offset) >>
383 root->fs_info->sb->s_blocksize_bits;
384 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
385 found_key.objectid != objectid ||
386 csum_offset >= MAX_CSUM_ITEMS(root)) {
387 goto insert;
388 }
389 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
390 BTRFS_CRC32_SIZE) {
391 u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
392 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
393 if (diff != BTRFS_CRC32_SIZE)
394 goto insert;
395 ret = btrfs_extend_item(trans, root, path, diff);
396 BUG_ON(ret);
397 goto csum;
398 }
399
400insert:
401 btrfs_release_path(root, path);
402 csum_offset = 0;
403 if (found_next) {
404 u64 tmp = min((u64)i_size_read(inode), next_offset);
405 tmp -= offset & ~((u64)root->sectorsize -1);
406 tmp >>= root->fs_info->sb->s_blocksize_bits;
407 tmp = max((u64)1, tmp);
408 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
409 ins_size = BTRFS_CRC32_SIZE * tmp;
410 } else {
411 ins_size = BTRFS_CRC32_SIZE;
412 }
413 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
414 ins_size);
415 if (ret < 0)
416 goto fail_unlock;
417 if (ret != 0) {
418 WARN_ON(1);
419 goto fail_unlock;
420 }
421csum:
422 leaf = path->nodes[0];
423 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
424 ret = 0;
425 item = (struct btrfs_csum_item *)((unsigned char *)item +
426 csum_offset * BTRFS_CRC32_SIZE);
427found:
428 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
429 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
430 btrfs_item_size_nr(leaf, path->slots[0]));
431 eb_token = NULL;
432 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
433 cond_resched();
434next_sector:
435
436 if (!eb_token ||
437 (unsigned long)item + BTRFS_CRC32_SIZE >= map_start + map_len) {
438 int err;
439
440 if (eb_token)
441 unmap_extent_buffer(leaf, eb_token, KM_USER1);
442 eb_token = NULL;
443 err = map_private_extent_buffer(leaf, (unsigned long)item,
444 BTRFS_CRC32_SIZE,
445 &eb_token, &eb_map,
446 &map_start, &map_len, KM_USER1);
447 if (err)
448 eb_token = NULL;
449 }
450 if (eb_token) {
451 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
452 &sector_sum->sum, BTRFS_CRC32_SIZE);
453 } else {
454 write_extent_buffer(leaf, &sector_sum->sum,
455 (unsigned long)item, BTRFS_CRC32_SIZE);
456 }
457
458 total_bytes += root->sectorsize;
459 sector_sum++;
460 if (total_bytes < sums->len) {
461 item = (struct btrfs_csum_item *)((char *)item +
462 BTRFS_CRC32_SIZE);
463 if (item < item_end && offset + PAGE_CACHE_SIZE ==
464 sector_sum->offset) {
465 offset = sector_sum->offset;
466 goto next_sector;
467 }
468 }
469 if (eb_token) {
470 unmap_extent_buffer(leaf, eb_token, KM_USER1);
471 eb_token = NULL;
472 }
473 btrfs_mark_buffer_dirty(path->nodes[0]);
474 cond_resched();
475 if (total_bytes < sums->len) {
476 btrfs_release_path(root, path);
477 goto again;
478 }
479out:
480 btrfs_free_path(path);
481 return ret;
482
483fail_unlock:
484 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
485 goto out;
486}
487
488int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
489 struct btrfs_root *root, struct btrfs_path *path,
490 u64 isize)
491{
492 struct btrfs_key key;
493 struct extent_buffer *leaf = path->nodes[0];
494 int slot = path->slots[0];
495 int ret;
496 u32 new_item_size;
497 u64 new_item_span;
498 u64 blocks;
499
500 btrfs_item_key_to_cpu(leaf, &key, slot);
501 if (isize <= key.offset)
502 return 0;
503 new_item_span = isize - key.offset;
504 blocks = (new_item_span + root->sectorsize - 1) >>
505 root->fs_info->sb->s_blocksize_bits;
506 new_item_size = blocks * BTRFS_CRC32_SIZE;
507 if (new_item_size >= btrfs_item_size_nr(leaf, slot))
508 return 0;
509 ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
510 BUG_ON(ret);
511 return ret;
512}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..69abbe19add2
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1178 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user * buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* PageChecked is some magic used to find pages that have been
88 * modified without going through btrfs_set_page_dirty;
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make
100 * room; otherwise a new inline extent item is inserted into the btree.
101 */
102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *inode,
104 u64 offset, size_t size,
105 struct page **pages, size_t page_offset,
106 int num_pages)
107{
108 struct btrfs_key key;
109 struct btrfs_path *path;
110 struct extent_buffer *leaf;
111 char *kaddr;
112 unsigned long ptr;
113 struct btrfs_file_extent_item *ei;
114 struct page *page;
115 u32 datasize;
116 int err = 0;
117 int ret;
118 int i;
119 ssize_t cur_size;
120
121 path = btrfs_alloc_path();
122 if (!path)
123 return -ENOMEM;
124
125 btrfs_set_trans_block_group(trans, inode);
126
127 key.objectid = inode->i_ino;
128 key.offset = offset;
129 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
130
131 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
132 if (ret < 0) {
133 err = ret;
134 goto fail;
135 }
136 if (ret == 1) {
137 struct btrfs_key found_key;
138
139 if (path->slots[0] == 0)
140 goto insert;
141
142 path->slots[0]--;
143 leaf = path->nodes[0];
144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
145
146 if (found_key.objectid != inode->i_ino)
147 goto insert;
148
149 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
150 goto insert;
151 ei = btrfs_item_ptr(leaf, path->slots[0],
152 struct btrfs_file_extent_item);
153
154 if (btrfs_file_extent_type(leaf, ei) !=
155 BTRFS_FILE_EXTENT_INLINE) {
156 goto insert;
157 }
158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
159 ret = 0;
160 }
161 if (ret == 0) {
162 u32 found_size;
163 u64 found_end;
164
165 leaf = path->nodes[0];
166 ei = btrfs_item_ptr(leaf, path->slots[0],
167 struct btrfs_file_extent_item);
168
169 if (btrfs_file_extent_type(leaf, ei) !=
170 BTRFS_FILE_EXTENT_INLINE) {
171 err = ret;
172 btrfs_print_leaf(root, leaf);
173 printk("found wasn't inline offset %Lu inode %lu\n",
174 offset, inode->i_ino);
175 goto fail;
176 }
177 found_size = btrfs_file_extent_inline_len(leaf,
178 btrfs_item_nr(leaf, path->slots[0]));
179 found_end = key.offset + found_size;
180
181 if (found_end < offset + size) {
182 btrfs_release_path(root, path);
183 ret = btrfs_search_slot(trans, root, &key, path,
184 offset + size - found_end, 1);
185 BUG_ON(ret != 0);
186
187 ret = btrfs_extend_item(trans, root, path,
188 offset + size - found_end);
189 if (ret) {
190 err = ret;
191 goto fail;
192 }
193 leaf = path->nodes[0];
194 ei = btrfs_item_ptr(leaf, path->slots[0],
195 struct btrfs_file_extent_item);
196 inode_add_bytes(inode, offset + size - found_end);
197 }
198 if (found_end < offset) {
199 ptr = btrfs_file_extent_inline_start(ei) + found_size;
200 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
201 }
202 } else {
203insert:
204 btrfs_release_path(root, path);
205 datasize = offset + size - key.offset;
206 inode_add_bytes(inode, datasize);
207 datasize = btrfs_file_extent_calc_inline_size(datasize);
208 ret = btrfs_insert_empty_item(trans, root, path, &key,
209 datasize);
210 if (ret) {
211 err = ret;
212 printk("got bad ret %d\n", ret);
213 goto fail;
214 }
215 leaf = path->nodes[0];
216 ei = btrfs_item_ptr(leaf, path->slots[0],
217 struct btrfs_file_extent_item);
218 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
219 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
220 }
221 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222
223 cur_size = size;
224 i = 0;
225 while (size > 0) {
226 page = pages[i];
227 kaddr = kmap_atomic(page, KM_USER0);
228 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
230 kunmap_atomic(kaddr, KM_USER0);
231 page_offset = 0;
232 ptr += cur_size;
233 size -= cur_size;
234 if (i >= num_pages) {
235 printk("i %d num_pages %d\n", i, num_pages);
236 }
237 i++;
238 }
239 btrfs_mark_buffer_dirty(leaf);
240fail:
241 btrfs_free_path(path);
242 return err;
243}
244
245/*
246 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of
248 * any next extents (if required).
249 *
250 * this also makes the decision about creating an inline extent vs
251 * doing real data extents, marking pages dirty and delalloc as required.
252 */
253static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
254 struct btrfs_root *root,
255 struct file *file,
256 struct page **pages,
257 size_t num_pages,
258 loff_t pos,
259 size_t write_bytes)
260{
261 int err = 0;
262 int i;
263 struct inode *inode = fdentry(file)->d_inode;
264 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
265 u64 hint_byte;
266 u64 num_bytes;
267 u64 start_pos;
268 u64 end_of_last_block;
269 u64 end_pos = pos + write_bytes;
270 u64 inline_size;
271 int did_inline = 0;
272 loff_t isize = i_size_read(inode);
273
274 start_pos = pos & ~((u64)root->sectorsize - 1);
275 num_bytes = (write_bytes + pos - start_pos +
276 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
277
278 end_of_last_block = start_pos + num_bytes - 1;
279
280 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
281 trans = btrfs_join_transaction(root, 1);
282 if (!trans) {
283 err = -ENOMEM;
284 goto out_unlock;
285 }
286 btrfs_set_trans_block_group(trans, inode);
287 hint_byte = 0;
288
289 if ((end_of_last_block & 4095) == 0) {
290 printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
291 }
292 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
293
294 /* FIXME...EIEIO, ENOSPC and more */
295 /* insert any holes we need to create */
296 if (isize < start_pos) {
297 u64 last_pos_in_file;
298 u64 hole_size;
299 u64 mask = root->sectorsize - 1;
300 last_pos_in_file = (isize + mask) & ~mask;
301 hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
302 if (hole_size > 0) {
303 btrfs_wait_ordered_range(inode, last_pos_in_file,
304 last_pos_in_file + hole_size);
305 mutex_lock(&BTRFS_I(inode)->extent_mutex);
306 err = btrfs_drop_extents(trans, root, inode,
307 last_pos_in_file,
308 last_pos_in_file + hole_size,
309 last_pos_in_file,
310 &hint_byte);
311 if (err)
312 goto failed;
313
314 err = btrfs_insert_file_extent(trans, root,
315 inode->i_ino,
316 last_pos_in_file,
317 0, 0, hole_size, 0);
318 btrfs_drop_extent_cache(inode, last_pos_in_file,
319 last_pos_in_file + hole_size - 1, 0);
320 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
321 btrfs_check_file(root, inode);
322 }
323 if (err)
324 goto failed;
325 }
326
327 /*
328 * either allocate an extent for the new bytes or set up the key
329 * to show we are doing inline data in the extent
330 */
331 inline_size = end_pos;
332 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
333 inline_size > root->fs_info->max_inline ||
334 (inline_size & (root->sectorsize -1)) == 0 ||
335 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
336 /* check for reserved extents on each page, we don't want
337 * to reset the delalloc bit on things that already have
338 * extents reserved.
339 */
340 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 for (i = 0; i < num_pages; i++) {
342 struct page *p = pages[i];
343 SetPageUptodate(p);
344 ClearPageChecked(p);
345 set_page_dirty(p);
346 }
347 } else {
348 u64 aligned_end;
349 /* step one, delete the existing extents in this range */
350 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
351 ~((u64)root->sectorsize - 1);
352 mutex_lock(&BTRFS_I(inode)->extent_mutex);
353 err = btrfs_drop_extents(trans, root, inode, start_pos,
354 aligned_end, aligned_end, &hint_byte);
355 if (err)
356 goto failed;
357 if (isize > inline_size)
358 inline_size = min_t(u64, isize, aligned_end);
359 inline_size -= start_pos;
360 err = insert_inline_extent(trans, root, inode, start_pos,
361 inline_size, pages, 0, num_pages);
362 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363 BUG_ON(err);
364 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365
366 /*
367 * an ugly way to do all the prop accounting around
368 * the page bits and mapping tags
369 */
370 set_page_writeback(pages[0]);
371 end_page_writeback(pages[0]);
372 did_inline = 1;
373 }
374 if (end_pos > isize) {
375 i_size_write(inode, end_pos);
376 if (did_inline)
377 BTRFS_I(inode)->disk_i_size = end_pos;
378 btrfs_update_inode(trans, root, inode);
379 }
380failed:
381 err = btrfs_end_transaction(trans, root);
382out_unlock:
383 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
384 return err;
385}
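Restating the inline-vs-extent decision above: data is written inline only when the resulting end of data (end_pos) stays below both the mount's max_inline limit and the structural BTRFS_MAX_INLINE_DATA_SIZE(root), does not land exactly on a sector boundary, and the file's current i_size is itself still small enough to live inline; every other write goes through the delalloc path, where pages are only marked dirty here and real extents are allocated later at writeback time.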
386
387/*
388 * this drops all the extents in the cache that intersect the range
389 * [start, end]. Existing extents are split as required.
390 */
391int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
392 int skip_pinned)
393{
394 struct extent_map *em;
395 struct extent_map *split = NULL;
396 struct extent_map *split2 = NULL;
397 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
398 u64 len = end - start + 1;
399 int ret;
400 int testend = 1;
401 unsigned long flags;
402
403 WARN_ON(end < start);
404 if (end == (u64)-1) {
405 len = (u64)-1;
406 testend = 0;
407 }
408 while(1) {
409 if (!split)
410 split = alloc_extent_map(GFP_NOFS);
411 if (!split2)
412 split2 = alloc_extent_map(GFP_NOFS);
413
414 spin_lock(&em_tree->lock);
415 em = lookup_extent_mapping(em_tree, start, len);
416 if (!em) {
417 spin_unlock(&em_tree->lock);
418 break;
419 }
420 flags = em->flags;
421 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
422 spin_unlock(&em_tree->lock);
423 if (em->start <= start &&
424 (!testend || em->start + em->len >= start + len)) {
425 free_extent_map(em);
426 break;
427 }
428 if (start < em->start) {
429 len = em->start - start;
430 } else {
431 len = start + len - (em->start + em->len);
432 start = em->start + em->len;
433 }
434 free_extent_map(em);
435 continue;
436 }
437 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438 remove_extent_mapping(em_tree, em);
439
440 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
441 em->start < start) {
442 split->start = em->start;
443 split->len = start - em->start;
444 split->block_start = em->block_start;
445 split->bdev = em->bdev;
446 split->flags = flags;
447 ret = add_extent_mapping(em_tree, split);
448 BUG_ON(ret);
449 free_extent_map(split);
450 split = split2;
451 split2 = NULL;
452 }
453 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
454 testend && em->start + em->len > start + len) {
455 u64 diff = start + len - em->start;
456
457 split->start = start + len;
458 split->len = em->start + em->len - (start + len);
459 split->bdev = em->bdev;
460 split->flags = flags;
461
462 split->block_start = em->block_start + diff;
463
464 ret = add_extent_mapping(em_tree, split);
465 BUG_ON(ret);
466 free_extent_map(split);
467 split = NULL;
468 }
469 spin_unlock(&em_tree->lock);
470
471 /* once for us */
472 free_extent_map(em);
473 /* once for the tree */
474 free_extent_map(em);
475 }
476 if (split)
477 free_extent_map(split);
478 if (split2)
479 free_extent_map(split2);
480 return 0;
481}
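For example (hypothetical values): if the cache holds one map for [0, 12288) with block_start B, then btrfs_drop_extent_cache(inode, 4096, 8191, 0) removes it and re-inserts two splits, [0, 4096) keeping block_start B and [8192, 12288) with block_start B + 8192, leaving the dropped middle range unmapped.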
482
483int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
484{
485 return 0;
486#if 0
487 struct btrfs_path *path;
488 struct btrfs_key found_key;
489 struct extent_buffer *leaf;
490 struct btrfs_file_extent_item *extent;
491 u64 last_offset = 0;
492 int nritems;
493 int slot;
494 int found_type;
495 int ret;
496 int err = 0;
497 u64 extent_end = 0;
498
499 path = btrfs_alloc_path();
500 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
501 last_offset, 0);
502 while(1) {
503 nritems = btrfs_header_nritems(path->nodes[0]);
504 if (path->slots[0] >= nritems) {
505 ret = btrfs_next_leaf(root, path);
506 if (ret)
507 goto out;
508 nritems = btrfs_header_nritems(path->nodes[0]);
509 }
510 slot = path->slots[0];
511 leaf = path->nodes[0];
512 btrfs_item_key_to_cpu(leaf, &found_key, slot);
513 if (found_key.objectid != inode->i_ino)
514 break;
515 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
516 goto out;
517
518 if (found_key.offset < last_offset) {
519 WARN_ON(1);
520 btrfs_print_leaf(root, leaf);
521 printk("inode %lu found offset %Lu expected %Lu\n",
522 inode->i_ino, found_key.offset, last_offset);
523 err = 1;
524 goto out;
525 }
526 extent = btrfs_item_ptr(leaf, slot,
527 struct btrfs_file_extent_item);
528 found_type = btrfs_file_extent_type(leaf, extent);
529 if (found_type == BTRFS_FILE_EXTENT_REG) {
530 extent_end = found_key.offset +
531 btrfs_file_extent_num_bytes(leaf, extent);
532 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
533 struct btrfs_item *item;
534 item = btrfs_item_nr(leaf, slot);
535 extent_end = found_key.offset +
536 btrfs_file_extent_inline_len(leaf, item);
537 extent_end = (extent_end + root->sectorsize - 1) &
538 ~((u64)root->sectorsize -1 );
539 }
540 last_offset = extent_end;
541 path->slots[0]++;
542 }
543 if (0 && last_offset < inode->i_size) {
544 WARN_ON(1);
545 btrfs_print_leaf(root, leaf);
546 printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
547 last_offset, inode->i_size);
548 err = 1;
549
550 }
551out:
552 btrfs_free_path(path);
553 return err;
554#endif
555}
556
557/*
558 * this is very complex, but the basic idea is to drop all extents
559 * in the range start - end. hint_byte is filled in with a disk byte number
560 * that would be a good hint to the block allocator for this file.
561 *
562 * If an extent intersects the range but is not entirely inside the range
563 * it is either truncated or split. Anything entirely inside the range
564 * is deleted from the tree.
565 *
566 * inline_limit is used to tell this code which offsets in the file to keep
567 * if they contain inline extents.
568 */
569int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
570 struct btrfs_root *root, struct inode *inode,
571 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
572{
573 u64 extent_end = 0;
574 u64 search_start = start;
575 u64 leaf_start;
576 u64 root_gen;
577 u64 root_owner;
578 struct extent_buffer *leaf;
579 struct btrfs_file_extent_item *extent;
580 struct btrfs_path *path;
581 struct btrfs_key key;
582 struct btrfs_file_extent_item old;
583 int keep;
584 int slot;
585 int bookend;
586 int found_type;
587 int found_extent;
588 int found_inline;
589 int recow;
590 int ret;
591
592 btrfs_drop_extent_cache(inode, start, end - 1, 0);
593
594 path = btrfs_alloc_path();
595 if (!path)
596 return -ENOMEM;
597 while(1) {
598 recow = 0;
599 btrfs_release_path(root, path);
600 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
601 search_start, -1);
602 if (ret < 0)
603 goto out;
604 if (ret > 0) {
605 if (path->slots[0] == 0) {
606 ret = 0;
607 goto out;
608 }
609 path->slots[0]--;
610 }
611next_slot:
612 keep = 0;
613 bookend = 0;
614 found_extent = 0;
615 found_inline = 0;
616 leaf_start = 0;
617 root_gen = 0;
618 root_owner = 0;
619 extent = NULL;
620 leaf = path->nodes[0];
621 slot = path->slots[0];
622 ret = 0;
623 btrfs_item_key_to_cpu(leaf, &key, slot);
624 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
625 key.offset >= end) {
626 goto out;
627 }
628 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
629 key.objectid != inode->i_ino) {
630 goto out;
631 }
632 if (recow) {
633 search_start = key.offset;
634 continue;
635 }
636 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
637 extent = btrfs_item_ptr(leaf, slot,
638 struct btrfs_file_extent_item);
639 found_type = btrfs_file_extent_type(leaf, extent);
640 if (found_type == BTRFS_FILE_EXTENT_REG) {
641 extent_end =
642 btrfs_file_extent_disk_bytenr(leaf,
643 extent);
644 if (extent_end)
645 *hint_byte = extent_end;
646
647 extent_end = key.offset +
648 btrfs_file_extent_num_bytes(leaf, extent);
649 found_extent = 1;
650 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 struct btrfs_item *item;
652 item = btrfs_item_nr(leaf, slot);
653 found_inline = 1;
654 extent_end = key.offset +
655 btrfs_file_extent_inline_len(leaf, item);
656 }
657 } else {
658 extent_end = search_start;
659 }
660
661 /* we found nothing we can drop */
662 if ((!found_extent && !found_inline) ||
663 search_start >= extent_end) {
664 int nextret;
665 u32 nritems;
666 nritems = btrfs_header_nritems(leaf);
667 if (slot >= nritems - 1) {
668 nextret = btrfs_next_leaf(root, path);
669 if (nextret)
670 goto out;
671 recow = 1;
672 } else {
673 path->slots[0]++;
674 }
675 goto next_slot;
676 }
677
678 if (found_inline) {
679 u64 mask = root->sectorsize - 1;
680 search_start = (extent_end + mask) & ~mask;
681 } else
682 search_start = extent_end;
683 if (end <= extent_end && start >= key.offset && found_inline) {
684 *hint_byte = EXTENT_MAP_INLINE;
685 goto out;
686 }
687
688 if (found_extent) {
689 read_extent_buffer(leaf, &old, (unsigned long)extent,
690 sizeof(old));
691 root_gen = btrfs_header_generation(leaf);
692 root_owner = btrfs_header_owner(leaf);
693 leaf_start = leaf->start;
694 }
695
696 if (end < extent_end && end >= key.offset) {
697 bookend = 1;
698 if (found_inline && start <= key.offset)
699 keep = 1;
700 }
701 /* truncate existing extent */
702 if (start > key.offset) {
703 u64 new_num;
704 u64 old_num;
705 keep = 1;
706 WARN_ON(start & (root->sectorsize - 1));
707 if (found_extent) {
708 new_num = start - key.offset;
709 old_num = btrfs_file_extent_num_bytes(leaf,
710 extent);
711 *hint_byte =
712 btrfs_file_extent_disk_bytenr(leaf,
713 extent);
714 if (btrfs_file_extent_disk_bytenr(leaf,
715 extent)) {
716 inode_sub_bytes(inode, old_num -
717 new_num);
718 }
719 btrfs_set_file_extent_num_bytes(leaf, extent,
720 new_num);
721 btrfs_mark_buffer_dirty(leaf);
722 } else if (key.offset < inline_limit &&
723 (end > extent_end) &&
724 (inline_limit < extent_end)) {
725 u32 new_size;
726 new_size = btrfs_file_extent_calc_inline_size(
727 inline_limit - key.offset);
728 inode_sub_bytes(inode, extent_end -
729 inline_limit);
730 btrfs_truncate_item(trans, root, path,
731 new_size, 1);
732 }
733 }
734 /* delete the entire extent */
735 if (!keep) {
736 if (found_inline)
737 inode_sub_bytes(inode, extent_end -
738 key.offset);
739 ret = btrfs_del_item(trans, root, path);
740 /* TODO update progress marker and return */
741 BUG_ON(ret);
742 extent = NULL;
743 btrfs_release_path(root, path);
744 /* the extent will be freed later */
745 }
746 if (bookend && found_inline && start <= key.offset) {
747 u32 new_size;
748 new_size = btrfs_file_extent_calc_inline_size(
749 extent_end - end);
750 inode_sub_bytes(inode, end - key.offset);
751 ret = btrfs_truncate_item(trans, root, path,
752 new_size, 0);
753 BUG_ON(ret);
754 }
755 /* create bookend, splitting the extent in two */
756 if (bookend && found_extent) {
757 u64 disk_bytenr;
758 struct btrfs_key ins;
759 ins.objectid = inode->i_ino;
760 ins.offset = end;
761 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
762 btrfs_release_path(root, path);
763 ret = btrfs_insert_empty_item(trans, root, path, &ins,
764 sizeof(*extent));
765 BUG_ON(ret);
766
767 leaf = path->nodes[0];
768 extent = btrfs_item_ptr(leaf, path->slots[0],
769 struct btrfs_file_extent_item);
770 write_extent_buffer(leaf, &old,
771 (unsigned long)extent, sizeof(old));
772
773 btrfs_set_file_extent_offset(leaf, extent,
774 le64_to_cpu(old.offset) + end - key.offset);
775 WARN_ON(le64_to_cpu(old.num_bytes) <
776 (extent_end - end));
777 btrfs_set_file_extent_num_bytes(leaf, extent,
778 extent_end - end);
779 btrfs_set_file_extent_type(leaf, extent,
780 BTRFS_FILE_EXTENT_REG);
781
782 btrfs_mark_buffer_dirty(path->nodes[0]);
783
784 disk_bytenr = le64_to_cpu(old.disk_bytenr);
785 if (disk_bytenr != 0) {
786 ret = btrfs_inc_extent_ref(trans, root,
787 disk_bytenr,
788 le64_to_cpu(old.disk_num_bytes),
789 leaf->start,
790 root->root_key.objectid,
791 trans->transid, ins.objectid);
792 BUG_ON(ret);
793 }
794 btrfs_release_path(root, path);
795 if (disk_bytenr != 0) {
796 inode_add_bytes(inode, extent_end - end);
797 }
798 }
799
800 if (found_extent && !keep) {
801 u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
802
803 if (disk_bytenr != 0) {
804 inode_sub_bytes(inode,
805 le64_to_cpu(old.num_bytes));
806 ret = btrfs_free_extent(trans, root,
807 disk_bytenr,
808 le64_to_cpu(old.disk_num_bytes),
809 leaf_start, root_owner,
810 root_gen, key.objectid, 0);
811 BUG_ON(ret);
812 *hint_byte = disk_bytenr;
813 }
814 }
815
816 if (search_start >= end) {
817 ret = 0;
818 goto out;
819 }
820 }
821out:
822 btrfs_free_path(path);
823 btrfs_check_file(root, inode);
824 return ret;
825}
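A worked example of the bookend case (hypothetical values): dropping [4096, 8192) from one on-disk extent item at key.offset 0 with num_bytes 12288 first truncates that item to num_bytes 4096, then inserts a new item at file offset 8192 pointing into the same disk extent with its file_extent_offset advanced by 8192 and num_bytes 4096, and bumps the reference count on the shared disk extent so the two items can later be freed independently.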
826
827/*
828 * this gets pages into the page cache and locks them down; it also properly
829 * waits for data=ordered extents to finish before allowing the pages to be
830 * modified.
831 */
832static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
833 struct page **pages, size_t num_pages,
834 loff_t pos, unsigned long first_index,
835 unsigned long last_index, size_t write_bytes)
836{
837 int i;
838 unsigned long index = pos >> PAGE_CACHE_SHIFT;
839 struct inode *inode = fdentry(file)->d_inode;
840 int err = 0;
841 u64 start_pos;
842 u64 last_pos;
843
844 start_pos = pos & ~((u64)root->sectorsize - 1);
845 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
846
847 memset(pages, 0, num_pages * sizeof(struct page *));
848again:
849 for (i = 0; i < num_pages; i++) {
850 pages[i] = grab_cache_page(inode->i_mapping, index + i);
851 if (!pages[i]) {
852 err = -ENOMEM;
853 BUG_ON(1);
854 }
855 wait_on_page_writeback(pages[i]);
856 }
857 if (start_pos < inode->i_size) {
858 struct btrfs_ordered_extent *ordered;
859 lock_extent(&BTRFS_I(inode)->io_tree,
860 start_pos, last_pos - 1, GFP_NOFS);
861 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
862 if (ordered &&
863 ordered->file_offset + ordered->len > start_pos &&
864 ordered->file_offset < last_pos) {
865 btrfs_put_ordered_extent(ordered);
866 unlock_extent(&BTRFS_I(inode)->io_tree,
867 start_pos, last_pos - 1, GFP_NOFS);
868 for (i = 0; i < num_pages; i++) {
869 unlock_page(pages[i]);
870 page_cache_release(pages[i]);
871 }
872 btrfs_wait_ordered_range(inode, start_pos,
873 last_pos - start_pos);
874 goto again;
875 }
876 if (ordered)
877 btrfs_put_ordered_extent(ordered);
878
879 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
880 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
881 GFP_NOFS);
882 unlock_extent(&BTRFS_I(inode)->io_tree,
883 start_pos, last_pos - 1, GFP_NOFS);
884 }
885 for (i = 0; i < num_pages; i++) {
886 clear_page_dirty_for_io(pages[i]);
887 set_page_extent_mapped(pages[i]);
888 WARN_ON(!PageLocked(pages[i]));
889 }
890 return 0;
891}
892
893static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
894 size_t count, loff_t *ppos)
895{
896 loff_t pos;
897 loff_t start_pos;
898 ssize_t num_written = 0;
899 ssize_t err = 0;
900 int ret = 0;
901 struct inode *inode = fdentry(file)->d_inode;
902 struct btrfs_root *root = BTRFS_I(inode)->root;
903 struct page **pages = NULL;
904 int nrptrs;
905 struct page *pinned[2];
906 unsigned long first_index;
907 unsigned long last_index;
908 int will_write;
909
910 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
911 (file->f_flags & O_DIRECT));
912
913 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
914 PAGE_CACHE_SIZE / (sizeof(struct page *)));
915 pinned[0] = NULL;
916 pinned[1] = NULL;
917
918 pos = *ppos;
919 start_pos = pos;
920
921 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
922 current->backing_dev_info = inode->i_mapping->backing_dev_info;
923 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
924 if (err)
925 goto out_nolock;
926 if (count == 0)
927 goto out_nolock;
928
929 err = file_remove_suid(file);
930 if (err)
931 goto out_nolock;
932 file_update_time(file);
933
934 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
935
936 mutex_lock(&inode->i_mutex);
937 first_index = pos >> PAGE_CACHE_SHIFT;
938 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
939
940 /*
941 * if this is a nodatasum mount, force summing off for the inode
942 * all the time. That way a later mount with summing on won't
943 * get confused
944 */
945 if (btrfs_test_opt(root, NODATASUM))
946 btrfs_set_flag(inode, NODATASUM);
947
948 /*
949 * there are lots of better ways to do this, but this code
950 * makes sure the first and last page in the file range are
951 * up to date and ready for cow
952 */
953 if ((pos & (PAGE_CACHE_SIZE - 1))) {
954 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
955 if (!PageUptodate(pinned[0])) {
956 ret = btrfs_readpage(NULL, pinned[0]);
957 BUG_ON(ret);
958 wait_on_page_locked(pinned[0]);
959 } else {
960 unlock_page(pinned[0]);
961 }
962 }
963 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
964 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
965 if (!PageUptodate(pinned[1])) {
966 ret = btrfs_readpage(NULL, pinned[1]);
967 BUG_ON(ret);
968 wait_on_page_locked(pinned[1]);
969 } else {
970 unlock_page(pinned[1]);
971 }
972 }
973
974 	while (count > 0) {
975 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
976 size_t write_bytes = min(count, nrptrs *
977 (size_t)PAGE_CACHE_SIZE -
978 offset);
979 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
980 PAGE_CACHE_SHIFT;
981
982 WARN_ON(num_pages > nrptrs);
983 		memset(pages, 0, num_pages * sizeof(struct page *));
984
985 ret = btrfs_check_free_space(root, write_bytes, 0);
986 if (ret)
987 goto out;
988
989 ret = prepare_pages(root, file, pages, num_pages,
990 pos, first_index, last_index,
991 write_bytes);
992 if (ret)
993 goto out;
994
995 ret = btrfs_copy_from_user(pos, num_pages,
996 write_bytes, pages, buf);
997 if (ret) {
998 btrfs_drop_pages(pages, num_pages);
999 goto out;
1000 }
1001
1002 ret = dirty_and_release_pages(NULL, root, file, pages,
1003 num_pages, pos, write_bytes);
1004 btrfs_drop_pages(pages, num_pages);
1005 if (ret)
1006 goto out;
1007
1008 if (will_write) {
1009 btrfs_fdatawrite_range(inode->i_mapping, pos,
1010 pos + write_bytes - 1,
1011 WB_SYNC_NONE);
1012 } else {
1013 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1014 num_pages);
1015 if (num_pages <
1016 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1017 btrfs_btree_balance_dirty(root, 1);
1018 btrfs_throttle(root);
1019 }
1020
1021 buf += write_bytes;
1022 count -= write_bytes;
1023 pos += write_bytes;
1024 num_written += write_bytes;
1025
1026 cond_resched();
1027 }
1028out:
1029 mutex_unlock(&inode->i_mutex);
1030
1031out_nolock:
1032 kfree(pages);
1033 if (pinned[0])
1034 page_cache_release(pinned[0]);
1035 if (pinned[1])
1036 page_cache_release(pinned[1]);
1037 *ppos = pos;
1038
1039 if (num_written > 0 && will_write) {
1040 struct btrfs_trans_handle *trans;
1041
1042 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1043 if (err)
1044 num_written = err;
1045
1046 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1047 trans = btrfs_start_transaction(root, 1);
1048 ret = btrfs_log_dentry_safe(trans, root,
1049 file->f_dentry);
1050 if (ret == 0) {
1051 btrfs_sync_log(trans, root);
1052 btrfs_end_transaction(trans, root);
1053 } else {
1054 btrfs_commit_transaction(trans, root);
1055 }
1056 }
1057 if (file->f_flags & O_DIRECT) {
1058 invalidate_mapping_pages(inode->i_mapping,
1059 start_pos >> PAGE_CACHE_SHIFT,
1060 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1061 }
1062 }
1063 current->backing_dev_info = NULL;
1064 return num_written ? num_written : err;
1065}
1066
1067int btrfs_release_file(struct inode *inode, struct file *filp)
1068{
1069 if (filp->private_data)
1070 btrfs_ioctl_trans_end(filp);
1071 return 0;
1072}
1073
1074/*
1075 * fsync call for both files and directories. This logs the inode into
1076 * the tree log instead of forcing full commits whenever possible.
1077 *
1078 * It needs to call filemap_fdatawait so that all the ordered extent
1079 * updates in the metadata btree are up to date for copying to the log.
1080 *
1081 * It drops the inode mutex before doing the tree log commit. This is an
1082 * important optimization for directories because holding the mutex prevents
1083 * new operations on the dir while we write to disk.
1084 */
1085int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1086{
1087 struct inode *inode = dentry->d_inode;
1088 struct btrfs_root *root = BTRFS_I(inode)->root;
1089 int ret = 0;
1090 struct btrfs_trans_handle *trans;
1091
1092 /*
1093 * check the transaction that last modified this inode
1094	 * and see if it's already been committed
1095 */
1096 if (!BTRFS_I(inode)->last_trans)
1097 goto out;
1098
1099 mutex_lock(&root->fs_info->trans_mutex);
1100 if (BTRFS_I(inode)->last_trans <=
1101 root->fs_info->last_trans_committed) {
1102 BTRFS_I(inode)->last_trans = 0;
1103 mutex_unlock(&root->fs_info->trans_mutex);
1104 goto out;
1105 }
1106 mutex_unlock(&root->fs_info->trans_mutex);
1107
1108 root->fs_info->tree_log_batch++;
1109 filemap_fdatawait(inode->i_mapping);
1110 root->fs_info->tree_log_batch++;
1111
1112 /*
1113	 * ok we haven't committed the transaction yet, let's do a commit
1114 */
1115 if (file->private_data)
1116 btrfs_ioctl_trans_end(file);
1117
1118 trans = btrfs_start_transaction(root, 1);
1119 if (!trans) {
1120 ret = -ENOMEM;
1121 goto out;
1122 }
1123
1124 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1125 if (ret < 0) {
1126 goto out;
1127 }
1128
1129 /* we've logged all the items and now have a consistent
1130 * version of the file in the log. It is possible that
1131 * someone will come in and modify the file, but that's
1132 * fine because the log is consistent on disk, and we
1133 * have references to all of the file's extents
1134 *
1135 * It is possible that someone will come in and log the
1136 * file again, but that will end up using the synchronization
1137 * inside btrfs_sync_log to keep things safe.
1138 */
1139 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1140
1141 if (ret > 0) {
1142 ret = btrfs_commit_transaction(trans, root);
1143 } else {
1144 btrfs_sync_log(trans, root);
1145 ret = btrfs_end_transaction(trans, root);
1146 }
1147 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1148out:
1149	return ret > 0 ? -EIO : ret;
1150}
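
/*
 * A minimal sketch of the skip-early check btrfs_sync_file() opens with,
 * using simplified stand-ins for the real structures: fsync can return
 * immediately when the transaction that last touched the inode has
 * already been committed to disk.
 */
struct demo_fs_info { u64 last_trans_committed; };
struct demo_inode { u64 last_trans; };

static int demo_fsync_can_skip(struct demo_inode *inode,
			       struct demo_fs_info *fs_info)
{
	/* never modified since mount; nothing to sync */
	if (!inode->last_trans)
		return 1;
	/* the modifying transaction is already on disk */
	if (inode->last_trans <= fs_info->last_trans_committed) {
		inode->last_trans = 0;
		return 1;
	}
	return 0;
}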
1151
1152static struct vm_operations_struct btrfs_file_vm_ops = {
1153 .fault = filemap_fault,
1154 .page_mkwrite = btrfs_page_mkwrite,
1155};
1156
1157static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1158{
1159 vma->vm_ops = &btrfs_file_vm_ops;
1160 file_accessed(filp);
1161 return 0;
1162}
1163
1164struct file_operations btrfs_file_operations = {
1165 .llseek = generic_file_llseek,
1166 .read = do_sync_read,
1167 .aio_read = generic_file_aio_read,
1168 .splice_read = generic_file_splice_read,
1169 .write = btrfs_file_write,
1170 .mmap = btrfs_file_mmap,
1171 .open = generic_file_open,
1172 .release = btrfs_release_file,
1173 .fsync = btrfs_sync_file,
1174 .unlocked_ioctl = btrfs_ioctl,
1175#ifdef CONFIG_COMPAT
1176 .compat_ioctl = btrfs_ioctl,
1177#endif
1178};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..96241f01fa0a
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,449 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
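
/*
 * A minimal sketch of the two query modes above, assuming a plain array
 * sorted by offset instead of the rbtree: contains == 1 asks for the
 * entry covering offset, contains == 0 for the first entry at or after
 * offset that is at least bytes long.
 */
struct demo_space { u64 offset; u64 bytes; };

static struct demo_space *demo_search_offset(struct demo_space *arr, int nr,
					     u64 offset, u64 bytes,
					     int contains)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (contains) {
			if (arr[i].offset <= offset &&
			    offset < arr[i].offset + arr[i].bytes)
				return &arr[i];
		} else if (arr[i].offset >= offset &&
			   arr[i].bytes >= bytes) {
			return &arr[i];	/* sorted, so this is the lowest */
		}
	}
	return NULL;
}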
111
112/*
113 * return a chunk at least bytes in size, as close to offset as we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
149			 * keep searching just in case there's a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
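
/*
 * A worked sketch of the preference order above, reusing struct
 * demo_space from the previous sketch: among the entries big enough,
 * take the tightest fit, and break size ties with the lowest entry at
 * or after the offset hint.
 */
static struct demo_space *demo_search_bytes(struct demo_space *arr, int nr,
					    u64 offset, u64 bytes)
{
	struct demo_space *ret = NULL;
	int i;

	for (i = 0; i < nr; i++) {
		if (arr[i].bytes < bytes)
			continue;			/* too small */
		if (!ret || arr[i].bytes < ret->bytes)
			ret = &arr[i];			/* tighter fit */
		else if (arr[i].bytes == ret->bytes &&
			 arr[i].offset >= offset &&
			 arr[i].offset < ret->offset)
			ret = &arr[i];			/* same fit, closer */
	}
	return ret;
}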
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202	 * are adding; if there is, remove that struct and add a new one to
203 * cover the entire range
204 */
205 spin_lock(&block_group->lock);
206
207 right_info = tree_search_offset(&block_group->free_space_offset,
208 offset+bytes, 0, 1);
209 left_info = tree_search_offset(&block_group->free_space_offset,
210 offset-1, 0, 1);
211
212 if (right_info && right_info->offset == offset+bytes) {
213 unlink_free_space(block_group, right_info);
214 info = right_info;
215 info->offset = offset;
216 info->bytes += bytes;
217 } else if (right_info && right_info->offset != offset+bytes) {
218 printk(KERN_ERR "adding space in the middle of an existing "
219 "free space area. existing: offset=%Lu, bytes=%Lu. "
220 "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
221 right_info->bytes, offset, bytes);
222 BUG();
223 }
224
225 if (left_info) {
226 unlink_free_space(block_group, left_info);
227
228 if (unlikely((left_info->offset + left_info->bytes) !=
229 offset)) {
230 printk(KERN_ERR "free space to the left of new free "
231 "space isn't quite right. existing: offset=%Lu,"
232 " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
233 left_info->offset, left_info->bytes, offset,
234 bytes);
235 BUG();
236 }
237
238 if (info) {
239 info->offset = left_info->offset;
240 info->bytes += left_info->bytes;
241 kfree(left_info);
242 } else {
243 info = left_info;
244 info->bytes += bytes;
245 }
246 }
247
248 if (info) {
249 ret = link_free_space(block_group, info);
250 if (!ret)
251 info = NULL;
252 goto out;
253 }
254
255 info = alloc_info;
256 alloc_info = NULL;
257 info->offset = offset;
258 info->bytes = bytes;
259
260 ret = link_free_space(block_group, info);
261 if (ret)
262 kfree(info);
263out:
264 spin_unlock(&block_group->lock);
265 if (ret) {
266 		printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret);
267 if (ret == -EEXIST)
268 BUG();
269 }
270
271 	kfree(alloc_info);
273
274 return ret;
275}
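
/*
 * A minimal sketch of the coalescing rule above, again with the
 * demo_space struct from the earlier sketch and the two neighbours
 * already located: a freed range swallows a right neighbour that starts
 * exactly at its end and a left neighbour that ends exactly at its
 * start, so adjacent free space stays one entry.
 */
static void demo_coalesce(u64 *offset, u64 *bytes,
			  struct demo_space *left,
			  struct demo_space *right)
{
	if (right && right->offset == *offset + *bytes)
		*bytes += right->bytes;		/* grow to the right */
	if (left && left->offset + left->bytes == *offset) {
		*offset = left->offset;		/* grow to the left */
		*bytes += left->bytes;
	}
}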
276
277int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
278 u64 offset, u64 bytes)
279{
280 struct btrfs_free_space *info;
281 int ret = 0;
282
283 spin_lock(&block_group->lock);
284 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
285 1);
286
287 if (info && info->offset == offset) {
288 if (info->bytes < bytes) {
289 			printk(KERN_ERR "Found free space at %Lu, size %Lu, "
290 			       "trying to use %Lu\n",
291 info->offset, info->bytes, bytes);
292 WARN_ON(1);
293 ret = -EINVAL;
294 goto out;
295 }
296
297 unlink_free_space(block_group, info);
298
299 if (info->bytes == bytes) {
300 kfree(info);
301 goto out;
302 }
303
304 info->offset += bytes;
305 info->bytes -= bytes;
306
307 ret = link_free_space(block_group, info);
308 BUG_ON(ret);
309 } else if (info && info->offset < offset &&
310 info->offset + info->bytes >= offset + bytes) {
311 u64 old_start = info->offset;
312 /*
313 * we're freeing space in the middle of the info,
314 * this can happen during tree log replay
315 *
316 * first unlink the old info and then
317 * insert it again after the hole we're creating
318 */
319 unlink_free_space(block_group, info);
320 if (offset + bytes < info->offset + info->bytes) {
321 u64 old_end = info->offset + info->bytes;
322
323 info->offset = offset + bytes;
324 info->bytes = old_end - info->offset;
325 ret = link_free_space(block_group, info);
326 BUG_ON(ret);
327 } else {
328 /* the hole we're creating ends at the end
329 * of the info struct, just free the info
330 */
331 kfree(info);
332 }
333
334 /* step two, insert a new info struct to cover anything
335 * before the hole
336 */
337 spin_unlock(&block_group->lock);
338 ret = btrfs_add_free_space(block_group, old_start,
339 offset - old_start);
340 BUG_ON(ret);
341 goto out_nolock;
342 } else {
343 WARN_ON(1);
344 }
345out:
346 spin_unlock(&block_group->lock);
347out_nolock:
348 return ret;
349}
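
/*
 * A minimal sketch of the middle-of-entry case above: carving the hole
 * [offset, offset + bytes) out of one larger demo_space entry leaves up
 * to two pieces, one before the hole and one after it; a zero-byte
 * piece means the hole touched that edge.
 */
static void demo_split(const struct demo_space *info, u64 offset, u64 bytes,
		       struct demo_space *before, struct demo_space *after)
{
	before->offset = info->offset;
	before->bytes = offset - info->offset;

	after->offset = offset + bytes;
	after->bytes = info->offset + info->bytes - after->offset;
}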
350
351void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
352 u64 bytes)
353{
354 struct btrfs_free_space *info;
355 struct rb_node *n;
356 int count = 0;
357
358 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
359 info = rb_entry(n, struct btrfs_free_space, offset_index);
360 if (info->bytes >= bytes)
361 count++;
364 }
365 	printk(KERN_INFO "%d blocks of free space at or bigger than %Lu bytes\n",
366 	       count, bytes);
367}
368
369u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
370{
371 struct btrfs_free_space *info;
372 struct rb_node *n;
373 u64 ret = 0;
374
375 for (n = rb_first(&block_group->free_space_offset); n;
376 n = rb_next(n)) {
377 info = rb_entry(n, struct btrfs_free_space, offset_index);
378 ret += info->bytes;
379 }
380
381 return ret;
382}
383
384void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
385{
386 struct btrfs_free_space *info;
387 struct rb_node *node;
388
389 spin_lock(&block_group->lock);
390 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
391 info = rb_entry(node, struct btrfs_free_space, bytes_index);
392 unlink_free_space(block_group, info);
393 kfree(info);
394 if (need_resched()) {
395 spin_unlock(&block_group->lock);
396 cond_resched();
397 spin_lock(&block_group->lock);
398 }
399 }
400 spin_unlock(&block_group->lock);
401}
402
403struct btrfs_free_space *btrfs_find_free_space_offset(
404				struct btrfs_block_group_cache *block_group,
405				u64 offset,
406				u64 bytes)
407{
408 struct btrfs_free_space *ret;
409
410 spin_lock(&block_group->lock);
411 ret = tree_search_offset(&block_group->free_space_offset, offset,
412 bytes, 0);
413 spin_unlock(&block_group->lock);
414
415 return ret;
416}
417
418struct btrfs_free_space *btrfs_find_free_space_bytes(
419				struct btrfs_block_group_cache *block_group,
420				u64 offset,
421				u64 bytes)
422{
423 struct btrfs_free_space *ret;
424
425 spin_lock(&block_group->lock);
426
427 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
428 spin_unlock(&block_group->lock);
429
430 return ret;
431}
432
433struct btrfs_free_space *btrfs_find_free_space(
434				struct btrfs_block_group_cache *block_group,
435				u64 offset, u64 bytes)
436{
437 struct btrfs_free_space *ret;
438
439 spin_lock(&block_group->lock);
440 ret = tree_search_offset(&block_group->free_space_offset, offset,
441 bytes, 0);
442 if (!ret)
443 ret = tree_search_bytes(&block_group->free_space_bytes,
444 offset, bytes);
445
446 spin_unlock(&block_group->lock);
447
448 return ret;
449}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
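
/*
 * A minimal bitwise sketch of the Castagnoli CRC the name hash above is
 * built on (reflected polynomial 0x82F63B78). Whether this matches the
 * kernel's crc32c() seeding and finalization bit for bit is an
 * assumption here, not something this sketch verifies.
 */
static inline u32 demo_crc32c(u32 crc, const unsigned char *data, size_t len)
{
	int k;

	while (len--) {
		crc ^= *data++;
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & (0U - (crc & 1)));
	}
	return crc;
}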
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..d93451c66ba1
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..80038c5ef7cf
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and finds a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132	/* FIXME: -ENOSPC */
133 BUG_ON(1);
134found:
135 btrfs_release_path(root, path);
136 btrfs_free_path(path);
137 BUG_ON(*objectid < search_start);
138 mutex_unlock(&root->objectid_mutex);
139 return 0;
140error:
141 btrfs_release_path(root, path);
142 btrfs_free_path(path);
143 mutex_unlock(&root->objectid_mutex);
144 return ret;
145}
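
/*
 * A minimal sketch of the hole search above, assuming the allocated
 * objectids are handed over as a sorted array rather than btree leaves:
 * return the first unused id at or after the starting hint.
 */
static u64 demo_find_free_objectid(const u64 *used, int nr, u64 start)
{
	u64 candidate = start;
	int i;

	for (i = 0; i < nr; i++) {
		if (used[i] < candidate)
			continue;
		if (used[i] > candidate)
			break;			/* found a hole */
		candidate = used[i] + 1;	/* id taken, try the next */
	}
	return candidate;
}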
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..bf4bed6ca4d6
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,3908 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "volumes.h"
47#include "ordered-data.h"
48#include "xattr.h"
49#include "compat.h"
50#include "tree-log.h"
51#include "ref-cache.h"
52
53struct btrfs_iget_args {
54 u64 ino;
55 struct btrfs_root *root;
56};
57
58static struct inode_operations btrfs_dir_inode_operations;
59static struct inode_operations btrfs_symlink_inode_operations;
60static struct inode_operations btrfs_dir_ro_inode_operations;
61static struct inode_operations btrfs_special_inode_operations;
62static struct inode_operations btrfs_file_inode_operations;
63static struct address_space_operations btrfs_aops;
64static struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops;
67
68static struct kmem_cache *btrfs_inode_cachep;
69struct kmem_cache *btrfs_trans_handle_cachep;
70struct kmem_cache *btrfs_transaction_cachep;
71struct kmem_cache *btrfs_bit_radix_cachep;
72struct kmem_cache *btrfs_path_cachep;
73
74#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
76 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
77 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
78 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
79 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
80 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
81 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83};
84
85static void btrfs_truncate(struct inode *inode);
86
87/*
88 * a very lame attempt at stopping writes when the FS is 85% full. There
89 * are countless ways this is incorrect, but it is better than nothing.
90 */
91int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
92 int for_del)
93{
94 u64 total;
95 u64 used;
96 u64 thresh;
97 unsigned long flags;
98 int ret = 0;
99
100 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
101 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
102 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
103 if (for_del)
104 thresh = total * 90;
105 else
106 thresh = total * 85;
107
108 do_div(thresh, 100);
109
110 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
111 ret = -ENOSPC;
112 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
113 return ret;
114}
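
/*
 * A worked sketch of the check above with the locking stripped out: on
 * a 100GB filesystem the write threshold is 85GB (90GB when deleting,
 * so unlinks still work on a nearly full disk), and a request fails
 * once used bytes plus outstanding delalloc plus the new bytes would
 * pass it.
 */
static int demo_check_free_space(u64 total, u64 used, u64 delalloc,
				 u64 num_required, int for_del)
{
	u64 thresh = total * (for_del ? 90 : 85);

	do_div(thresh, 100);
	if (used + delalloc + num_required > thresh)
		return -ENOSPC;
	return 0;
}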
115
116/*
117 * when extent_io.c finds a delayed allocation range in the file,
118 * the callbacks end up in this code. The basic idea is to
119 * allocate extents on disk for the range, and create ordered data structs
120 * in ram to track those extents.
121 */
122static int cow_file_range(struct inode *inode, u64 start, u64 end)
123{
124 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct btrfs_trans_handle *trans;
126 u64 alloc_hint = 0;
127 u64 num_bytes;
128 u64 cur_alloc_size;
129 u64 blocksize = root->sectorsize;
130 u64 orig_num_bytes;
131 struct btrfs_key ins;
132 struct extent_map *em;
133 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
134 int ret = 0;
135
136 trans = btrfs_join_transaction(root, 1);
137 BUG_ON(!trans);
138 btrfs_set_trans_block_group(trans, inode);
139
140 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
141 num_bytes = max(blocksize, num_bytes);
142 orig_num_bytes = num_bytes;
143
144 if (alloc_hint == EXTENT_MAP_INLINE)
145 goto out;
146
147 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
148 mutex_lock(&BTRFS_I(inode)->extent_mutex);
149 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
150 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
151
152 	while (num_bytes > 0) {
153 cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
154 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
155 root->sectorsize, 0, alloc_hint,
156 (u64)-1, &ins, 1);
157 if (ret) {
158 WARN_ON(1);
159 goto out;
160 }
161 em = alloc_extent_map(GFP_NOFS);
162 em->start = start;
163 em->len = ins.offset;
164 em->block_start = ins.objectid;
165 em->bdev = root->fs_info->fs_devices->latest_bdev;
166 mutex_lock(&BTRFS_I(inode)->extent_mutex);
167 set_bit(EXTENT_FLAG_PINNED, &em->flags);
168 		while (1) {
169 spin_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em);
171 spin_unlock(&em_tree->lock);
172 if (ret != -EEXIST) {
173 free_extent_map(em);
174 break;
175 }
176 btrfs_drop_extent_cache(inode, start,
177 start + ins.offset - 1, 0);
178 }
179 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
180
181 cur_alloc_size = ins.offset;
182 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
183 ins.offset, 0);
184 BUG_ON(ret);
185 if (num_bytes < cur_alloc_size) {
186 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
187 cur_alloc_size);
188 break;
189 }
190 num_bytes -= cur_alloc_size;
191 alloc_hint = ins.objectid + ins.offset;
192 start += cur_alloc_size;
193 }
194out:
195 btrfs_end_transaction(trans, root);
196 return ret;
197}
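
/*
 * A minimal sketch of the size rounding cow_file_range() starts with,
 * assuming blocksize is a power of two: the inclusive byte range
 * [start, end] is widened to a whole number of blocks before any
 * extents are reserved.
 */
static u64 demo_round_to_blocks(u64 start, u64 end, u64 blocksize)
{
	/* end is inclusive, so the byte length is end - start + 1 */
	return (end - start + blocksize) & ~(blocksize - 1);
}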
198
199/*
200 * the nocow writeback callback. This checks for snapshots or COW copies
201 * of the extents that exist in the file, and COWs the file as required.
202 *
203 * If no cow copies or snapshots exist, we write directly to the existing
204 * blocks on disk
205 */
206static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
207{
208 u64 extent_start;
209 u64 extent_end;
210 u64 bytenr;
211 u64 loops = 0;
212 u64 total_fs_bytes;
213 struct btrfs_root *root = BTRFS_I(inode)->root;
214 struct btrfs_block_group_cache *block_group;
215 struct btrfs_trans_handle *trans;
216 struct extent_buffer *leaf;
217 int found_type;
218 struct btrfs_path *path;
219 struct btrfs_file_extent_item *item;
220 int ret;
221 int err = 0;
222 struct btrfs_key found_key;
223
224 total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
225 path = btrfs_alloc_path();
226 BUG_ON(!path);
227 trans = btrfs_join_transaction(root, 1);
228 BUG_ON(!trans);
229again:
230 ret = btrfs_lookup_file_extent(NULL, root, path,
231 inode->i_ino, start, 0);
232 if (ret < 0) {
233 err = ret;
234 goto out;
235 }
236
237 if (ret != 0) {
238 if (path->slots[0] == 0)
239 goto not_found;
240 path->slots[0]--;
241 }
242
243 leaf = path->nodes[0];
244 item = btrfs_item_ptr(leaf, path->slots[0],
245 struct btrfs_file_extent_item);
246
247 /* are we inside the extent that was found? */
248 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
249 found_type = btrfs_key_type(&found_key);
250 if (found_key.objectid != inode->i_ino ||
251 found_type != BTRFS_EXTENT_DATA_KEY)
252 goto not_found;
253
254 found_type = btrfs_file_extent_type(leaf, item);
255 extent_start = found_key.offset;
256 if (found_type == BTRFS_FILE_EXTENT_REG) {
257 u64 extent_num_bytes;
258
259 extent_num_bytes = btrfs_file_extent_num_bytes(leaf, item);
260 extent_end = extent_start + extent_num_bytes;
261 err = 0;
262
263 if (loops && start != extent_start)
264 goto not_found;
265
266 if (start < extent_start || start >= extent_end)
267 goto not_found;
268
269 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
270 if (bytenr == 0)
271 goto not_found;
272
273 if (btrfs_cross_ref_exists(trans, root, &found_key, bytenr))
274 goto not_found;
275 /*
276 * we may be called by the resizer, make sure we're inside
277 * the limits of the FS
278 */
279 block_group = btrfs_lookup_block_group(root->fs_info,
280 bytenr);
281 if (!block_group || block_group->ro)
282 goto not_found;
283
284 bytenr += btrfs_file_extent_offset(leaf, item);
285 extent_num_bytes = min(end + 1, extent_end) - start;
286 ret = btrfs_add_ordered_extent(inode, start, bytenr,
287 extent_num_bytes, 1);
288 if (ret) {
289 err = ret;
290 goto out;
291 }
292
293 btrfs_release_path(root, path);
294 start = extent_end;
295 if (start <= end) {
296 loops++;
297 goto again;
298 }
299 } else {
300not_found:
301 btrfs_end_transaction(trans, root);
302 btrfs_free_path(path);
303 return cow_file_range(inode, start, end);
304 }
305out:
306 WARN_ON(err);
307 btrfs_end_transaction(trans, root);
308 btrfs_free_path(path);
309 return err;
310}
311
312/*
313 * extent_io.c call back to do delayed allocation processing
314 */
315static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
316{
317 struct btrfs_root *root = BTRFS_I(inode)->root;
318 int ret;
319
320 if (btrfs_test_opt(root, NODATACOW) ||
321 btrfs_test_flag(inode, NODATACOW))
322 ret = run_delalloc_nocow(inode, start, end);
323 else
324 ret = cow_file_range(inode, start, end);
325
326 return ret;
327}
328
329/*
330 * extent_io.c set_bit_hook, used to track delayed allocation
331 * bytes in this file, and to maintain the list of inodes that
332 * have pending delalloc work to be done.
333 */
334int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
335 unsigned long old, unsigned long bits)
336{
337 unsigned long flags;
338 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
339 struct btrfs_root *root = BTRFS_I(inode)->root;
340 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
341 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
342 root->fs_info->delalloc_bytes += end - start + 1;
343 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
344 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
345 &root->fs_info->delalloc_inodes);
346 }
347 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
348 }
349 return 0;
350}
351
352/*
353 * extent_io.c clear_bit_hook, see set_bit_hook for why
354 */
355int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
356 unsigned long old, unsigned long bits)
357{
358 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
359 struct btrfs_root *root = BTRFS_I(inode)->root;
360 unsigned long flags;
361
362 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
363 if (end - start + 1 > root->fs_info->delalloc_bytes) {
364 printk("warning: delalloc account %Lu %Lu\n",
365 end - start + 1, root->fs_info->delalloc_bytes);
366 root->fs_info->delalloc_bytes = 0;
367 BTRFS_I(inode)->delalloc_bytes = 0;
368 } else {
369 root->fs_info->delalloc_bytes -= end - start + 1;
370 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
371 }
372 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
373 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
374 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
375 }
376 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
377 }
378 return 0;
379}
380
381/*
382 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
383 * we don't create bios that span stripes or chunks
384 */
385int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
386 size_t size, struct bio *bio)
387{
388 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
389 struct btrfs_mapping_tree *map_tree;
390 u64 logical = (u64)bio->bi_sector << 9;
391 u64 length = 0;
392 u64 map_length;
393 int ret;
394
395 length = bio->bi_size;
396 map_tree = &root->fs_info->mapping_tree;
397 map_length = length;
398 ret = btrfs_map_block(map_tree, READ, logical,
399 &map_length, NULL, 0);
400
401 if (map_length < length + size) {
402 return 1;
403 }
404 return 0;
405}
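
/*
 * A minimal sketch of the span test above, assuming fixed power-of-two
 * stripes instead of a chunk-tree lookup: a bio must not grow past the
 * stripe its first byte lives in, which is what the map_length
 * comparison enforces.
 */
static int demo_bio_would_span(u64 logical, u64 bio_size, u64 add_size,
			       u64 stripe_len)
{
	u64 left_in_stripe = stripe_len - (logical & (stripe_len - 1));

	return bio_size + add_size > left_in_stripe;
}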
406
407/*
408 * in order to insert checksums into the metadata in large chunks,
409 * we wait until bio submission time. All the pages in the bio are
410 * checksummed and sums are attached onto the ordered extent record.
411 *
412 * At IO completion time the cums attached on the ordered extent record
413 * are inserted into the btree
414 */
415int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
416 int mirror_num)
417{
418 struct btrfs_root *root = BTRFS_I(inode)->root;
419 int ret = 0;
420
421 ret = btrfs_csum_one_bio(root, inode, bio);
422 BUG_ON(ret);
423
424 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
425}
426
427/*
428 * extent_io.c submission hook. This does the right thing for csum
429 * calculation on write, or reads the csums from the tree before a read
430 */
431int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
432 int mirror_num)
433{
434 struct btrfs_root *root = BTRFS_I(inode)->root;
435 int ret = 0;
436
437 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
438 BUG_ON(ret);
439
440 if (btrfs_test_opt(root, NODATASUM) ||
441 btrfs_test_flag(inode, NODATASUM)) {
442 goto mapit;
443 }
444
445 if (!(rw & (1 << BIO_RW))) {
446 btrfs_lookup_bio_sums(root, inode, bio);
447 goto mapit;
448 }
449 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
450 inode, rw, bio, mirror_num,
451 __btrfs_submit_bio_hook);
452mapit:
453 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
454}
455
456/*
457 * given a list of ordered sums record them in the inode. This happens
458 * at IO completion time based on sums calculated at bio submission time.
459 */
460static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
461 struct inode *inode, u64 file_offset,
462 struct list_head *list)
463{
464 struct list_head *cur;
465 struct btrfs_ordered_sum *sum;
466
467 btrfs_set_trans_block_group(trans, inode);
468 list_for_each(cur, list) {
469 sum = list_entry(cur, struct btrfs_ordered_sum, list);
470 btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
471 inode, sum);
472 }
473 return 0;
474}
475
476int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
477{
478 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
479 GFP_NOFS);
480}
481
482/* see btrfs_writepage_start_hook for details on why this is required */
483struct btrfs_writepage_fixup {
484 struct page *page;
485 struct btrfs_work work;
486};
487
488void btrfs_writepage_fixup_worker(struct btrfs_work *work)
489{
490 struct btrfs_writepage_fixup *fixup;
491 struct btrfs_ordered_extent *ordered;
492 struct page *page;
493 struct inode *inode;
494 u64 page_start;
495 u64 page_end;
496
497 fixup = container_of(work, struct btrfs_writepage_fixup, work);
498 page = fixup->page;
499again:
500 lock_page(page);
501 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
502 ClearPageChecked(page);
503 goto out_page;
504 }
505
506 inode = page->mapping->host;
507 page_start = page_offset(page);
508 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
509
510 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
511
512 /* already ordered? We're done */
513 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
514 EXTENT_ORDERED, 0)) {
515 goto out;
516 }
517
518 ordered = btrfs_lookup_ordered_extent(inode, page_start);
519 if (ordered) {
520 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
521 page_end, GFP_NOFS);
522 unlock_page(page);
523 btrfs_start_ordered_extent(inode, ordered, 1);
524 goto again;
525 }
526
527 btrfs_set_extent_delalloc(inode, page_start, page_end);
528 ClearPageChecked(page);
529out:
530 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
531out_page:
532 unlock_page(page);
533 page_cache_release(page);
534}
535
536/*
537 * There are a few paths in the higher layers of the kernel that directly
538 * set the page dirty bit without asking the filesystem if it is a
539 * good idea. This causes problems because we want to make sure COW
540 * properly happens and the data=ordered rules are followed.
541 *
542 * In our case any range that doesn't have the EXTENT_ORDERED bit set
543 * hasn't been properly set up for IO. We kick off an async process
544 * to fix it up. The async helper will wait for ordered extents, set
545 * the delalloc bit and make it safe to write the page.
546 */
547int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
548{
549 struct inode *inode = page->mapping->host;
550 struct btrfs_writepage_fixup *fixup;
551 struct btrfs_root *root = BTRFS_I(inode)->root;
552 int ret;
553
554 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
555 EXTENT_ORDERED, 0);
556 if (ret)
557 return 0;
558
559 if (PageChecked(page))
560 return -EAGAIN;
561
562 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
563 if (!fixup)
564 return -EAGAIN;
565
566 SetPageChecked(page);
567 page_cache_get(page);
568 fixup->work.func = btrfs_writepage_fixup_worker;
569 fixup->page = page;
570 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
571 return -EAGAIN;
572}
573
574/* as ordered data IO finishes, this gets called so we can finish
575 * an ordered extent once the range of bytes in the file it covers is
576 * fully written.
577 */
578static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
579{
580 struct btrfs_root *root = BTRFS_I(inode)->root;
581 struct btrfs_trans_handle *trans;
582 struct btrfs_ordered_extent *ordered_extent;
583 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
584 struct btrfs_file_extent_item *extent_item;
585 struct btrfs_path *path = NULL;
586 struct extent_buffer *leaf;
587 u64 alloc_hint = 0;
588 struct list_head list;
589 struct btrfs_key ins;
590 int ret;
591
592 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
593 if (!ret)
594 return 0;
595
596 trans = btrfs_join_transaction(root, 1);
597
598 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
599 BUG_ON(!ordered_extent);
600 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
601 goto nocow;
602
603 path = btrfs_alloc_path();
604 BUG_ON(!path);
605
606 lock_extent(io_tree, ordered_extent->file_offset,
607 ordered_extent->file_offset + ordered_extent->len - 1,
608 GFP_NOFS);
609
610 INIT_LIST_HEAD(&list);
611
612 mutex_lock(&BTRFS_I(inode)->extent_mutex);
613
614 ret = btrfs_drop_extents(trans, root, inode,
615 ordered_extent->file_offset,
616 ordered_extent->file_offset +
617 ordered_extent->len,
618 ordered_extent->file_offset, &alloc_hint);
619 BUG_ON(ret);
620
621 ins.objectid = inode->i_ino;
622 ins.offset = ordered_extent->file_offset;
623 ins.type = BTRFS_EXTENT_DATA_KEY;
624 ret = btrfs_insert_empty_item(trans, root, path, &ins,
625 sizeof(*extent_item));
626 BUG_ON(ret);
627 leaf = path->nodes[0];
628 extent_item = btrfs_item_ptr(leaf, path->slots[0],
629 struct btrfs_file_extent_item);
630 btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
631 btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
632 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
633 ordered_extent->start);
634 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
635 ordered_extent->len);
636 btrfs_set_file_extent_offset(leaf, extent_item, 0);
637 btrfs_set_file_extent_num_bytes(leaf, extent_item,
638 ordered_extent->len);
639 btrfs_mark_buffer_dirty(leaf);
640
641 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
642 ordered_extent->file_offset +
643 ordered_extent->len - 1, 0);
644 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
645
646 ins.objectid = ordered_extent->start;
647 ins.offset = ordered_extent->len;
648 ins.type = BTRFS_EXTENT_ITEM_KEY;
649 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
650 root->root_key.objectid,
651 trans->transid, inode->i_ino, &ins);
652 BUG_ON(ret);
653 btrfs_release_path(root, path);
654
655 inode_add_bytes(inode, ordered_extent->len);
656 unlock_extent(io_tree, ordered_extent->file_offset,
657 ordered_extent->file_offset + ordered_extent->len - 1,
658 GFP_NOFS);
659nocow:
660 add_pending_csums(trans, inode, ordered_extent->file_offset,
661 &ordered_extent->list);
662
663 mutex_lock(&BTRFS_I(inode)->extent_mutex);
664 btrfs_ordered_update_i_size(inode, ordered_extent);
665 btrfs_update_inode(trans, root, inode);
666 btrfs_remove_ordered_extent(inode, ordered_extent);
667 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
668
669 /* once for us */
670 btrfs_put_ordered_extent(ordered_extent);
671 /* once for the tree */
672 btrfs_put_ordered_extent(ordered_extent);
673
674 btrfs_end_transaction(trans, root);
675 if (path)
676 btrfs_free_path(path);
677 return 0;
678}
679
680int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
681 struct extent_state *state, int uptodate)
682{
683 return btrfs_finish_ordered_io(page->mapping->host, start, end);
684}
685
686/*
687 * When IO fails, either with EIO or csum verification fails, we
688 * try other mirrors that might have a good copy of the data. This
689 * io_failure_record is used to record state as we go through all the
690 * mirrors. If another mirror has good data, the page is set up to date
691 * and things continue. If a good mirror can't be found, the original
692 * bio end_io callback is called to indicate things have failed.
693 */
694struct io_failure_record {
695 struct page *page;
696 u64 start;
697 u64 len;
698 u64 logical;
699 int last_mirror;
700};
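
/*
 * A minimal sketch of the retry policy the record above drives; the
 * demo_read_mirror callback is a hypothetical stand-in for
 * resubmitting the bio against one mirror. Mirrors are walked in
 * order, and EIO is returned only once every copy has been tried.
 */
static int demo_retry_mirrors(struct io_failure_record *failrec,
			      int num_copies,
			      int (*demo_read_mirror)(u64 logical, u64 len,
						      int mirror))
{
	while (failrec->last_mirror < num_copies) {
		failrec->last_mirror++;
		if (demo_read_mirror(failrec->logical, failrec->len,
				     failrec->last_mirror) == 0)
			return 0;	/* a mirror had a good copy */
	}
	return -EIO;			/* all copies failed */
}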
701
702int btrfs_io_failed_hook(struct bio *failed_bio,
703 struct page *page, u64 start, u64 end,
704 struct extent_state *state)
705{
706 struct io_failure_record *failrec = NULL;
707 u64 private;
708 struct extent_map *em;
709 struct inode *inode = page->mapping->host;
710 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
711 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
712 struct bio *bio;
713 int num_copies;
714 int ret;
715 int rw;
716 u64 logical;
717
718 ret = get_state_private(failure_tree, start, &private);
719 if (ret) {
720 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
721 if (!failrec)
722 return -ENOMEM;
723 failrec->start = start;
724 failrec->len = end - start + 1;
725 failrec->last_mirror = 0;
726
727 spin_lock(&em_tree->lock);
728 em = lookup_extent_mapping(em_tree, start, failrec->len);
729 		if (em && (em->start > start || em->start + em->len < start)) {
730 free_extent_map(em);
731 em = NULL;
732 }
733 spin_unlock(&em_tree->lock);
734
735 if (!em || IS_ERR(em)) {
736 kfree(failrec);
737 return -EIO;
738 }
739 logical = start - em->start;
740 logical = em->block_start + logical;
741 failrec->logical = logical;
742 free_extent_map(em);
743 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
744 EXTENT_DIRTY, GFP_NOFS);
745 set_state_private(failure_tree, start,
746 (u64)(unsigned long)failrec);
747 } else {
748 failrec = (struct io_failure_record *)(unsigned long)private;
749 }
750 num_copies = btrfs_num_copies(
751 &BTRFS_I(inode)->root->fs_info->mapping_tree,
752 failrec->logical, failrec->len);
753 failrec->last_mirror++;
754 if (!state) {
755 spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
756 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
757 failrec->start,
758 EXTENT_LOCKED);
759 if (state && state->start != failrec->start)
760 state = NULL;
761 spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
762 }
763 if (!state || failrec->last_mirror > num_copies) {
764 set_state_private(failure_tree, failrec->start, 0);
765 clear_extent_bits(failure_tree, failrec->start,
766 failrec->start + failrec->len - 1,
767 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
768 kfree(failrec);
769 return -EIO;
770 }
771 bio = bio_alloc(GFP_NOFS, 1);
772 bio->bi_private = state;
773 bio->bi_end_io = failed_bio->bi_end_io;
774 bio->bi_sector = failrec->logical >> 9;
775 bio->bi_bdev = failed_bio->bi_bdev;
776 bio->bi_size = 0;
777 bio_add_page(bio, page, failrec->len, start - page_offset(page));
778 if (failed_bio->bi_rw & (1 << BIO_RW))
779 rw = WRITE;
780 else
781 rw = READ;
782
783 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
784 failrec->last_mirror);
785 return 0;
786}
787
788/*
789 * each time an IO finishes, we do a fast check in the IO failure tree
790 * to see if we need to process or clean up an io_failure_record
791 */
792int btrfs_clean_io_failures(struct inode *inode, u64 start)
793{
794 u64 private;
795 u64 private_failure;
796 struct io_failure_record *failure;
797 int ret;
798
799 private = 0;
800 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
801 (u64)-1, 1, EXTENT_DIRTY)) {
802 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
803 start, &private_failure);
804 if (ret == 0) {
805 failure = (struct io_failure_record *)(unsigned long)
806 private_failure;
807 set_state_private(&BTRFS_I(inode)->io_failure_tree,
808 failure->start, 0);
809 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
810 failure->start,
811 failure->start + failure->len - 1,
812 EXTENT_DIRTY | EXTENT_LOCKED,
813 GFP_NOFS);
814 kfree(failure);
815 }
816 }
817 return 0;
818}
819
820/*
821 * when reads are done, we need to check csums to verify the data is correct
822 * if there's a match, we allow the bio to finish. If not, we go through
823 * the io_failure_record routines to find good copies
824 */
825int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
826 struct extent_state *state)
827{
828 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
829 struct inode *inode = page->mapping->host;
830 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
831 char *kaddr;
832 u64 private = ~(u32)0;
833 int ret;
834 struct btrfs_root *root = BTRFS_I(inode)->root;
835 u32 csum = ~(u32)0;
836 unsigned long flags;
837
838 if (btrfs_test_opt(root, NODATASUM) ||
839 btrfs_test_flag(inode, NODATASUM))
840 return 0;
841 if (state && state->start == start) {
842 private = state->private;
843 ret = 0;
844 } else {
845 ret = get_state_private(io_tree, start, &private);
846 }
847 local_irq_save(flags);
848 kaddr = kmap_atomic(page, KM_IRQ0);
849 if (ret) {
850 goto zeroit;
851 }
852 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
853 btrfs_csum_final(csum, (char *)&csum);
854 if (csum != private) {
855 goto zeroit;
856 }
857 kunmap_atomic(kaddr, KM_IRQ0);
858 local_irq_restore(flags);
859
860 /* if the io failure tree for this inode is non-empty,
861 * check to see if we've recovered from a failed IO
862 */
863 btrfs_clean_io_failures(inode, start);
864 return 0;
865
866zeroit:
867 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
868 "private %llu\n", page->mapping->host->i_ino,
869 (unsigned long long)start, csum, (unsigned long long)private);
870 memset(kaddr + offset, 1, end - start + 1);
871 flush_dcache_page(page);
872 kunmap_atomic(kaddr, KM_IRQ0);
873 local_irq_restore(flags);
874 if (private == 0)
875 return 0;
876 return -EIO;
877}
878
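/*
 * Illustrative sketch (not part of the original file): the two-step
 * checksum calculation used in btrfs_readpage_end_io_hook() above.
 * btrfs_csum_data() folds the buffer into a running crc32c value and
 * btrfs_csum_final() finishes it in place. The helper name and the
 * idea of csumming an arbitrary buffer are assumptions for the example.
 */
#if 0
static u32 example_csum_buffer(struct btrfs_root *root, char *kaddr,
			       size_t len)
{
	u32 csum = ~(u32)0;	/* the seed used throughout this file */

	csum = btrfs_csum_data(root, kaddr, csum, len);
	btrfs_csum_final(csum, (char *)&csum);
	return csum;
}
#endif
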
879/*
880 * This creates an orphan entry for the given inode in case something goes
881 * wrong in the middle of an unlink/truncate.
882 */
883int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
884{
885 struct btrfs_root *root = BTRFS_I(inode)->root;
886 int ret = 0;
887
888 spin_lock(&root->list_lock);
889
890 /* already on the orphan list, we're good */
891 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
892 spin_unlock(&root->list_lock);
893 return 0;
894 }
895
896 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
897
898 spin_unlock(&root->list_lock);
899
900 /*
901 * insert an orphan item to track this unlinked/truncated file
902 */
903 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
904
905 return ret;
906}
907
908/*
909 * We have done the truncate/delete so we can go ahead and remove the orphan
910 * item for this particular inode.
911 */
912int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
913{
914 struct btrfs_root *root = BTRFS_I(inode)->root;
915 int ret = 0;
916
917 spin_lock(&root->list_lock);
918
919 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
920 spin_unlock(&root->list_lock);
921 return 0;
922 }
923
924 list_del_init(&BTRFS_I(inode)->i_orphan);
925 if (!trans) {
926 spin_unlock(&root->list_lock);
927 return 0;
928 }
929
930 spin_unlock(&root->list_lock);
931
932 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
933
934 return ret;
935}
936
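/*
 * Illustrative sketch (not in the original source): how the orphan item
 * is meant to bracket a truncate so that a crash in the middle can be
 * recovered by btrfs_orphan_cleanup() below. Error unwinding is elided,
 * and the helper name and min_type choice are assumptions.
 */
#if 0
static int example_truncate_with_orphan(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct inode *inode, u64 new_size)
{
	int ret;

	/* mark the inode so a crash leaves a record to clean up */
	ret = btrfs_orphan_add(trans, inode);
	if (ret)
		return ret;
	ret = btrfs_truncate_inode_items(trans, root, inode, new_size,
					 BTRFS_EXTENT_DATA_KEY);
	if (ret)
		return ret;
	/* truncate finished, the orphan record is no longer needed */
	return btrfs_orphan_del(trans, inode);
}
#endif
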
937/*
938 * this cleans up any orphans that may be left on the list from the last use
939 * of this root.
940 */
941void btrfs_orphan_cleanup(struct btrfs_root *root)
942{
943 struct btrfs_path *path;
944 struct extent_buffer *leaf;
945 struct btrfs_item *item;
946 struct btrfs_key key, found_key;
947 struct btrfs_trans_handle *trans;
948 struct inode *inode;
949 int ret = 0, nr_unlink = 0, nr_truncate = 0;
950
951 /* don't do orphan cleanup if the fs is readonly. */
952 if (root->fs_info->sb->s_flags & MS_RDONLY)
953 return;
954
955 path = btrfs_alloc_path();
956 if (!path)
957 return;
958 path->reada = -1;
959
960 key.objectid = BTRFS_ORPHAN_OBJECTID;
961 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
962 key.offset = (u64)-1;
963
964
965 while (1) {
966 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
967 if (ret < 0) {
968 printk(KERN_ERR "Error searching slot for orphan: %d\n",
969 ret);
970 break;
971 }
972
973 /*
974 * ret == 0 means we found exactly what we were searching for, which
975 * is odd but possible. Only adjust the path if we didn't find the
976 * key, then check whether the item we landed on matches.
977 */
978 if (ret > 0) {
979 if (path->slots[0] == 0)
980 break;
981 path->slots[0]--;
982 }
983
984 /* pull out the item */
985 leaf = path->nodes[0];
986 item = btrfs_item_nr(leaf, path->slots[0]);
987 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
988
989 /* make sure the item matches what we want */
990 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
991 break;
992 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
993 break;
994
995 /* release the path since we're done with it */
996 btrfs_release_path(root, path);
997
998 /*
999 * this is basically btrfs_lookup, minus the root-crossing logic.
1000 * we store the inode number in the offset field of the orphan
1001 * item.
1002 */
1003 inode = btrfs_iget_locked(root->fs_info->sb,
1004 found_key.offset, root);
1005 if (!inode)
1006 break;
1007
1008 if (inode->i_state & I_NEW) {
1009 BTRFS_I(inode)->root = root;
1010
1011 /* have to set the location manually */
1012 BTRFS_I(inode)->location.objectid = inode->i_ino;
1013 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1014 BTRFS_I(inode)->location.offset = 0;
1015
1016 btrfs_read_locked_inode(inode);
1017 unlock_new_inode(inode);
1018 }
1019
1020 /*
1021 * add this inode to the orphan list so btrfs_orphan_del does
1022 * the proper thing when we hit it
1023 */
1024 spin_lock(&root->list_lock);
1025 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1026 spin_unlock(&root->list_lock);
1027
1028 /*
1029 * if this is a bad inode, means we actually succeeded in
1030 * removing the inode, but not the orphan record, which means
1031 * we need to manually delete the orphan since iput will just
1032 * do a destroy_inode
1033 */
1034 if (is_bad_inode(inode)) {
1035 trans = btrfs_start_transaction(root, 1);
1036 btrfs_orphan_del(trans, inode);
1037 btrfs_end_transaction(trans, root);
1038 iput(inode);
1039 continue;
1040 }
1041
1042 /* if we have links, this was a truncate, lets do that */
1043 if (inode->i_nlink) {
1044 nr_truncate++;
1045 btrfs_truncate(inode);
1046 } else {
1047 nr_unlink++;
1048 }
1049
1050 /* this will do delete_inode and everything for us */
1051 iput(inode);
1052 }
1053
1054 if (nr_unlink)
1055 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1056 if (nr_truncate)
1057 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1058
1059 btrfs_free_path(path);
1060}
1061
1062/*
1063 * read an inode from the btree into the in-memory inode
1064 */
1065void btrfs_read_locked_inode(struct inode *inode)
1066{
1067 struct btrfs_path *path;
1068 struct extent_buffer *leaf;
1069 struct btrfs_inode_item *inode_item;
1070 struct btrfs_timespec *tspec;
1071 struct btrfs_root *root = BTRFS_I(inode)->root;
1072 struct btrfs_key location;
1073 u64 alloc_group_block;
1074 u32 rdev;
1075 int ret;
1076
1077 path = btrfs_alloc_path();
1078 BUG_ON(!path);
1079 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1080
1081 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1082 if (ret)
1083 goto make_bad;
1084
1085 leaf = path->nodes[0];
1086 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1087 struct btrfs_inode_item);
1088
1089 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1090 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1091 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1092 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1093 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1094
1095 tspec = btrfs_inode_atime(inode_item);
1096 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1097 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1098
1099 tspec = btrfs_inode_mtime(inode_item);
1100 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1101 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1102
1103 tspec = btrfs_inode_ctime(inode_item);
1104 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1105 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1106
1107 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
1108 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
1109 inode->i_generation = BTRFS_I(inode)->generation;
1110 inode->i_rdev = 0;
1111 rdev = btrfs_inode_rdev(leaf, inode_item);
1112
1113 BTRFS_I(inode)->index_cnt = (u64)-1;
1114
1115 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
1116 BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
1117 alloc_group_block);
1118 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
1119 if (!BTRFS_I(inode)->block_group) {
1120 BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
1121 NULL, 0,
1122 BTRFS_BLOCK_GROUP_METADATA, 0);
1123 }
1124 btrfs_free_path(path);
1125 inode_item = NULL;
1126
1127 switch (inode->i_mode & S_IFMT) {
1128 case S_IFREG:
1129 inode->i_mapping->a_ops = &btrfs_aops;
1130 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1131 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
1132 inode->i_fop = &btrfs_file_operations;
1133 inode->i_op = &btrfs_file_inode_operations;
1134 break;
1135 case S_IFDIR:
1136 inode->i_fop = &btrfs_dir_file_operations;
1137 if (root == root->fs_info->tree_root)
1138 inode->i_op = &btrfs_dir_ro_inode_operations;
1139 else
1140 inode->i_op = &btrfs_dir_inode_operations;
1141 break;
1142 case S_IFLNK:
1143 inode->i_op = &btrfs_symlink_inode_operations;
1144 inode->i_mapping->a_ops = &btrfs_symlink_aops;
1145 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1146 break;
1147 default:
1148 init_special_inode(inode, inode->i_mode, rdev);
1149 break;
1150 }
1151 return;
1152
1153make_bad:
1154 btrfs_free_path(path);
1155 make_bad_inode(inode);
1156}
1157
1158/*
1159 * given a leaf and an inode, copy the inode fields into the leaf
1160 */
1161static void fill_inode_item(struct btrfs_trans_handle *trans,
1162 struct extent_buffer *leaf,
1163 struct btrfs_inode_item *item,
1164 struct inode *inode)
1165{
1166 btrfs_set_inode_uid(leaf, item, inode->i_uid);
1167 btrfs_set_inode_gid(leaf, item, inode->i_gid);
1168 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
1169 btrfs_set_inode_mode(leaf, item, inode->i_mode);
1170 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
1171
1172 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
1173 inode->i_atime.tv_sec);
1174 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
1175 inode->i_atime.tv_nsec);
1176
1177 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
1178 inode->i_mtime.tv_sec);
1179 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
1180 inode->i_mtime.tv_nsec);
1181
1182 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
1183 inode->i_ctime.tv_sec);
1184 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
1185 inode->i_ctime.tv_nsec);
1186
1187 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
1188 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
1189 btrfs_set_inode_transid(leaf, item, trans->transid);
1190 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
1191 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
1192 btrfs_set_inode_block_group(leaf, item,
1193 BTRFS_I(inode)->block_group->key.objectid);
1194}
1195
1196/*
1197 * copy everything in the in-memory inode into the btree.
1198 */
1199noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
1200 struct btrfs_root *root,
1201 struct inode *inode)
1202{
1203 struct btrfs_inode_item *inode_item;
1204 struct btrfs_path *path;
1205 struct extent_buffer *leaf;
1206 int ret;
1207
1208 path = btrfs_alloc_path();
1209 BUG_ON(!path);
1210 ret = btrfs_lookup_inode(trans, root, path,
1211 &BTRFS_I(inode)->location, 1);
1212 if (ret) {
1213 if (ret > 0)
1214 ret = -ENOENT;
1215 goto failed;
1216 }
1217
1218 leaf = path->nodes[0];
1219 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1220 struct btrfs_inode_item);
1221
1222 fill_inode_item(trans, leaf, inode_item, inode);
1223 btrfs_mark_buffer_dirty(leaf);
1224 btrfs_set_inode_last_trans(trans, inode);
1225 ret = 0;
1226failed:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
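/*
 * Illustrative sketch (not in the original source): the usual pattern
 * for persisting an in-memory inode change with btrfs_update_inode().
 * The ctime update is just an example mutation; error handling from
 * btrfs_start_transaction() is elided.
 */
#if 0
static int example_touch_ctime(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	trans = btrfs_start_transaction(root, 1);
	inode->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, inode);
	btrfs_end_transaction(trans, root);
	return ret;
}
#endif
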
1231
1232/*
1233 * unlink helper used both here in inode.c and in the tree logging
1234 * recovery code. It removes a link in a directory with a given name,
1235 * and also drops the back references from the inode to the directory.
1236 */
1237int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1238 struct btrfs_root *root,
1239 struct inode *dir, struct inode *inode,
1240 const char *name, int name_len)
1241{
1242 struct btrfs_path *path;
1243 int ret = 0;
1244 struct extent_buffer *leaf;
1245 struct btrfs_dir_item *di;
1246 struct btrfs_key key;
1247 u64 index;
1248
1249 path = btrfs_alloc_path();
1250 if (!path) {
1251 ret = -ENOMEM;
1252 goto err;
1253 }
1254
1255 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
1256 name, name_len, -1);
1257 if (IS_ERR(di)) {
1258 ret = PTR_ERR(di);
1259 goto err;
1260 }
1261 if (!di) {
1262 ret = -ENOENT;
1263 goto err;
1264 }
1265 leaf = path->nodes[0];
1266 btrfs_dir_item_key_to_cpu(leaf, di, &key);
1267 ret = btrfs_delete_one_dir_name(trans, root, path, di);
1268 if (ret)
1269 goto err;
1270 btrfs_release_path(root, path);
1271
1272 ret = btrfs_del_inode_ref(trans, root, name, name_len,
1273 inode->i_ino,
1274 dir->i_ino, &index);
1275 if (ret) {
1276 printk("failed to delete reference to %.*s, "
1277 "inode %lu parent %lu\n", name_len, name,
1278 inode->i_ino, dir->i_ino);
1279 goto err;
1280 }
1281
1282 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
1283 index, name, name_len, -1);
1284 if (IS_ERR(di)) {
1285 ret = PTR_ERR(di);
1286 goto err;
1287 }
1288 if (!di) {
1289 ret = -ENOENT;
1290 goto err;
1291 }
1292 ret = btrfs_delete_one_dir_name(trans, root, path, di);
1293 btrfs_release_path(root, path);
1294
1295 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
1296 inode, dir->i_ino);
1297 BUG_ON(ret != 0 && ret != -ENOENT);
1298 if (ret != -ENOENT)
1299 BTRFS_I(dir)->log_dirty_trans = trans->transid;
1300
1301 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
1302 dir, index);
1303 BUG_ON(ret);
1304err:
1305 btrfs_free_path(path);
1306 if (ret)
1307 goto out;
1308
1309 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
1310 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1311 btrfs_update_inode(trans, root, dir);
1312 btrfs_drop_nlink(inode);
1313 ret = btrfs_update_inode(trans, root, inode);
1314 dir->i_sb->s_dirt = 1;
1315out:
1316 return ret;
1317}
1318
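/*
 * Note on the name_len * 2 accounting above (added commentary): a btrfs
 * directory stores each name twice, once in a DIR_ITEM and once in a
 * DIR_INDEX entry, so i_size moves by 2 * name_len on link and unlink.
 * For example, unlinking "foo" shrinks the directory's i_size by 6.
 */
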
1319static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
1320{
1321 struct btrfs_root *root;
1322 struct btrfs_trans_handle *trans;
1323 struct inode *inode = dentry->d_inode;
1324 int ret;
1325 unsigned long nr = 0;
1326
1327 root = BTRFS_I(dir)->root;
1328
1329 ret = btrfs_check_free_space(root, 1, 1);
1330 if (ret)
1331 goto fail;
1332
1333 trans = btrfs_start_transaction(root, 1);
1334
1335 btrfs_set_trans_block_group(trans, dir);
1336 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
1337 dentry->d_name.name, dentry->d_name.len);
1338
1339 if (inode->i_nlink == 0)
1340 ret = btrfs_orphan_add(trans, inode);
1341
1342 nr = trans->blocks_used;
1343
1344 btrfs_end_transaction_throttle(trans, root);
1345fail:
1346 btrfs_btree_balance_dirty(root, nr);
1347 return ret;
1348}
1349
1350static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
1351{
1352 struct inode *inode = dentry->d_inode;
1353 int err = 0;
1354 int ret;
1355 struct btrfs_root *root = BTRFS_I(dir)->root;
1356 struct btrfs_trans_handle *trans;
1357 unsigned long nr = 0;
1358
1359 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
1360 return -ENOTEMPTY;
1362
1363 ret = btrfs_check_free_space(root, 1, 1);
1364 if (ret)
1365 goto fail;
1366
1367 trans = btrfs_start_transaction(root, 1);
1368 btrfs_set_trans_block_group(trans, dir);
1369
1370 err = btrfs_orphan_add(trans, inode);
1371 if (err)
1372 goto fail_trans;
1373
1374 /* now the directory is empty */
1375 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
1376 dentry->d_name.name, dentry->d_name.len);
1377 if (!err)
1378 btrfs_i_size_write(inode, 0);
1380
1381fail_trans:
1382 nr = trans->blocks_used;
1383 ret = btrfs_end_transaction_throttle(trans, root);
1384fail:
1385 btrfs_btree_balance_dirty(root, nr);
1386
1387 if (ret && !err)
1388 err = ret;
1389 return err;
1390}
1391
1392/*
1393 * when truncating bytes in a file, it is possible to avoid reading
1394 * the leaves that contain only checksum items. This can be the
1395 * majority of the IO required to delete a large file, but it must
1396 * be done carefully.
1397 *
1398 * The keys in the level just above the leaves are checked to make sure
1399 * the lowest key in a given leaf is a csum key, and starts at an offset
1400 * after the new size.
1401 *
1402 * Then the key for the next leaf is checked to make sure it also has
1403 * a checksum item for the same file. If it does, we know our target leaf
1404 * contains only checksum items, and it can be safely freed without reading
1405 * it.
1406 *
1407 * This is just an optimization targeted at large files. It may do
1408 * nothing. It will return 0 unless things went badly.
1409 */
1410static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
1411 struct btrfs_root *root,
1412 struct btrfs_path *path,
1413 struct inode *inode, u64 new_size)
1414{
1415 struct btrfs_key key;
1416 int ret;
1417 int nritems;
1418 struct btrfs_key found_key;
1419 struct btrfs_key other_key;
1420 struct btrfs_leaf_ref *ref;
1421 u64 leaf_gen;
1422 u64 leaf_start;
1423
1424 path->lowest_level = 1;
1425 key.objectid = inode->i_ino;
1426 key.type = BTRFS_CSUM_ITEM_KEY;
1427 key.offset = new_size;
1428again:
1429 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1430 if (ret < 0)
1431 goto out;
1432
1433 if (path->nodes[1] == NULL) {
1434 ret = 0;
1435 goto out;
1436 }
1437 ret = 0;
1438 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
1439 nritems = btrfs_header_nritems(path->nodes[1]);
1440
1441 if (!nritems)
1442 goto out;
1443
1444 if (path->slots[1] >= nritems)
1445 goto next_node;
1446
1447 /* did we find a key greater than anything we want to delete? */
1448 if (found_key.objectid > inode->i_ino ||
1449 (found_key.objectid == inode->i_ino && found_key.type > key.type))
1450 goto out;
1451
1452 /* we check the next key in the node to make sure the leaf contains
1453 * only checksum items. This comparison doesn't work if our
1454 * leaf is the last one in the node
1455 */
1456 if (path->slots[1] + 1 >= nritems) {
1457next_node:
1458 /* search forward from the last key in the node, this
1459 * will bring us into the next node in the tree
1460 */
1461 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
1462
1463 /* unlikely, but we inc below, so check to be safe */
1464 if (found_key.offset == (u64)-1)
1465 goto out;
1466
1467 /* search_forward needs a path with locks held, do the
1468 * search again for the original key. It is possible
1469 * this will race with a balance and return a path that
1470 * we could modify, but this drop is just an optimization
1471 * and is allowed to miss some leaves.
1472 */
1473 btrfs_release_path(root, path);
1474 found_key.offset++;
1475
1476 /* setup a max key for search_forward */
1477 other_key.offset = (u64)-1;
1478 other_key.type = key.type;
1479 other_key.objectid = key.objectid;
1480
1481 path->keep_locks = 1;
1482 ret = btrfs_search_forward(root, &found_key, &other_key,
1483 path, 0, 0);
1484 path->keep_locks = 0;
1485 if (ret || found_key.objectid != key.objectid ||
1486 found_key.type != key.type) {
1487 ret = 0;
1488 goto out;
1489 }
1490
1491 key.offset = found_key.offset;
1492 btrfs_release_path(root, path);
1493 cond_resched();
1494 goto again;
1495 }
1496
1497 /* we know there's one more slot after us in the tree,
1498 * read that key so we can verify it is also a checksum item
1499 */
1500 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
1501
1502 if (found_key.objectid < inode->i_ino)
1503 goto next_key;
1504
1505 if (found_key.type != key.type || found_key.offset < new_size)
1506 goto next_key;
1507
1508 /*
1509 * if the key for the next leaf isn't a csum key from this objectid,
1510 * we can't be sure there aren't good items inside this leaf.
1511 * Bail out
1512 */
1513 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
1514 goto out;
1515
1516 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
1517 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
1518 /*
1519 * it is safe to delete this leaf, it contains only
1520 * csum items from this inode at an offset >= new_size
1521 */
1522 ret = btrfs_del_leaf(trans, root, path, leaf_start);
1523 BUG_ON(ret);
1524
1525 if (root->ref_cows && leaf_gen < trans->transid) {
1526 ref = btrfs_alloc_leaf_ref(root, 0);
1527 if (ref) {
1528 ref->root_gen = root->root_key.offset;
1529 ref->bytenr = leaf_start;
1530 ref->owner = 0;
1531 ref->generation = leaf_gen;
1532 ref->nritems = 0;
1533
1534 ret = btrfs_add_leaf_ref(root, ref, 0);
1535 WARN_ON(ret);
1536 btrfs_free_leaf_ref(root, ref);
1537 } else {
1538 WARN_ON(1);
1539 }
1540 }
1541next_key:
1542 btrfs_release_path(root, path);
1543
1544 if (other_key.objectid == inode->i_ino &&
1545 other_key.type == key.type && other_key.offset > key.offset) {
1546 key.offset = other_key.offset;
1547 cond_resched();
1548 goto again;
1549 }
1550 ret = 0;
1551out:
1552 /* fixup any changes we've made to the path */
1553 path->lowest_level = 0;
1554 path->keep_locks = 0;
1555 btrfs_release_path(root, path);
1556 return ret;
1557}
1558
1559/*
1560 * this can truncate away extent items, csum items and directory items.
1561 * It starts at a high offset and removes keys until it can't find
1562 * any higher than new_size
1563 *
1564 * csum items that cross the new i_size are truncated to the new size
1565 * as well.
1566 *
1567 * min_type is the minimum key type to truncate down to. If set to 0, this
1568 * will kill all the items on this inode, including the INODE_ITEM_KEY.
1569 */
1570noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1571 struct btrfs_root *root,
1572 struct inode *inode,
1573 u64 new_size, u32 min_type)
1574{
1575 int ret;
1576 struct btrfs_path *path;
1577 struct btrfs_key key;
1578 struct btrfs_key found_key;
1579 u32 found_type;
1580 struct extent_buffer *leaf;
1581 struct btrfs_file_extent_item *fi;
1582 u64 extent_start = 0;
1583 u64 extent_num_bytes = 0;
1584 u64 item_end = 0;
1585 u64 root_gen = 0;
1586 u64 root_owner = 0;
1587 int found_extent;
1588 int del_item;
1589 int pending_del_nr = 0;
1590 int pending_del_slot = 0;
1591 int extent_type = -1;
1592 u64 mask = root->sectorsize - 1;
1593
1594 if (root->ref_cows)
1595 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
1596 path = btrfs_alloc_path();
1597 path->reada = -1;
1598 BUG_ON(!path);
1599
1600 /* FIXME, add redo link to tree so we don't leak on crash */
1601 key.objectid = inode->i_ino;
1602 key.offset = (u64)-1;
1603 key.type = (u8)-1;
1604
1605 btrfs_init_path(path);
1606
1607 ret = drop_csum_leaves(trans, root, path, inode, new_size);
1608 BUG_ON(ret);
1609
1610search_again:
1611 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1612 if (ret < 0)
1613 goto error;
1615 if (ret > 0) {
1616 /* there are no items in the tree for us to truncate, we're
1617 * done
1618 */
1619 if (path->slots[0] == 0) {
1620 ret = 0;
1621 goto error;
1622 }
1623 path->slots[0]--;
1624 }
1625
1626 while (1) {
1627 fi = NULL;
1628 leaf = path->nodes[0];
1629 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1630 found_type = btrfs_key_type(&found_key);
1631
1632 if (found_key.objectid != inode->i_ino)
1633 break;
1634
1635 if (found_type < min_type)
1636 break;
1637
1638 item_end = found_key.offset;
1639 if (found_type == BTRFS_EXTENT_DATA_KEY) {
1640 fi = btrfs_item_ptr(leaf, path->slots[0],
1641 struct btrfs_file_extent_item);
1642 extent_type = btrfs_file_extent_type(leaf, fi);
1643 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
1644 item_end +=
1645 btrfs_file_extent_num_bytes(leaf, fi);
1646 } else {
1647 struct btrfs_item *item = btrfs_item_nr(leaf,
1648 path->slots[0]);
1649 item_end += btrfs_file_extent_inline_len(leaf,
1650 item);
1651 }
1652 item_end--;
1653 }
1654 if (found_type == BTRFS_CSUM_ITEM_KEY) {
1655 ret = btrfs_csum_truncate(trans, root, path,
1656 new_size);
1657 BUG_ON(ret);
1658 }
1659 if (item_end < new_size) {
1660 if (found_type == BTRFS_DIR_ITEM_KEY) {
1661 found_type = BTRFS_INODE_ITEM_KEY;
1662 } else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
1663 found_type = BTRFS_CSUM_ITEM_KEY;
1664 } else if (found_type == BTRFS_EXTENT_DATA_KEY) {
1665 found_type = BTRFS_XATTR_ITEM_KEY;
1666 } else if (found_type == BTRFS_XATTR_ITEM_KEY) {
1667 found_type = BTRFS_INODE_REF_KEY;
1668 } else if (found_type) {
1669 found_type--;
1670 } else {
1671 break;
1672 }
1673 btrfs_set_key_type(&key, found_type);
1674 goto next;
1675 }
1676 if (found_key.offset >= new_size)
1677 del_item = 1;
1678 else
1679 del_item = 0;
1680 found_extent = 0;
1681
1682 /* FIXME, shrink the extent if the ref count is only 1 */
1683 if (found_type != BTRFS_EXTENT_DATA_KEY)
1684 goto delete;
1685
1686 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
1687 u64 num_dec;
1688 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
1689 if (!del_item) {
1690 u64 orig_num_bytes =
1691 btrfs_file_extent_num_bytes(leaf, fi);
1692 extent_num_bytes = new_size -
1693 found_key.offset + root->sectorsize - 1;
1694 extent_num_bytes = extent_num_bytes &
1695 ~((u64)root->sectorsize - 1);
1696 btrfs_set_file_extent_num_bytes(leaf, fi,
1697 extent_num_bytes);
1698 num_dec = (orig_num_bytes -
1699 extent_num_bytes);
1700 if (root->ref_cows && extent_start != 0)
1701 inode_sub_bytes(inode, num_dec);
1702 btrfs_mark_buffer_dirty(leaf);
1703 } else {
1704 extent_num_bytes =
1705 btrfs_file_extent_disk_num_bytes(leaf,
1706 fi);
1707 /* FIXME blocksize != 4096 */
1708 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
1709 if (extent_start != 0) {
1710 found_extent = 1;
1711 if (root->ref_cows)
1712 inode_sub_bytes(inode, num_dec);
1713 }
1714 root_gen = btrfs_header_generation(leaf);
1715 root_owner = btrfs_header_owner(leaf);
1716 }
1717 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1718 if (!del_item) {
1719 u32 size = new_size - found_key.offset;
1720
1721 if (root->ref_cows) {
1722 inode_sub_bytes(inode, item_end + 1 -
1723 new_size);
1724 }
1725 size =
1726 btrfs_file_extent_calc_inline_size(size);
1727 ret = btrfs_truncate_item(trans, root, path,
1728 size, 1);
1729 BUG_ON(ret);
1730 } else if (root->ref_cows) {
1731 inode_sub_bytes(inode, item_end + 1 -
1732 found_key.offset);
1733 }
1734 }
1735delete:
1736 if (del_item) {
1737 if (!pending_del_nr) {
1738 /* no pending yet, add ourselves */
1739 pending_del_slot = path->slots[0];
1740 pending_del_nr = 1;
1741 } else if (pending_del_nr &&
1742 path->slots[0] + 1 == pending_del_slot) {
1743 /* hop on the pending chunk */
1744 pending_del_nr++;
1745 pending_del_slot = path->slots[0];
1746 } else {
1747 printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
1748 }
1749 } else {
1750 break;
1751 }
1752 if (found_extent) {
1753 ret = btrfs_free_extent(trans, root, extent_start,
1754 extent_num_bytes,
1755 leaf->start, root_owner,
1756 root_gen, inode->i_ino, 0);
1757 BUG_ON(ret);
1758 }
1759next:
1760 if (path->slots[0] == 0) {
1761 if (pending_del_nr)
1762 goto del_pending;
1763 btrfs_release_path(root, path);
1764 goto search_again;
1765 }
1766
1767 path->slots[0]--;
1768 if (pending_del_nr &&
1769 path->slots[0] + 1 != pending_del_slot) {
1770 struct btrfs_key debug;
1771del_pending:
1772 btrfs_item_key_to_cpu(path->nodes[0], &debug,
1773 pending_del_slot);
1774 ret = btrfs_del_items(trans, root, path,
1775 pending_del_slot,
1776 pending_del_nr);
1777 BUG_ON(ret);
1778 pending_del_nr = 0;
1779 btrfs_release_path(root, path);
1780 goto search_again;
1781 }
1782 }
1783 ret = 0;
1784error:
1785 if (pending_del_nr) {
1786 ret = btrfs_del_items(trans, root, path, pending_del_slot,
1787 pending_del_nr);
1788 }
1789 btrfs_free_path(path);
1790 inode->i_sb->s_dirt = 1;
1791 return ret;
1792}
1793
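/*
 * Illustrative sketch (not in the original source): the sector rounding
 * used in the truncate paths above and in btrfs_setattr() below. The
 * mask trick assumes root->sectorsize is a power of two; the helper
 * name is an assumption.
 */
#if 0
static inline u64 example_round_up_to_sector(struct btrfs_root *root, u64 n)
{
	u64 mask = root->sectorsize - 1;

	/* e.g. 4097 rounds up to 8192 with 4K sectors */
	return (n + mask) & ~mask;
}
#endif
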
1794/*
1795 * taken from block_truncate_page, but does cow as it zeros out
1796 * any bytes left in the last page in the file.
1797 */
1798static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
1799{
1800 struct inode *inode = mapping->host;
1801 struct btrfs_root *root = BTRFS_I(inode)->root;
1802 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1803 struct btrfs_ordered_extent *ordered;
1804 char *kaddr;
1805 u32 blocksize = root->sectorsize;
1806 pgoff_t index = from >> PAGE_CACHE_SHIFT;
1807 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1808 struct page *page;
1809 int ret = 0;
1810 u64 page_start;
1811 u64 page_end;
1812
1813 if ((offset & (blocksize - 1)) == 0)
1814 goto out;
1815
1816 ret = -ENOMEM;
1817again:
1818 page = grab_cache_page(mapping, index);
1819 if (!page)
1820 goto out;
1821
1822 page_start = page_offset(page);
1823 page_end = page_start + PAGE_CACHE_SIZE - 1;
1824
1825 if (!PageUptodate(page)) {
1826 ret = btrfs_readpage(NULL, page);
1827 lock_page(page);
1828 if (page->mapping != mapping) {
1829 unlock_page(page);
1830 page_cache_release(page);
1831 goto again;
1832 }
1833 if (!PageUptodate(page)) {
1834 ret = -EIO;
1835 goto out_unlock;
1836 }
1837 }
1838 wait_on_page_writeback(page);
1839
1840 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
1841 set_page_extent_mapped(page);
1842
1843 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1844 if (ordered) {
1845 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1846 unlock_page(page);
1847 page_cache_release(page);
1848 btrfs_start_ordered_extent(inode, ordered, 1);
1849 btrfs_put_ordered_extent(ordered);
1850 goto again;
1851 }
1852
1853 btrfs_set_extent_delalloc(inode, page_start, page_end);
1854 ret = 0;
1855 if (offset != PAGE_CACHE_SIZE) {
1856 kaddr = kmap(page);
1857 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1858 flush_dcache_page(page);
1859 kunmap(page);
1860 }
1861 ClearPageChecked(page);
1862 set_page_dirty(page);
1863 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1864
1865out_unlock:
1866 unlock_page(page);
1867 page_cache_release(page);
1868out:
1869 return ret;
1870}
1871
1872static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1873{
1874 struct inode *inode = dentry->d_inode;
1875 int err;
1876
1877 err = inode_change_ok(inode, attr);
1878 if (err)
1879 return err;
1880
1881 if (S_ISREG(inode->i_mode) &&
1882 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
1883 struct btrfs_trans_handle *trans;
1884 struct btrfs_root *root = BTRFS_I(inode)->root;
1885 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1886
1887 u64 mask = root->sectorsize - 1;
1888 u64 hole_start = (inode->i_size + mask) & ~mask;
1889 u64 block_end = (attr->ia_size + mask) & ~mask;
1890 u64 hole_size;
1891 u64 alloc_hint = 0;
1892
1893 if (attr->ia_size <= hole_start)
1894 goto out;
1895
1896 err = btrfs_check_free_space(root, 1, 0);
1897 if (err)
1898 goto fail;
1899
1900 btrfs_truncate_page(inode->i_mapping, inode->i_size);
1901
1902 hole_size = block_end - hole_start;
1903 while (1) {
1904 struct btrfs_ordered_extent *ordered;
1905 btrfs_wait_ordered_range(inode, hole_start, hole_size);
1906
1907 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
1908 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
1909 if (ordered) {
1910 unlock_extent(io_tree, hole_start,
1911 block_end - 1, GFP_NOFS);
1912 btrfs_put_ordered_extent(ordered);
1913 } else {
1914 break;
1915 }
1916 }
1917
1918 trans = btrfs_start_transaction(root, 1);
1919 btrfs_set_trans_block_group(trans, inode);
1920 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1921 err = btrfs_drop_extents(trans, root, inode,
1922 hole_start, block_end, hole_start,
1923 &alloc_hint);
1924
1925 if (alloc_hint != EXTENT_MAP_INLINE) {
1926 err = btrfs_insert_file_extent(trans, root,
1927 inode->i_ino,
1928 hole_start, 0, 0,
1929 hole_size, 0);
1930 btrfs_drop_extent_cache(inode, hole_start,
1931 (u64)-1, 0);
1932 btrfs_check_file(root, inode);
1933 }
1934 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1935 btrfs_end_transaction(trans, root);
1936 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
1937 if (err)
1938 return err;
1939 }
1940out:
1941 err = inode_setattr(inode, attr);
1942
1943 if (!err && ((attr->ia_valid & ATTR_MODE)))
1944 err = btrfs_acl_chmod(inode);
1945fail:
1946 return err;
1947}
1948
1949void btrfs_delete_inode(struct inode *inode)
1950{
1951 struct btrfs_trans_handle *trans;
1952 struct btrfs_root *root = BTRFS_I(inode)->root;
1953 unsigned long nr;
1954 int ret;
1955
1956 truncate_inode_pages(&inode->i_data, 0);
1957 if (is_bad_inode(inode)) {
1958 btrfs_orphan_del(NULL, inode);
1959 goto no_delete;
1960 }
1961 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1962
1963 btrfs_i_size_write(inode, 0);
1964 trans = btrfs_start_transaction(root, 1);
1965
1966 btrfs_set_trans_block_group(trans, inode);
1967 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
1968 if (ret) {
1969 btrfs_orphan_del(NULL, inode);
1970 goto no_delete_lock;
1971 }
1972
1973 btrfs_orphan_del(trans, inode);
1974
1975 nr = trans->blocks_used;
1976 clear_inode(inode);
1977
1978 btrfs_end_transaction(trans, root);
1979 btrfs_btree_balance_dirty(root, nr);
1980 return;
1981
1982no_delete_lock:
1983 nr = trans->blocks_used;
1984 btrfs_end_transaction(trans, root);
1985 btrfs_btree_balance_dirty(root, nr);
1986no_delete:
1987 clear_inode(inode);
1988}
1989
1990/*
1991 * this returns the key found in the dir entry in the location pointer.
1992 * If no dir entries were found, location->objectid is 0.
1993 */
1994static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
1995 struct btrfs_key *location)
1996{
1997 const char *name = dentry->d_name.name;
1998 int namelen = dentry->d_name.len;
1999 struct btrfs_dir_item *di;
2000 struct btrfs_path *path;
2001 struct btrfs_root *root = BTRFS_I(dir)->root;
2002 int ret = 0;
2003
2004 path = btrfs_alloc_path();
2005 BUG_ON(!path);
2006
2007 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2008 namelen, 0);
2009 if (IS_ERR(di))
2010 ret = PTR_ERR(di);
2011 if (!di || IS_ERR(di))
2012 goto out_err;
2014 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2015out:
2016 btrfs_free_path(path);
2017 return ret;
2018out_err:
2019 location->objectid = 0;
2020 goto out;
2021}
2022
2023/*
2024 * when we hit a tree root in a directory, the btrfs part of the inode
2025 * needs to be changed to reflect the root directory of the tree root. This
2026 * is kind of like crossing a mount point.
2027 */
2028static int fixup_tree_root_location(struct btrfs_root *root,
2029 struct btrfs_key *location,
2030 struct btrfs_root **sub_root,
2031 struct dentry *dentry)
2032{
2033 struct btrfs_root_item *ri;
2034
2035 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2036 return 0;
2037 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2038 return 0;
2039
2040 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2041 dentry->d_name.name,
2042 dentry->d_name.len);
2043 if (IS_ERR(*sub_root))
2044 return PTR_ERR(*sub_root);
2045
2046 ri = &(*sub_root)->root_item;
2047 location->objectid = btrfs_root_dirid(ri);
2048 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2049 location->offset = 0;
2050
2051 return 0;
2052}
2053
2054static noinline void init_btrfs_i(struct inode *inode)
2055{
2056 struct btrfs_inode *bi = BTRFS_I(inode);
2057
2058 bi->i_acl = NULL;
2059 bi->i_default_acl = NULL;
2060
2061 bi->generation = 0;
2062 bi->last_trans = 0;
2063 bi->logged_trans = 0;
2064 bi->delalloc_bytes = 0;
2065 bi->disk_i_size = 0;
2066 bi->flags = 0;
2067 bi->index_cnt = (u64)-1;
2068 bi->log_dirty_trans = 0;
2069 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2070 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2071 inode->i_mapping, GFP_NOFS);
2072 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2073 inode->i_mapping, GFP_NOFS);
2074 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2075 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2076 mutex_init(&BTRFS_I(inode)->csum_mutex);
2077 mutex_init(&BTRFS_I(inode)->extent_mutex);
2078 mutex_init(&BTRFS_I(inode)->log_mutex);
2079}
2080
2081static int btrfs_init_locked_inode(struct inode *inode, void *p)
2082{
2083 struct btrfs_iget_args *args = p;
2084 inode->i_ino = args->ino;
2085 init_btrfs_i(inode);
2086 BTRFS_I(inode)->root = args->root;
2087 return 0;
2088}
2089
2090static int btrfs_find_actor(struct inode *inode, void *opaque)
2091{
2092 struct btrfs_iget_args *args = opaque;
2093 return (args->ino == inode->i_ino &&
2094 args->root == BTRFS_I(inode)->root);
2095}
2096
2097struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2098 struct btrfs_root *root, int wait)
2099{
2100 struct inode *inode;
2101 struct btrfs_iget_args args;
2102 args.ino = objectid;
2103 args.root = root;
2104
2105 if (wait) {
2106 inode = ilookup5(s, objectid, btrfs_find_actor,
2107 (void *)&args);
2108 } else {
2109 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
2110 (void *)&args);
2111 }
2112 return inode;
2113}
2114
2115struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2116 struct btrfs_root *root)
2117{
2118 struct inode *inode;
2119 struct btrfs_iget_args args;
2120 args.ino = objectid;
2121 args.root = root;
2122
2123 inode = iget5_locked(s, objectid, btrfs_find_actor,
2124 btrfs_init_locked_inode,
2125 (void *)&args);
2126 return inode;
2127}
2128
2129/* Get an inode object given its location and corresponding root.
2130 * Sets *is_new to 1 if the inode had to be read from disk
2131 */
2132struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2133 struct btrfs_root *root, int *is_new)
2134{
2135 struct inode *inode;
2136
2137 inode = btrfs_iget_locked(s, location->objectid, root);
2138 if (!inode)
2139 return ERR_PTR(-EACCES);
2140
2141 if (inode->i_state & I_NEW) {
2142 BTRFS_I(inode)->root = root;
2143 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
2144 btrfs_read_locked_inode(inode);
2145 unlock_new_inode(inode);
2146 if (is_new)
2147 *is_new = 1;
2148 } else {
2149 if (is_new)
2150 *is_new = 0;
2151 }
2152
2153 return inode;
2154}
2155
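/*
 * Illustrative sketch (not in the original source): looking up an inode
 * by objectid with btrfs_iget() above. The helper name and the
 * caller-chosen objectid are assumptions for the example.
 */
#if 0
static struct inode *example_iget_by_ino(struct super_block *sb,
					 struct btrfs_root *root, u64 ino)
{
	struct btrfs_key location;
	int is_new;

	location.objectid = ino;
	btrfs_set_key_type(&location, BTRFS_INODE_ITEM_KEY);
	location.offset = 0;
	return btrfs_iget(sb, &location, root, &is_new);
}
#endif
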
2156static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
2157 struct nameidata *nd)
2158{
2159 struct inode *inode;
2160 struct btrfs_inode *bi = BTRFS_I(dir);
2161 struct btrfs_root *root = bi->root;
2162 struct btrfs_root *sub_root = root;
2163 struct btrfs_key location;
2164 int ret, new, do_orphan = 0;
2165
2166 if (dentry->d_name.len > BTRFS_NAME_LEN)
2167 return ERR_PTR(-ENAMETOOLONG);
2168
2169 ret = btrfs_inode_by_name(dir, dentry, &location);
2170
2171 if (ret < 0)
2172 return ERR_PTR(ret);
2173
2174 inode = NULL;
2175 if (location.objectid) {
2176 ret = fixup_tree_root_location(root, &location, &sub_root,
2177 dentry);
2178 if (ret < 0)
2179 return ERR_PTR(ret);
2180 if (ret > 0)
2181 return ERR_PTR(-ENOENT);
2182 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
2183 if (IS_ERR(inode))
2184 return ERR_CAST(inode);
2185
2186 /* the inode and the parent dir are in two different roots */
2187 if (new && root != sub_root) {
2188 igrab(inode);
2189 sub_root->inode = inode;
2190 do_orphan = 1;
2191 }
2192 }
2193
2194 if (unlikely(do_orphan))
2195 btrfs_orphan_cleanup(sub_root);
2196
2197 return d_splice_alias(inode, dentry);
2198}
2199
2200static unsigned char btrfs_filetype_table[] = {
2201 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
2202};
2203
2204static int btrfs_real_readdir(struct file *filp, void *dirent,
2205 filldir_t filldir)
2206{
2207 struct inode *inode = filp->f_dentry->d_inode;
2208 struct btrfs_root *root = BTRFS_I(inode)->root;
2209 struct btrfs_item *item;
2210 struct btrfs_dir_item *di;
2211 struct btrfs_key key;
2212 struct btrfs_key found_key;
2213 struct btrfs_path *path;
2214 int ret;
2215 u32 nritems;
2216 struct extent_buffer *leaf;
2217 int slot;
2218 int advance;
2219 unsigned char d_type;
2220 int over = 0;
2221 u32 di_cur;
2222 u32 di_total;
2223 u32 di_len;
2224 int key_type = BTRFS_DIR_INDEX_KEY;
2225 char tmp_name[32];
2226 char *name_ptr;
2227 int name_len;
2228
2229 /* FIXME, use a real flag for deciding about the key type */
2230 if (root->fs_info->tree_root == root)
2231 key_type = BTRFS_DIR_ITEM_KEY;
2232
2233 /* special case for "." */
2234 if (filp->f_pos == 0) {
2235 over = filldir(dirent, ".", 1,
2236 1, inode->i_ino,
2237 DT_DIR);
2238 if (over)
2239 return 0;
2240 filp->f_pos = 1;
2241 }
2242 /* special case for .., just use the back ref */
2243 if (filp->f_pos == 1) {
2244 u64 pino = parent_ino(filp->f_path.dentry);
2245 over = filldir(dirent, "..", 2,
2246 2, pino, DT_DIR);
2247 if (over)
2248 return 0;
2249 filp->f_pos = 2;
2250 }
2251
2252 path = btrfs_alloc_path();
2253 path->reada = 2;
2254
2255 btrfs_set_key_type(&key, key_type);
2256 key.offset = filp->f_pos;
2257 key.objectid = inode->i_ino;
2258
2259 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2260 if (ret < 0)
2261 goto err;
2262 advance = 0;
2263
2264 while (1) {
2265 leaf = path->nodes[0];
2266 nritems = btrfs_header_nritems(leaf);
2267 slot = path->slots[0];
2268 if (advance || slot >= nritems) {
2269 if (slot >= nritems - 1) {
2270 ret = btrfs_next_leaf(root, path);
2271 if (ret)
2272 break;
2273 leaf = path->nodes[0];
2274 nritems = btrfs_header_nritems(leaf);
2275 slot = path->slots[0];
2276 } else {
2277 slot++;
2278 path->slots[0]++;
2279 }
2280 }
2281 advance = 1;
2282 item = btrfs_item_nr(leaf, slot);
2283 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2284
2285 if (found_key.objectid != key.objectid)
2286 break;
2287 if (btrfs_key_type(&found_key) != key_type)
2288 break;
2289 if (found_key.offset < filp->f_pos)
2290 continue;
2291
2292 filp->f_pos = found_key.offset;
2293
2294 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
2295 di_cur = 0;
2296 di_total = btrfs_item_size(leaf, item);
2297
2298 while (di_cur < di_total) {
2299 struct btrfs_key location;
2300
2301 name_len = btrfs_dir_name_len(leaf, di);
2302 if (name_len <= sizeof(tmp_name)) {
2303 name_ptr = tmp_name;
2304 } else {
2305 name_ptr = kmalloc(name_len, GFP_NOFS);
2306 if (!name_ptr) {
2307 ret = -ENOMEM;
2308 goto err;
2309 }
2310 }
2311 read_extent_buffer(leaf, name_ptr,
2312 (unsigned long)(di + 1), name_len);
2313
2314 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
2315 btrfs_dir_item_key_to_cpu(leaf, di, &location);
2316 over = filldir(dirent, name_ptr, name_len,
2317 found_key.offset, location.objectid,
2318 d_type);
2319
2320 if (name_ptr != tmp_name)
2321 kfree(name_ptr);
2322
2323 if (over)
2324 goto nopos;
2325
2326 di_len = btrfs_dir_name_len(leaf, di) +
2327 btrfs_dir_data_len(leaf, di) + sizeof(*di);
2328 di_cur += di_len;
2329 di = (struct btrfs_dir_item *)((char *)di + di_len);
2330 }
2331 }
2332
2333 /* Reached end of directory/root. Bump pos past the last item. */
2334 if (key_type == BTRFS_DIR_INDEX_KEY)
2335 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
2336 else
2337 filp->f_pos++;
2338nopos:
2339 ret = 0;
2340err:
2341 btrfs_free_path(path);
2342 return ret;
2343}
2344
2345int btrfs_write_inode(struct inode *inode, int wait)
2346{
2347 struct btrfs_root *root = BTRFS_I(inode)->root;
2348 struct btrfs_trans_handle *trans;
2349 int ret = 0;
2350
2351 if (root->fs_info->closing > 1)
2352 return 0;
2353
2354 if (wait) {
2355 trans = btrfs_join_transaction(root, 1);
2356 btrfs_set_trans_block_group(trans, inode);
2357 ret = btrfs_commit_transaction(trans, root);
2358 }
2359 return ret;
2360}
2361
2362/*
2363 * This is somewhat expensive, updating the tree every time the
2364 * inode changes. But it is most likely to find the inode in cache.
2365 * FIXME: needs more benchmarking; performance is the only reason
2366 * to keep or drop this code.
2367 */
2368void btrfs_dirty_inode(struct inode *inode)
2369{
2370 struct btrfs_root *root = BTRFS_I(inode)->root;
2371 struct btrfs_trans_handle *trans;
2372
2373 trans = btrfs_join_transaction(root, 1);
2374 btrfs_set_trans_block_group(trans, inode);
2375 btrfs_update_inode(trans, root, inode);
2376 btrfs_end_transaction(trans, root);
2377}
2378
2379/*
2380 * find the highest existing sequence number in a directory
2381 * and then set the in-memory index_cnt variable to reflect
2382 * free sequence numbers
2383 */
2384static int btrfs_set_inode_index_count(struct inode *inode)
2385{
2386 struct btrfs_root *root = BTRFS_I(inode)->root;
2387 struct btrfs_key key, found_key;
2388 struct btrfs_path *path;
2389 struct extent_buffer *leaf;
2390 int ret;
2391
2392 key.objectid = inode->i_ino;
2393 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
2394 key.offset = (u64)-1;
2395
2396 path = btrfs_alloc_path();
2397 if (!path)
2398 return -ENOMEM;
2399
2400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2401 if (ret < 0)
2402 goto out;
2403 /* FIXME: we should be able to handle this */
2404 if (ret == 0)
2405 goto out;
2406 ret = 0;
2407
2408 /*
2409 * MAGIC NUMBER EXPLANATION:
2410 * we search a directory based on f_pos, and '.' and '..' occupy
2411 * f_pos 0 and 1 respectively, so every other entry has to start
2412 * at index 2
2413 */
2414 if (path->slots[0] == 0) {
2415 BTRFS_I(inode)->index_cnt = 2;
2416 goto out;
2417 }
2418
2419 path->slots[0]--;
2420
2421 leaf = path->nodes[0];
2422 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2423
2424 if (found_key.objectid != inode->i_ino ||
2425 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
2426 BTRFS_I(inode)->index_cnt = 2;
2427 goto out;
2428 }
2429
2430 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
2431out:
2432 btrfs_free_path(path);
2433 return ret;
2434}
2435
2436/*
2437 * helper to find a free sequence number in a given directory. The current
2438 * code is very simple; later versions will do smarter things in the btree
2439 */
2440static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
2441 u64 *index)
2442{
2443 int ret = 0;
2444
2445 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
2446 ret = btrfs_set_inode_index_count(dir);
2447 if (ret)
2448 return ret;
2450 }
2451
2452 *index = BTRFS_I(dir)->index_cnt;
2453 BTRFS_I(dir)->index_cnt++;
2454
2455 return ret;
2456}
2457
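/*
 * Illustrative sketch (not in the original source): every new directory
 * entry consumes one index from the parent via btrfs_set_inode_index()
 * above; the value is then used as the DIR_INDEX key offset by
 * btrfs_add_link() below. The helper name is an assumption.
 */
#if 0
static int example_alloc_dir_index(struct inode *dir, struct inode *inode,
				   u64 *index)
{
	int ret;

	ret = btrfs_set_inode_index(dir, inode, index);
	if (ret)
		return ret;
	/* *index now holds the next free f_pos-style sequence number */
	return 0;
}
#endif
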
2458static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
2459 struct btrfs_root *root,
2460 struct inode *dir,
2461 const char *name, int name_len,
2462 u64 ref_objectid,
2463 u64 objectid,
2464 struct btrfs_block_group_cache *group,
2465 int mode, u64 *index)
2466{
2467 struct inode *inode;
2468 struct btrfs_inode_item *inode_item;
2469 struct btrfs_block_group_cache *new_inode_group;
2470 struct btrfs_key *location;
2471 struct btrfs_path *path;
2472 struct btrfs_inode_ref *ref;
2473 struct btrfs_key key[2];
2474 u32 sizes[2];
2475 unsigned long ptr;
2476 int ret;
2477 int owner;
2478
2479 path = btrfs_alloc_path();
2480 BUG_ON(!path);
2481
2482 inode = new_inode(root->fs_info->sb);
2483 if (!inode)
2484 return ERR_PTR(-ENOMEM);
2485
2486 if (dir) {
2487 ret = btrfs_set_inode_index(dir, inode, index);
2488 if (ret)
2489 return ERR_PTR(ret);
2490 }
2491 /*
2492 * index_cnt is ignored for everything but a dir,
2493 * btrfs_set_inode_index_count has an explanation for the magic
2494 * number
2495 */
2496 init_btrfs_i(inode);
2497 BTRFS_I(inode)->index_cnt = 2;
2498 BTRFS_I(inode)->root = root;
2499 BTRFS_I(inode)->generation = trans->transid;
2500
2501 if (mode & S_IFDIR)
2502 owner = 0;
2503 else
2504 owner = 1;
2505 new_inode_group = btrfs_find_block_group(root, group, 0,
2506 BTRFS_BLOCK_GROUP_METADATA, owner);
2507 if (!new_inode_group) {
2508 printk("find_block group failed\n");
2509 new_inode_group = group;
2510 }
2511 BTRFS_I(inode)->block_group = new_inode_group;
2512
2513 key[0].objectid = objectid;
2514 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
2515 key[0].offset = 0;
2516
2517 key[1].objectid = objectid;
2518 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
2519 key[1].offset = ref_objectid;
2520
2521 sizes[0] = sizeof(struct btrfs_inode_item);
2522 sizes[1] = name_len + sizeof(*ref);
2523
2524 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
2525 if (ret != 0)
2526 goto fail;
2527
2528 if (objectid > root->highest_inode)
2529 root->highest_inode = objectid;
2530
2531 inode->i_uid = current->fsuid;
2532 inode->i_gid = current->fsgid;
2533 inode->i_mode = mode;
2534 inode->i_ino = objectid;
2535 inode_set_bytes(inode, 0);
2536 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2537 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2538 struct btrfs_inode_item);
2539 fill_inode_item(trans, path->nodes[0], inode_item, inode);
2540
2541 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2542 struct btrfs_inode_ref);
2543 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
2544 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
2545 ptr = (unsigned long)(ref + 1);
2546 write_extent_buffer(path->nodes[0], name, ptr, name_len);
2547
2548 btrfs_mark_buffer_dirty(path->nodes[0]);
2549 btrfs_free_path(path);
2550
2551 location = &BTRFS_I(inode)->location;
2552 location->objectid = objectid;
2553 location->offset = 0;
2554 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2555
2556 insert_inode_hash(inode);
2557 return inode;
2558fail:
2559 if (dir)
2560 BTRFS_I(dir)->index_cnt--;
2561 btrfs_free_path(path);
2562 return ERR_PTR(ret);
2563}
2564
2565static inline u8 btrfs_inode_type(struct inode *inode)
2566{
2567 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
2568}
2569
2570/*
2571 * utility function to add 'inode' into 'parent_inode' with
2572 * a given name and a given sequence number.
2573 * if 'add_backref' is true, also insert a backref from the
2574 * inode to the parent directory.
2575 */
2576int btrfs_add_link(struct btrfs_trans_handle *trans,
2577 struct inode *parent_inode, struct inode *inode,
2578 const char *name, int name_len, int add_backref, u64 index)
2579{
2580 int ret;
2581 struct btrfs_key key;
2582 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
2583
2584 key.objectid = inode->i_ino;
2585 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
2586 key.offset = 0;
2587
2588 ret = btrfs_insert_dir_item(trans, root, name, name_len,
2589 parent_inode->i_ino,
2590 &key, btrfs_inode_type(inode),
2591 index);
2592 if (ret == 0) {
2593 if (add_backref) {
2594 ret = btrfs_insert_inode_ref(trans, root,
2595 name, name_len,
2596 inode->i_ino,
2597 parent_inode->i_ino,
2598 index);
2599 }
2600 btrfs_i_size_write(parent_inode, parent_inode->i_size +
2601 name_len * 2);
2602 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
2603 ret = btrfs_update_inode(trans, root, parent_inode);
2604 }
2605 return ret;
2606}
2607
2608static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
2609 struct dentry *dentry, struct inode *inode,
2610 int backref, u64 index)
2611{
2612 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
2613 inode, dentry->d_name.name,
2614 dentry->d_name.len, backref, index);
2615 if (!err) {
2616 d_instantiate(dentry, inode);
2617 return 0;
2618 }
2619 if (err > 0)
2620 err = -EEXIST;
2621 return err;
2622}
2623
2624static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
2625 int mode, dev_t rdev)
2626{
2627 struct btrfs_trans_handle *trans;
2628 struct btrfs_root *root = BTRFS_I(dir)->root;
2629 struct inode *inode = NULL;
2630 int err;
2631 int drop_inode = 0;
2632 u64 objectid;
2633 unsigned long nr = 0;
2634 u64 index = 0;
2635
2636 if (!new_valid_dev(rdev))
2637 return -EINVAL;
2638
2639 err = btrfs_check_free_space(root, 1, 0);
2640 if (err)
2641 goto fail;
2642
2643 trans = btrfs_start_transaction(root, 1);
2644 btrfs_set_trans_block_group(trans, dir);
2645
2646 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2647 if (err) {
2648 err = -ENOSPC;
2649 goto out_unlock;
2650 }
2651
2652 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2653 dentry->d_name.len,
2654 dentry->d_parent->d_inode->i_ino, objectid,
2655 BTRFS_I(dir)->block_group, mode, &index);
2656 err = PTR_ERR(inode);
2657 if (IS_ERR(inode))
2658 goto out_unlock;
2659
2660 err = btrfs_init_acl(inode, dir);
2661 if (err) {
2662 drop_inode = 1;
2663 goto out_unlock;
2664 }
2665
2666 btrfs_set_trans_block_group(trans, inode);
2667 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
2668 if (err)
2669 drop_inode = 1;
2670 else {
2671 inode->i_op = &btrfs_special_inode_operations;
2672 init_special_inode(inode, inode->i_mode, rdev);
2673 btrfs_update_inode(trans, root, inode);
2674 }
2675 dir->i_sb->s_dirt = 1;
2676 btrfs_update_inode_block_group(trans, inode);
2677 btrfs_update_inode_block_group(trans, dir);
2678out_unlock:
2679 nr = trans->blocks_used;
2680 btrfs_end_transaction_throttle(trans, root);
2681fail:
2682 if (drop_inode) {
2683 inode_dec_link_count(inode);
2684 iput(inode);
2685 }
2686 btrfs_btree_balance_dirty(root, nr);
2687 return err;
2688}
2689
2690static int btrfs_create(struct inode *dir, struct dentry *dentry,
2691 int mode, struct nameidata *nd)
2692{
2693 struct btrfs_trans_handle *trans;
2694 struct btrfs_root *root = BTRFS_I(dir)->root;
2695 struct inode *inode = NULL;
2696 int err;
2697 int drop_inode = 0;
2698 unsigned long nr = 0;
2699 u64 objectid;
2700 u64 index = 0;
2701
2702 err = btrfs_check_free_space(root, 1, 0);
2703 if (err)
2704 goto fail;
2705 trans = btrfs_start_transaction(root, 1);
2706 btrfs_set_trans_block_group(trans, dir);
2707
2708 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2709 if (err) {
2710 err = -ENOSPC;
2711 goto out_unlock;
2712 }
2713
2714 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2715 dentry->d_name.len,
2716 dentry->d_parent->d_inode->i_ino,
2717 objectid, BTRFS_I(dir)->block_group, mode,
2718 &index);
2719 err = PTR_ERR(inode);
2720 if (IS_ERR(inode))
2721 goto out_unlock;
2722
2723 err = btrfs_init_acl(inode, dir);
2724 if (err) {
2725 drop_inode = 1;
2726 goto out_unlock;
2727 }
2728
2729 btrfs_set_trans_block_group(trans, inode);
2730 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
2731 if (err)
2732 drop_inode = 1;
2733 else {
2734 inode->i_mapping->a_ops = &btrfs_aops;
2735 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2736 inode->i_fop = &btrfs_file_operations;
2737 inode->i_op = &btrfs_file_inode_operations;
2738 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2739 }
2740 dir->i_sb->s_dirt = 1;
2741 btrfs_update_inode_block_group(trans, inode);
2742 btrfs_update_inode_block_group(trans, dir);
2743out_unlock:
2744 nr = trans->blocks_used;
2745 btrfs_end_transaction_throttle(trans, root);
2746fail:
2747 if (drop_inode) {
2748 inode_dec_link_count(inode);
2749 iput(inode);
2750 }
2751 btrfs_btree_balance_dirty(root, nr);
2752 return err;
2753}
2754
2755static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
2756 struct dentry *dentry)
2757{
2758 struct btrfs_trans_handle *trans;
2759 struct btrfs_root *root = BTRFS_I(dir)->root;
2760 struct inode *inode = old_dentry->d_inode;
2761 u64 index;
2762 unsigned long nr = 0;
2763 int err;
2764 int drop_inode = 0;
2765
2766 if (inode->i_nlink == 0)
2767 return -ENOENT;
2768
2769 btrfs_inc_nlink(inode);
2770 err = btrfs_check_free_space(root, 1, 0);
2771 if (err)
2772 goto fail;
2773 err = btrfs_set_inode_index(dir, inode, &index);
2774 if (err)
2775 goto fail;
2776
2777 trans = btrfs_start_transaction(root, 1);
2778
2779 btrfs_set_trans_block_group(trans, dir);
2780 atomic_inc(&inode->i_count);
2781
2782 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
2783
2784 if (err)
2785 drop_inode = 1;
2786
2787 dir->i_sb->s_dirt = 1;
2788 btrfs_update_inode_block_group(trans, dir);
2789 err = btrfs_update_inode(trans, root, inode);
2790
2791 if (err)
2792 drop_inode = 1;
2793
2794 nr = trans->blocks_used;
2795 btrfs_end_transaction_throttle(trans, root);
2796fail:
2797 if (drop_inode) {
2798 inode_dec_link_count(inode);
2799 iput(inode);
2800 }
2801 btrfs_btree_balance_dirty(root, nr);
2802 return err;
2803}
2804
2805static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2806{
2807 struct inode *inode = NULL;
2808 struct btrfs_trans_handle *trans;
2809 struct btrfs_root *root = BTRFS_I(dir)->root;
2810 int err = 0;
2811 int drop_on_err = 0;
2812 u64 objectid = 0;
2813 u64 index = 0;
2814 unsigned long nr = 1;
2815
2816 err = btrfs_check_free_space(root, 1, 0);
2817 if (err)
2818 goto out_unlock;
2819
2820 trans = btrfs_start_transaction(root, 1);
2821 if (IS_ERR(trans)) {
2822 err = PTR_ERR(trans);
2823 goto out_unlock;
2824 }
2825 btrfs_set_trans_block_group(trans, dir);
2826
2827
2828 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2829 if (err) {
2830 err = -ENOSPC;
2831 goto out_unlock;
2832 }
2833
2834 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2835 dentry->d_name.len,
2836 dentry->d_parent->d_inode->i_ino, objectid,
2837 BTRFS_I(dir)->block_group, S_IFDIR | mode,
2838 &index);
2839 if (IS_ERR(inode)) {
2840 err = PTR_ERR(inode);
2841 goto out_fail;
2842 }
2843
2844 drop_on_err = 1;
2845
2846 err = btrfs_init_acl(inode, dir);
2847 if (err)
2848 goto out_fail;
2849
2850 inode->i_op = &btrfs_dir_inode_operations;
2851 inode->i_fop = &btrfs_dir_file_operations;
2852 btrfs_set_trans_block_group(trans, inode);
2853
2854 btrfs_i_size_write(inode, 0);
2855 err = btrfs_update_inode(trans, root, inode);
2856 if (err)
2857 goto out_fail;
2858
2859 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
2860 inode, dentry->d_name.name,
2861 dentry->d_name.len, 0, index);
2862 if (err)
2863 goto out_fail;
2864
2865 d_instantiate(dentry, inode);
2866 drop_on_err = 0;
2867 dir->i_sb->s_dirt = 1;
2868 btrfs_update_inode_block_group(trans, inode);
2869 btrfs_update_inode_block_group(trans, dir);
2870
2871out_fail:
2872 nr = trans->blocks_used;
2873 btrfs_end_transaction_throttle(trans, root);
2874
2875out_unlock:
2876 if (drop_on_err)
2877 iput(inode);
2878 btrfs_btree_balance_dirty(root, nr);
2879 return err;
2880}
2881
2882/* helper for btrfs_get_extent. Given an existing extent in the tree,
2883 * and an extent that you want to insert, deal with overlap and insert
2884 * the new extent into the tree.
2885 */
2886static int merge_extent_mapping(struct extent_map_tree *em_tree,
2887 struct extent_map *existing,
2888 struct extent_map *em,
2889 u64 map_start, u64 map_len)
2890{
2891 u64 start_diff;
2892
2893 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
2894 start_diff = map_start - em->start;
2895 em->start = map_start;
2896 em->len = map_len;
2897 if (em->block_start < EXTENT_MAP_LAST_BYTE)
2898 em->block_start += start_diff;
2899 return add_extent_mapping(em_tree, em);
2900}
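
The trimming arithmetic is easier to see in isolation. A standalone sketch
with a hypothetical struct (not btrfs code; the EXTENT_MAP_LAST_BYTE guard is
dropped, so this assumes a real on-disk extent):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	struct em { uint64_t start, len, block_start; };

	/* clamp a candidate mapping so it begins at map_start, keeping the
	 * on-disk offset in step with the logical one */
	static void trim_mapping(struct em *em, uint64_t map_start,
				 uint64_t map_len)
	{
		uint64_t start_diff;

		assert(map_start >= em->start &&
		       map_start < em->start + em->len);
		start_diff = map_start - em->start;
		em->start = map_start;
		em->len = map_len;
		em->block_start += start_diff;
	}

	int main(void)
	{
		struct em e = { .start = 0, .len = 16384,
				.block_start = 1048576 };

		trim_mapping(&e, 4096, 4096);
		printf("start=%llu len=%llu disk=%llu\n",
		       (unsigned long long)e.start,
		       (unsigned long long)e.len,
		       (unsigned long long)e.block_start); /* 4096 4096 1052672 */
		return 0;
	}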
2901
2902/*
2903 * a bit scary, this does extent mapping from logical file offset to the disk.
2904 * the ugly parts come from merging extents from the disk with the in-ram
2905 * representation. This gets more complex because of the data=ordered code,
2906 * where the in-ram extents might be locked pending data=ordered completion.
2907 *
2908 * This also copies inline extents directly into the page.
2909 */
2910struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2911 size_t pg_offset, u64 start, u64 len,
2912 int create)
2913{
2914 int ret;
2915 int err = 0;
2916 u64 bytenr;
2917 u64 extent_start = 0;
2918 u64 extent_end = 0;
2919 u64 objectid = inode->i_ino;
2920 u32 found_type;
2921 struct btrfs_path *path = NULL;
2922 struct btrfs_root *root = BTRFS_I(inode)->root;
2923 struct btrfs_file_extent_item *item;
2924 struct extent_buffer *leaf;
2925 struct btrfs_key found_key;
2926 struct extent_map *em = NULL;
2927 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2928 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2929 struct btrfs_trans_handle *trans = NULL;
2930
2931again:
2932 spin_lock(&em_tree->lock);
2933 em = lookup_extent_mapping(em_tree, start, len);
2934 if (em)
2935 em->bdev = root->fs_info->fs_devices->latest_bdev;
2936 spin_unlock(&em_tree->lock);
2937
2938 if (em) {
2939 if (em->start > start || em->start + em->len <= start)
2940 free_extent_map(em);
2941 else if (em->block_start == EXTENT_MAP_INLINE && page)
2942 free_extent_map(em);
2943 else
2944 goto out;
2945 }
2946 em = alloc_extent_map(GFP_NOFS);
2947 if (!em) {
2948 err = -ENOMEM;
2949 goto out;
2950 }
2951 em->bdev = root->fs_info->fs_devices->latest_bdev;
2952 em->start = EXTENT_MAP_HOLE;
2953 em->len = (u64)-1;
2954
2955 if (!path) {
2956 path = btrfs_alloc_path();
2957 BUG_ON(!path);
2958 }
2959
2960 ret = btrfs_lookup_file_extent(trans, root, path,
2961 objectid, start, trans != NULL);
2962 if (ret < 0) {
2963 err = ret;
2964 goto out;
2965 }
2966
2967 if (ret != 0) {
2968 if (path->slots[0] == 0)
2969 goto not_found;
2970 path->slots[0]--;
2971 }
2972
2973 leaf = path->nodes[0];
2974 item = btrfs_item_ptr(leaf, path->slots[0],
2975 struct btrfs_file_extent_item);
2976 /* are we inside the extent that was found? */
2977 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2978 found_type = btrfs_key_type(&found_key);
2979 if (found_key.objectid != objectid ||
2980 found_type != BTRFS_EXTENT_DATA_KEY) {
2981 goto not_found;
2982 }
2983
2984 found_type = btrfs_file_extent_type(leaf, item);
2985 extent_start = found_key.offset;
2986 if (found_type == BTRFS_FILE_EXTENT_REG) {
2987 extent_end = extent_start +
2988 btrfs_file_extent_num_bytes(leaf, item);
2989 err = 0;
2990 if (start < extent_start || start >= extent_end) {
2991 em->start = start;
2992 if (start < extent_start) {
2993 if (start + len <= extent_start)
2994 goto not_found;
2995 em->len = extent_end - extent_start;
2996 } else {
2997 em->len = len;
2998 }
2999 goto not_found_em;
3000 }
3001 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
3002 if (bytenr == 0) {
3003 em->start = extent_start;
3004 em->len = extent_end - extent_start;
3005 em->block_start = EXTENT_MAP_HOLE;
3006 goto insert;
3007 }
3008 bytenr += btrfs_file_extent_offset(leaf, item);
3009 em->block_start = bytenr;
3010 em->start = extent_start;
3011 em->len = extent_end - extent_start;
3012 goto insert;
3013 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3014 u64 page_start;
3015 unsigned long ptr;
3016 char *map;
3017 size_t size;
3018 size_t extent_offset;
3019 size_t copy_size;
3020
3021 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
3022 path->slots[0]));
3023 extent_end = (extent_start + size + root->sectorsize - 1) &
3024 ~((u64)root->sectorsize - 1);
3025 if (start < extent_start || start >= extent_end) {
3026 em->start = start;
3027 if (start < extent_start) {
3028 if (start + len <= extent_start)
3029 goto not_found;
3030 em->len = extent_end - extent_start;
3031 } else {
3032 em->len = len;
3033 }
3034 goto not_found_em;
3035 }
3036 em->block_start = EXTENT_MAP_INLINE;
3037
3038 if (!page) {
3039 em->start = extent_start;
3040 em->len = size;
3041 goto out;
3042 }
3043
3044 page_start = page_offset(page) + pg_offset;
3045 extent_offset = page_start - extent_start;
3046 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
3047 size - extent_offset);
3048 em->start = extent_start + extent_offset;
3049 em->len = (copy_size + root->sectorsize - 1) &
3050 ~((u64)root->sectorsize - 1);
3051 map = kmap(page);
3052 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
3053 if (create == 0 && !PageUptodate(page)) {
3054 read_extent_buffer(leaf, map + pg_offset, ptr,
3055 copy_size);
3056 flush_dcache_page(page);
3057 } else if (create && PageUptodate(page)) {
3058 if (!trans) {
3059 kunmap(page);
3060 free_extent_map(em);
3061 em = NULL;
3062 btrfs_release_path(root, path);
3063 trans = btrfs_join_transaction(root, 1);
3064 goto again;
3065 }
3066 write_extent_buffer(leaf, map + pg_offset, ptr,
3067 copy_size);
3068 btrfs_mark_buffer_dirty(leaf);
3069 }
3070 kunmap(page);
3071 set_extent_uptodate(io_tree, em->start,
3072 extent_map_end(em) - 1, GFP_NOFS);
3073 goto insert;
3074 } else {
3075 printk("unknown found_type %d\n", found_type);
3076 WARN_ON(1);
3077 }
3078not_found:
3079 em->start = start;
3080 em->len = len;
3081not_found_em:
3082 em->block_start = EXTENT_MAP_HOLE;
3083insert:
3084 btrfs_release_path(root, path);
3085 if (em->start > start || extent_map_end(em) <= start) {
3086 printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
3087 err = -EIO;
3088 goto out;
3089 }
3090
3091 err = 0;
3092 spin_lock(&em_tree->lock);
3093 ret = add_extent_mapping(em_tree, em);
3094 /* it is possible that someone inserted the extent into the tree
3095 * while we had the lock dropped. It is also possible that
3096 * an overlapping map exists in the tree
3097 */
3098 if (ret == -EEXIST) {
3099 struct extent_map *existing;
3100
3101 ret = 0;
3102
3103 existing = lookup_extent_mapping(em_tree, start, len);
3104 if (existing && (existing->start > start ||
3105 existing->start + existing->len <= start)) {
3106 free_extent_map(existing);
3107 existing = NULL;
3108 }
3109 if (!existing) {
3110 existing = lookup_extent_mapping(em_tree, em->start,
3111 em->len);
3112 if (existing) {
3113 err = merge_extent_mapping(em_tree, existing,
3114 em, start,
3115 root->sectorsize);
3116 free_extent_map(existing);
3117 if (err) {
3118 free_extent_map(em);
3119 em = NULL;
3120 }
3121 } else {
3122 err = -EIO;
3123 printk("failing to insert %Lu %Lu\n",
3124 start, len);
3125 free_extent_map(em);
3126 em = NULL;
3127 }
3128 } else {
3129 free_extent_map(em);
3130 em = existing;
3131 err = 0;
3132 }
3133 }
3134 spin_unlock(&em_tree->lock);
3135out:
3136 if (path)
3137 btrfs_free_path(path);
3138 if (trans) {
3139 ret = btrfs_end_transaction(trans, root);
3140 if (!err) {
3141 err = ret;
3142 }
3143 }
3144 if (err) {
3145 free_extent_map(em);
3146 WARN_ON(1);
3147 return ERR_PTR(err);
3148 }
3149 return em;
3150}
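
btrfs_get_extent rounds inline extent sizes up to a sector boundary in two
places with the same mask trick; it only works when sectorsize is a power of
two. A quick standalone check of the idiom:

	#include <assert.h>
	#include <stdint.h>

	/* round x up to the next multiple of a power-of-two sector size,
	 * the mask used for inline extents above */
	static uint64_t round_up_pow2(uint64_t x, uint64_t sectorsize)
	{
		return (x + sectorsize - 1) & ~(sectorsize - 1);
	}

	int main(void)
	{
		assert(round_up_pow2(1, 4096) == 4096);
		assert(round_up_pow2(4096, 4096) == 4096);
		assert(round_up_pow2(4097, 4096) == 8192);
		return 0;
	}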
3151
3152static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
3153 const struct iovec *iov, loff_t offset,
3154 unsigned long nr_segs)
3155{
3156 return -EINVAL;
3157}
3158
3159static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
3160{
3161 return extent_bmap(mapping, iblock, btrfs_get_extent);
3162}
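
extent_bmap() services the legacy FIBMAP ioctl, so the mapping path above can
be exercised from userspace. A sketch (FIBMAP needs CAP_SYS_RAWIO and only
reports one block per call):

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>

	int main(int argc, char **argv)
	{
		int fd, block = 0;	/* query file block 0 */

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, FIBMAP, &block) < 0) {
			perror("FIBMAP");
			return 1;
		}
		printf("file block 0 -> fs block %d\n", block);
		return 0;
	}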
3163
3164int btrfs_readpage(struct file *file, struct page *page)
3165{
3166 struct extent_io_tree *tree;
3167 tree = &BTRFS_I(page->mapping->host)->io_tree;
3168 return extent_read_full_page(tree, page, btrfs_get_extent);
3169}
3170
3171static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
3172{
3173 struct extent_io_tree *tree;
3174
3175
3176 if (current->flags & PF_MEMALLOC) {
3177 redirty_page_for_writepage(wbc, page);
3178 unlock_page(page);
3179 return 0;
3180 }
3181 tree = &BTRFS_I(page->mapping->host)->io_tree;
3182 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
3183}
3184
3185int btrfs_writepages(struct address_space *mapping,
3186 struct writeback_control *wbc)
3187{
3188 struct extent_io_tree *tree;
3189 tree = &BTRFS_I(mapping->host)->io_tree;
3190 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
3191}
3192
3193static int
3194btrfs_readpages(struct file *file, struct address_space *mapping,
3195 struct list_head *pages, unsigned nr_pages)
3196{
3197 struct extent_io_tree *tree;
3198 tree = &BTRFS_I(mapping->host)->io_tree;
3199 return extent_readpages(tree, mapping, pages, nr_pages,
3200 btrfs_get_extent);
3201}
3202static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
3203{
3204 struct extent_io_tree *tree;
3205 struct extent_map_tree *map;
3206 int ret;
3207
3208 tree = &BTRFS_I(page->mapping->host)->io_tree;
3209 map = &BTRFS_I(page->mapping->host)->extent_tree;
3210 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
3211 if (ret == 1) {
3212 ClearPagePrivate(page);
3213 set_page_private(page, 0);
3214 page_cache_release(page);
3215 }
3216 return ret;
3217}
3218
3219static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
3220{
3221 if (PageWriteback(page) || PageDirty(page))
3222 return 0;
3223 return __btrfs_releasepage(page, gfp_flags);
3224}
3225
3226static void btrfs_invalidatepage(struct page *page, unsigned long offset)
3227{
3228 struct extent_io_tree *tree;
3229 struct btrfs_ordered_extent *ordered;
3230 u64 page_start = page_offset(page);
3231 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
3232
3233 wait_on_page_writeback(page);
3234 tree = &BTRFS_I(page->mapping->host)->io_tree;
3235 if (offset) {
3236 btrfs_releasepage(page, GFP_NOFS);
3237 return;
3238 }
3239
3240 lock_extent(tree, page_start, page_end, GFP_NOFS);
3241 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
3242 page_offset(page));
3243 if (ordered) {
3244 /*
3245 * IO on this page will never be started, so we need
3246 * to account for any ordered extents now
3247 */
3248 clear_extent_bit(tree, page_start, page_end,
3249 EXTENT_DIRTY | EXTENT_DELALLOC |
3250 EXTENT_LOCKED, 1, 0, GFP_NOFS);
3251 btrfs_finish_ordered_io(page->mapping->host,
3252 page_start, page_end);
3253 btrfs_put_ordered_extent(ordered);
3254 lock_extent(tree, page_start, page_end, GFP_NOFS);
3255 }
3256 clear_extent_bit(tree, page_start, page_end,
3257 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3258 EXTENT_ORDERED,
3259 1, 1, GFP_NOFS);
3260 __btrfs_releasepage(page, GFP_NOFS);
3261
3262 ClearPageChecked(page);
3263 if (PagePrivate(page)) {
3264 ClearPagePrivate(page);
3265 set_page_private(page, 0);
3266 page_cache_release(page);
3267 }
3268}
3269
3270/*
3271 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
3272 * called from a page fault handler when a page is first dirtied. Hence we must
3273 * be careful to check for EOF conditions here. We set the page up correctly
3274 * for a written page which means we get ENOSPC checking when writing into
3275 * holes and correct delalloc and unwritten extent mapping on filesystems that
3276 * support these features.
3277 *
3278 * We are not allowed to take the i_mutex here so we have to play games to
3279 * protect against truncate races as the page could now be beyond EOF. Because
3280 * vmtruncate() writes the inode size before removing pages, once we have the
3281 * page lock we can determine safely if the page is beyond EOF. If it is not
3282 * beyond EOF, then the page is guaranteed safe against truncation until we
3283 * unlock the page.
3284 */
3285int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
3286{
3287 struct inode *inode = fdentry(vma->vm_file)->d_inode;
3288 struct btrfs_root *root = BTRFS_I(inode)->root;
3289 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3290 struct btrfs_ordered_extent *ordered;
3291 char *kaddr;
3292 unsigned long zero_start;
3293 loff_t size;
3294 int ret;
3295 u64 page_start;
3296 u64 page_end;
3297
3298 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
3299 if (ret)
3300 goto out;
3301
3302 ret = -EINVAL;
3303again:
3304 lock_page(page);
3305 size = i_size_read(inode);
3306 page_start = page_offset(page);
3307 page_end = page_start + PAGE_CACHE_SIZE - 1;
3308
3309 if ((page->mapping != inode->i_mapping) ||
3310 (page_start >= size)) {
3311 /* page got truncated out from underneath us */
3312 goto out_unlock;
3313 }
3314 wait_on_page_writeback(page);
3315
3316 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3317 set_page_extent_mapped(page);
3318
3319 /*
3320 * we can't set the delalloc bits if there are pending ordered
3321 * extents. Drop our locks and wait for them to finish
3322 */
3323 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3324 if (ordered) {
3325 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3326 unlock_page(page);
3327 btrfs_start_ordered_extent(inode, ordered, 1);
3328 btrfs_put_ordered_extent(ordered);
3329 goto again;
3330 }
3331
3332 btrfs_set_extent_delalloc(inode, page_start, page_end);
3333 ret = 0;
3334
3335 /* page is wholly or partially inside EOF */
3336 if (page_start + PAGE_CACHE_SIZE > size)
3337 zero_start = size & ~PAGE_CACHE_MASK;
3338 else
3339 zero_start = PAGE_CACHE_SIZE;
3340
3341 if (zero_start != PAGE_CACHE_SIZE) {
3342 kaddr = kmap(page);
3343 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
3344 flush_dcache_page(page);
3345 kunmap(page);
3346 }
3347 ClearPageChecked(page);
3348 set_page_dirty(page);
3349 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3350
3351out_unlock:
3352 unlock_page(page);
3353out:
3354 return ret;
3355}
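
The handler above runs on the first store to a clean page of a shared file
mapping. A minimal userspace trigger, assuming any writable file on a btrfs
mount:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char *p;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDWR);
		if (fd < 0 || ftruncate(fd, 4096) < 0)
			return 1;
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		memcpy(p, "hello", 5);	/* write fault -> btrfs_page_mkwrite */
		munmap(p, 4096);
		close(fd);
		return 0;
	}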
3356
3357static void btrfs_truncate(struct inode *inode)
3358{
3359 struct btrfs_root *root = BTRFS_I(inode)->root;
3360 int ret;
3361 struct btrfs_trans_handle *trans;
3362 unsigned long nr;
3363 u64 mask = root->sectorsize - 1;
3364
3365 if (!S_ISREG(inode->i_mode))
3366 return;
3367 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3368 return;
3369
3370 btrfs_truncate_page(inode->i_mapping, inode->i_size);
3371 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
3372
3373 trans = btrfs_start_transaction(root, 1);
3374 btrfs_set_trans_block_group(trans, inode);
3375 btrfs_i_size_write(inode, inode->i_size);
3376
3377 ret = btrfs_orphan_add(trans, inode);
3378 if (ret)
3379 goto out;
3380 /* FIXME, add redo link to tree so we don't leak on crash */
3381 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
3382 BTRFS_EXTENT_DATA_KEY);
3383 btrfs_update_inode(trans, root, inode);
3384
3385 ret = btrfs_orphan_del(trans, inode);
3386 BUG_ON(ret);
3387
3388out:
3389 nr = trans->blocks_used;
3390 ret = btrfs_end_transaction_throttle(trans, root);
3391 BUG_ON(ret);
3392 btrfs_btree_balance_dirty(root, nr);
3393}
3394
3395/*
3396 * Invalidate a single dcache entry at the root of the filesystem.
3397 * Needed after creation of snapshot or subvolume.
3398 */
3399void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
3400 int namelen)
3401{
3402 struct dentry *alias, *entry;
3403 struct qstr qstr;
3404
3405 alias = d_find_alias(root->fs_info->sb->s_root->d_inode);
3406 if (alias) {
3407 qstr.name = name;
3408 qstr.len = namelen;
3409 /* change me if btrfs ever gets a d_hash operation */
3410 qstr.hash = full_name_hash(qstr.name, qstr.len);
3411 entry = d_lookup(alias, &qstr);
3412 dput(alias);
3413 if (entry) {
3414 d_invalidate(entry);
3415 dput(entry);
3416 }
3417 }
3418}
3419
3420/*
3421 * create a new subvolume directory/inode (helper for the ioctl).
3422 */
3423int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
3424 struct btrfs_trans_handle *trans, u64 new_dirid,
3425 struct btrfs_block_group_cache *block_group)
3426{
3427 struct inode *inode;
3428 int error;
3429 u64 index = 0;
3430
3431 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
3432 new_dirid, block_group, S_IFDIR | 0700, &index);
3433 if (IS_ERR(inode))
3434 return PTR_ERR(inode);
3435 inode->i_op = &btrfs_dir_inode_operations;
3436 inode->i_fop = &btrfs_dir_file_operations;
3437 new_root->inode = inode;
3438
3439 inode->i_nlink = 1;
3440 btrfs_i_size_write(inode, 0);
3441
3442 error = btrfs_update_inode(trans, new_root, inode);
3443 if (error)
3444 return error;
3445
3446 d_instantiate(dentry, inode);
3447 return 0;
3448}
3449
3450/* helper function for file defrag and space balancing. This
3451 * forces readahead on a given range of bytes in an inode
3452 */
3453unsigned long btrfs_force_ra(struct address_space *mapping,
3454 struct file_ra_state *ra, struct file *file,
3455 pgoff_t offset, pgoff_t last_index)
3456{
3457 pgoff_t req_size = last_index - offset + 1;
3458
3459 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
3460 return offset + req_size;
3461}
3462
3463struct inode *btrfs_alloc_inode(struct super_block *sb)
3464{
3465 struct btrfs_inode *ei;
3466
3467 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
3468 if (!ei)
3469 return NULL;
3470 ei->last_trans = 0;
3471 ei->logged_trans = 0;
3472 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
3473 ei->i_acl = BTRFS_ACL_NOT_CACHED;
3474 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
3475 INIT_LIST_HEAD(&ei->i_orphan);
3476 return &ei->vfs_inode;
3477}
3478
3479void btrfs_destroy_inode(struct inode *inode)
3480{
3481 struct btrfs_ordered_extent *ordered;
3482 WARN_ON(!list_empty(&inode->i_dentry));
3483 WARN_ON(inode->i_data.nrpages);
3484
3485 if (BTRFS_I(inode)->i_acl &&
3486 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
3487 posix_acl_release(BTRFS_I(inode)->i_acl);
3488 if (BTRFS_I(inode)->i_default_acl &&
3489 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
3490 posix_acl_release(BTRFS_I(inode)->i_default_acl);
3491
3492 spin_lock(&BTRFS_I(inode)->root->list_lock);
3493 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
3494 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
3495 " list\n", inode->i_ino);
3496 dump_stack();
3497 }
3498 spin_unlock(&BTRFS_I(inode)->root->list_lock);
3499
3500 while (1) {
3501 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
3502 if (!ordered)
3503 break;
3504 else {
3505 printk("found ordered extent %Lu %Lu\n",
3506 ordered->file_offset, ordered->len);
3507 btrfs_remove_ordered_extent(inode, ordered);
3508 btrfs_put_ordered_extent(ordered);
3509 btrfs_put_ordered_extent(ordered);
3510 }
3511 }
3512 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
3513 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
3514}
3515
3516static void init_once(void *foo)
3517{
3518 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
3519
3520 inode_init_once(&ei->vfs_inode);
3521}
3522
3523void btrfs_destroy_cachep(void)
3524{
3525 if (btrfs_inode_cachep)
3526 kmem_cache_destroy(btrfs_inode_cachep);
3527 if (btrfs_trans_handle_cachep)
3528 kmem_cache_destroy(btrfs_trans_handle_cachep);
3529 if (btrfs_transaction_cachep)
3530 kmem_cache_destroy(btrfs_transaction_cachep);
3531 if (btrfs_bit_radix_cachep)
3532 kmem_cache_destroy(btrfs_bit_radix_cachep);
3533 if (btrfs_path_cachep)
3534 kmem_cache_destroy(btrfs_path_cachep);
3535}
3536
3537struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
3538 unsigned long extra_flags,
3539 void (*ctor)(void *))
3540{
3541 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
3542 SLAB_MEM_SPREAD | extra_flags), ctor);
3543}
3544
3545int btrfs_init_cachep(void)
3546{
3547 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
3548 sizeof(struct btrfs_inode),
3549 0, init_once);
3550 if (!btrfs_inode_cachep)
3551 goto fail;
3552 btrfs_trans_handle_cachep =
3553 btrfs_cache_create("btrfs_trans_handle_cache",
3554 sizeof(struct btrfs_trans_handle),
3555 0, NULL);
3556 if (!btrfs_trans_handle_cachep)
3557 goto fail;
3558 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
3559 sizeof(struct btrfs_transaction),
3560 0, NULL);
3561 if (!btrfs_transaction_cachep)
3562 goto fail;
3563 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
3564 sizeof(struct btrfs_path),
3565 0, NULL);
3566 if (!btrfs_path_cachep)
3567 goto fail;
3568 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
3569 SLAB_DESTROY_BY_RCU, NULL);
3570 if (!btrfs_bit_radix_cachep)
3571 goto fail;
3572 return 0;
3573fail:
3574 btrfs_destroy_cachep();
3575 return -ENOMEM;
3576}
3577
3578static int btrfs_getattr(struct vfsmount *mnt,
3579 struct dentry *dentry, struct kstat *stat)
3580{
3581 struct inode *inode = dentry->d_inode;
3582 generic_fillattr(inode, stat);
3583 stat->blksize = PAGE_CACHE_SIZE;
3584 stat->blocks = (inode_get_bytes(inode) +
3585 BTRFS_I(inode)->delalloc_bytes) >> 9;
3586 return 0;
3587}
3588
3589static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
3590 struct inode *new_dir, struct dentry *new_dentry)
3591{
3592 struct btrfs_trans_handle *trans;
3593 struct btrfs_root *root = BTRFS_I(old_dir)->root;
3594 struct inode *new_inode = new_dentry->d_inode;
3595 struct inode *old_inode = old_dentry->d_inode;
3596 struct timespec ctime = CURRENT_TIME;
3597 u64 index = 0;
3598 int ret;
3599
3600 if (S_ISDIR(old_inode->i_mode) && new_inode &&
3601 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
3602 return -ENOTEMPTY;
3603 }
3604
3605 ret = btrfs_check_free_space(root, 1, 0);
3606 if (ret)
3607 goto out_unlock;
3608
3609 trans = btrfs_start_transaction(root, 1);
3610
3611 btrfs_set_trans_block_group(trans, new_dir);
3612
3613 btrfs_inc_nlink(old_dentry->d_inode);
3614 old_dir->i_ctime = old_dir->i_mtime = ctime;
3615 new_dir->i_ctime = new_dir->i_mtime = ctime;
3616 old_inode->i_ctime = ctime;
3617
3618 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
3619 old_dentry->d_name.name,
3620 old_dentry->d_name.len);
3621 if (ret)
3622 goto out_fail;
3623
3624 if (new_inode) {
3625 new_inode->i_ctime = CURRENT_TIME;
3626 ret = btrfs_unlink_inode(trans, root, new_dir,
3627 new_dentry->d_inode,
3628 new_dentry->d_name.name,
3629 new_dentry->d_name.len);
3630 if (ret)
3631 goto out_fail;
3632 if (new_inode->i_nlink == 0) {
3633 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
3634 if (ret)
3635 goto out_fail;
3636 }
3637
3638 }
3639 ret = btrfs_set_inode_index(new_dir, old_inode, &index);
3640 if (ret)
3641 goto out_fail;
3642
3643 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
3644 old_inode, new_dentry->d_name.name,
3645 new_dentry->d_name.len, 1, index);
3646 if (ret)
3647 goto out_fail;
3648
3649out_fail:
3650 btrfs_end_transaction_throttle(trans, root);
3651out_unlock:
3652 return ret;
3653}
3654
3655/*
3656 * some fairly slow code that needs optimization. This walks the list
3657 * of all the inodes with pending delalloc and forces them to disk.
3658 */
3659int btrfs_start_delalloc_inodes(struct btrfs_root *root)
3660{
3661 struct list_head *head = &root->fs_info->delalloc_inodes;
3662 struct btrfs_inode *binode;
3663 struct inode *inode;
3664 unsigned long flags;
3665
3666 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
3667 while (!list_empty(head)) {
3668 binode = list_entry(head->next, struct btrfs_inode,
3669 delalloc_inodes);
3670 inode = igrab(&binode->vfs_inode);
3671 if (!inode)
3672 list_del_init(&binode->delalloc_inodes);
3673 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
3674 if (inode) {
3675 filemap_flush(inode->i_mapping);
3676 iput(inode);
3677 }
3678 cond_resched();
3679 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
3680 }
3681 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
3682
3683 /* the filemap_flush will queue IO into the worker threads, but
3684 * we have to make sure the IO is actually started and that
3685 * ordered extents get created before we return
3686 */
3687 atomic_inc(&root->fs_info->async_submit_draining);
3688 while (atomic_read(&root->fs_info->nr_async_submits)) {
3689 wait_event(root->fs_info->async_submit_wait,
3690 (atomic_read(&root->fs_info->nr_async_submits) == 0));
3691 }
3692 atomic_dec(&root->fs_info->async_submit_draining);
3693 return 0;
3694}
3695
3696static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3697 const char *symname)
3698{
3699 struct btrfs_trans_handle *trans;
3700 struct btrfs_root *root = BTRFS_I(dir)->root;
3701 struct btrfs_path *path;
3702 struct btrfs_key key;
3703 struct inode *inode = NULL;
3704 int err;
3705 int drop_inode = 0;
3706 u64 objectid;
3707 u64 index = 0;
3708 int name_len;
3709 int datasize;
3710 unsigned long ptr;
3711 struct btrfs_file_extent_item *ei;
3712 struct extent_buffer *leaf;
3713 unsigned long nr = 0;
3714
3715 name_len = strlen(symname) + 1;
3716 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
3717 return -ENAMETOOLONG;
3718
3719 err = btrfs_check_free_space(root, 1, 0);
3720 if (err)
3721 goto out_fail;
3722
3723 trans = btrfs_start_transaction(root, 1);
3724 btrfs_set_trans_block_group(trans, dir);
3725
3726 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3727 if (err) {
3728 err = -ENOSPC;
3729 goto out_unlock;
3730 }
3731
3732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3733 dentry->d_name.len,
3734 dentry->d_parent->d_inode->i_ino, objectid,
3735 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
3736 &index);
3737 err = PTR_ERR(inode);
3738 if (IS_ERR(inode))
3739 goto out_unlock;
3740
3741 err = btrfs_init_acl(inode, dir);
3742 if (err) {
3743 drop_inode = 1;
3744 goto out_unlock;
3745 }
3746
3747 btrfs_set_trans_block_group(trans, inode);
3748 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3749 if (err)
3750 drop_inode = 1;
3751 else {
3752 inode->i_mapping->a_ops = &btrfs_aops;
3753 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3754 inode->i_fop = &btrfs_file_operations;
3755 inode->i_op = &btrfs_file_inode_operations;
3756 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3757 }
3758 dir->i_sb->s_dirt = 1;
3759 btrfs_update_inode_block_group(trans, inode);
3760 btrfs_update_inode_block_group(trans, dir);
3761 if (drop_inode)
3762 goto out_unlock;
3763
3764 path = btrfs_alloc_path();
3765 BUG_ON(!path);
3766 key.objectid = inode->i_ino;
3767 key.offset = 0;
3768 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
3769 datasize = btrfs_file_extent_calc_inline_size(name_len);
3770 err = btrfs_insert_empty_item(trans, root, path, &key,
3771 datasize);
3772 if (err) {
3773 drop_inode = 1;
3774 goto out_unlock;
3775 }
3776 leaf = path->nodes[0];
3777 ei = btrfs_item_ptr(leaf, path->slots[0],
3778 struct btrfs_file_extent_item);
3779 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3780 btrfs_set_file_extent_type(leaf, ei,
3781 BTRFS_FILE_EXTENT_INLINE);
3782 ptr = btrfs_file_extent_inline_start(ei);
3783 write_extent_buffer(leaf, symname, ptr, name_len);
3784 btrfs_mark_buffer_dirty(leaf);
3785 btrfs_free_path(path);
3786
3787 inode->i_op = &btrfs_symlink_inode_operations;
3788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3790 btrfs_i_size_write(inode, name_len - 1);
3791 err = btrfs_update_inode(trans, root, inode);
3792 if (err)
3793 drop_inode = 1;
3794
3795out_unlock:
3796 nr = trans->blocks_used;
3797 btrfs_end_transaction_throttle(trans, root);
3798out_fail:
3799 if (drop_inode) {
3800 inode_dec_link_count(inode);
3801 iput(inode);
3802 }
3803 btrfs_btree_balance_dirty(root, nr);
3804 return err;
3805}
3806
3807static int btrfs_set_page_dirty(struct page *page)
3808{
3809 return __set_page_dirty_nobuffers(page);
3810}
3811
3812static int btrfs_permission(struct inode *inode, int mask)
3813{
3814 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
3815 return -EACCES;
3816 return generic_permission(inode, mask, btrfs_check_acl);
3817}
3818
3819static struct inode_operations btrfs_dir_inode_operations = {
3820 .lookup = btrfs_lookup,
3821 .create = btrfs_create,
3822 .unlink = btrfs_unlink,
3823 .link = btrfs_link,
3824 .mkdir = btrfs_mkdir,
3825 .rmdir = btrfs_rmdir,
3826 .rename = btrfs_rename,
3827 .symlink = btrfs_symlink,
3828 .setattr = btrfs_setattr,
3829 .mknod = btrfs_mknod,
3830 .setxattr = btrfs_setxattr,
3831 .getxattr = btrfs_getxattr,
3832 .listxattr = btrfs_listxattr,
3833 .removexattr = btrfs_removexattr,
3834 .permission = btrfs_permission,
3835};
3836static struct inode_operations btrfs_dir_ro_inode_operations = {
3837 .lookup = btrfs_lookup,
3838 .permission = btrfs_permission,
3839};
3840static struct file_operations btrfs_dir_file_operations = {
3841 .llseek = generic_file_llseek,
3842 .read = generic_read_dir,
3843 .readdir = btrfs_real_readdir,
3844 .unlocked_ioctl = btrfs_ioctl,
3845#ifdef CONFIG_COMPAT
3846 .compat_ioctl = btrfs_ioctl,
3847#endif
3848 .release = btrfs_release_file,
3849 .fsync = btrfs_sync_file,
3850};
3851
3852static struct extent_io_ops btrfs_extent_io_ops = {
3853 .fill_delalloc = run_delalloc_range,
3854 .submit_bio_hook = btrfs_submit_bio_hook,
3855 .merge_bio_hook = btrfs_merge_bio_hook,
3856 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
3857 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
3858 .writepage_start_hook = btrfs_writepage_start_hook,
3859 .readpage_io_failed_hook = btrfs_io_failed_hook,
3860 .set_bit_hook = btrfs_set_bit_hook,
3861 .clear_bit_hook = btrfs_clear_bit_hook,
3862};
3863
3864static struct address_space_operations btrfs_aops = {
3865 .readpage = btrfs_readpage,
3866 .writepage = btrfs_writepage,
3867 .writepages = btrfs_writepages,
3868 .readpages = btrfs_readpages,
3869 .sync_page = block_sync_page,
3870 .bmap = btrfs_bmap,
3871 .direct_IO = btrfs_direct_IO,
3872 .invalidatepage = btrfs_invalidatepage,
3873 .releasepage = btrfs_releasepage,
3874 .set_page_dirty = btrfs_set_page_dirty,
3875};
3876
3877static struct address_space_operations btrfs_symlink_aops = {
3878 .readpage = btrfs_readpage,
3879 .writepage = btrfs_writepage,
3880 .invalidatepage = btrfs_invalidatepage,
3881 .releasepage = btrfs_releasepage,
3882};
3883
3884static struct inode_operations btrfs_file_inode_operations = {
3885 .truncate = btrfs_truncate,
3886 .getattr = btrfs_getattr,
3887 .setattr = btrfs_setattr,
3888 .setxattr = btrfs_setxattr,
3889 .getxattr = btrfs_getxattr,
3890 .listxattr = btrfs_listxattr,
3891 .removexattr = btrfs_removexattr,
3892 .permission = btrfs_permission,
3893};
3894static struct inode_operations btrfs_special_inode_operations = {
3895 .getattr = btrfs_getattr,
3896 .setattr = btrfs_setattr,
3897 .permission = btrfs_permission,
3898 .setxattr = btrfs_setxattr,
3899 .getxattr = btrfs_getxattr,
3900 .listxattr = btrfs_listxattr,
3901 .removexattr = btrfs_removexattr,
3902};
3903static struct inode_operations btrfs_symlink_inode_operations = {
3904 .readlink = generic_readlink,
3905 .follow_link = page_follow_link_light,
3906 .put_link = page_put_link,
3907 .permission = btrfs_permission,
3908};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..1136ce2febcc
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,863 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "ctree.h"
45#include "disk-io.h"
46#include "transaction.h"
47#include "btrfs_inode.h"
48#include "ioctl.h"
49#include "print-tree.h"
50#include "volumes.h"
51#include "locking.h"
52
53
54
55static noinline int create_subvol(struct btrfs_root *root,
56 struct dentry *dentry,
57 char *name, int namelen)
58{
59 struct btrfs_trans_handle *trans;
60 struct btrfs_key key;
61 struct btrfs_root_item root_item;
62 struct btrfs_inode_item *inode_item;
63 struct extent_buffer *leaf;
64 struct btrfs_root *new_root = root;
65 struct inode *dir;
66 int ret;
67 int err;
68 u64 objectid;
69 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
70 unsigned long nr = 1;
71
72 ret = btrfs_check_free_space(root, 1, 0);
73 if (ret)
74 goto fail_commit;
75
76 trans = btrfs_start_transaction(root, 1);
77 BUG_ON(!trans);
78
79 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
80 0, &objectid);
81 if (ret)
82 goto fail;
83
84 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
85 objectid, trans->transid, 0, 0, 0);
86 if (IS_ERR(leaf)) {
87 ret = PTR_ERR(leaf);
88 goto fail;
89 }
90
91 btrfs_set_header_nritems(leaf, 0);
92 btrfs_set_header_level(leaf, 0);
93 btrfs_set_header_bytenr(leaf, leaf->start);
94 btrfs_set_header_generation(leaf, trans->transid);
95 btrfs_set_header_owner(leaf, objectid);
96
97 write_extent_buffer(leaf, root->fs_info->fsid,
98 (unsigned long)btrfs_header_fsid(leaf),
99 BTRFS_FSID_SIZE);
100 btrfs_mark_buffer_dirty(leaf);
101
102 inode_item = &root_item.inode;
103 memset(inode_item, 0, sizeof(*inode_item));
104 inode_item->generation = cpu_to_le64(1);
105 inode_item->size = cpu_to_le64(3);
106 inode_item->nlink = cpu_to_le32(1);
107 inode_item->nbytes = cpu_to_le64(root->leafsize);
108 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
109
110 btrfs_set_root_bytenr(&root_item, leaf->start);
111 btrfs_set_root_level(&root_item, 0);
112 btrfs_set_root_refs(&root_item, 1);
113 btrfs_set_root_used(&root_item, 0);
114
115 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
116 root_item.drop_level = 0;
117
118 btrfs_tree_unlock(leaf);
119 free_extent_buffer(leaf);
120 leaf = NULL;
121
122 btrfs_set_root_dirid(&root_item, new_dirid);
123
124 key.objectid = objectid;
125 key.offset = 1;
126 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
127 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
128 &root_item);
129 if (ret)
130 goto fail;
131
132 /*
133 * insert the directory item
134 */
135 key.offset = (u64)-1;
136 dir = root->fs_info->sb->s_root->d_inode;
137 ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
138 name, namelen, dir->i_ino, &key,
139 BTRFS_FT_DIR, 0);
140 if (ret)
141 goto fail;
142
143 ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
144 name, namelen, objectid,
145 root->fs_info->sb->s_root->d_inode->i_ino, 0);
146 if (ret)
147 goto fail;
148
149 ret = btrfs_commit_transaction(trans, root);
150 if (ret)
151 goto fail_commit;
152
153 new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
154 BUG_ON(!new_root);
155
156 trans = btrfs_start_transaction(new_root, 1);
157 BUG_ON(!trans);
158
159 ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid,
160 BTRFS_I(dir)->block_group);
161 if (ret)
162 goto fail;
163
164fail:
165 nr = trans->blocks_used;
166 err = btrfs_commit_transaction(trans, new_root);
167 if (err && !ret)
168 ret = err;
169fail_commit:
170 btrfs_btree_balance_dirty(root, nr);
171 return ret;
172}
173
174static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
175{
176 struct btrfs_pending_snapshot *pending_snapshot;
177 struct btrfs_trans_handle *trans;
178 int ret;
179 int err;
180 unsigned long nr = 0;
181
182 if (!root->ref_cows)
183 return -EINVAL;
184
185 ret = btrfs_check_free_space(root, 1, 0);
186 if (ret)
187 goto fail_unlock;
188
189 pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
190 if (!pending_snapshot) {
191 ret = -ENOMEM;
192 goto fail_unlock;
193 }
194 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
195 if (!pending_snapshot->name) {
196 ret = -ENOMEM;
197 kfree(pending_snapshot);
198 goto fail_unlock;
199 }
200 memcpy(pending_snapshot->name, name, namelen);
201 pending_snapshot->name[namelen] = '\0';
202 trans = btrfs_start_transaction(root, 1);
203 BUG_ON(!trans);
204 pending_snapshot->root = root;
205 list_add(&pending_snapshot->list,
206 &trans->transaction->pending_snapshots);
207 ret = btrfs_update_inode(trans, root, root->inode);
208 err = btrfs_commit_transaction(trans, root);
209
210fail_unlock:
211 btrfs_btree_balance_dirty(root, nr);
212 return ret;
213}
214
215/* copy of may_create() in fs/namei.c */
216static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
217{
218 if (child->d_inode)
219 return -EEXIST;
220 if (IS_DEADDIR(dir))
221 return -ENOENT;
222 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
223}
224
225/*
226 * Create a new subvolume below @parent. This is largely modeled after
227 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
228 * inside this filesystem so it's quite a bit simpler.
229 */
230static noinline int btrfs_mksubvol(struct path *parent, char *name,
231 int mode, int namelen)
232{
233 struct dentry *dentry;
234 int error;
235
236 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
237
238 dentry = lookup_one_len(name, parent->dentry, namelen);
239 error = PTR_ERR(dentry);
240 if (IS_ERR(dentry))
241 goto out_unlock;
242
243 error = -EEXIST;
244 if (dentry->d_inode)
245 goto out_dput;
246
247 if (!IS_POSIXACL(parent->dentry->d_inode))
248 mode &= ~current->fs->umask;
249 error = mnt_want_write(parent->mnt);
250 if (error)
251 goto out_dput;
252
253 error = btrfs_may_create(parent->dentry->d_inode, dentry);
254 if (error)
255 goto out_drop_write;
256
257 /*
258 * Actually perform the low-level subvolume creation after all
259 * this VFS fuzz.
260 *
261 * Eventually we want to pass in an inode under which we create this
262 * subvolume, but for now all are under the filesystem root.
263 *
264 * Also we should pass on the mode eventually to allow creating new
265 * subvolume with specific mode bits.
266 */
267 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, dentry,
268 name, namelen);
269 if (error)
270 goto out_drop_write;
271
272 fsnotify_mkdir(parent->dentry->d_inode, dentry);
273out_drop_write:
274 mnt_drop_write(parent->mnt);
275out_dput:
276 dput(dentry);
277out_unlock:
278 mutex_unlock(&parent->dentry->d_inode->i_mutex);
279 return error;
280}
281
282
283int btrfs_defrag_file(struct file *file)
284{
285 struct inode *inode = fdentry(file)->d_inode;
286 struct btrfs_root *root = BTRFS_I(inode)->root;
287 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
288 struct btrfs_ordered_extent *ordered;
289 struct page *page;
290 unsigned long last_index;
291 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
292 unsigned long total_read = 0;
293 u64 page_start;
294 u64 page_end;
295 unsigned long i;
296 int ret;
297
298 ret = btrfs_check_free_space(root, inode->i_size, 0);
299 if (ret)
300 return -ENOSPC;
301
302 mutex_lock(&inode->i_mutex);
303 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
304 for (i = 0; i <= last_index; i++) {
305 if (total_read % ra_pages == 0) {
306 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
307 min(last_index, i + ra_pages - 1));
308 }
309 total_read++;
310again:
311 page = grab_cache_page(inode->i_mapping, i);
312 if (!page)
313 goto out_unlock;
314 if (!PageUptodate(page)) {
315 btrfs_readpage(NULL, page);
316 lock_page(page);
317 if (!PageUptodate(page)) {
318 unlock_page(page);
319 page_cache_release(page);
320 goto out_unlock;
321 }
322 }
323
324 wait_on_page_writeback(page);
325
326 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
327 page_end = page_start + PAGE_CACHE_SIZE - 1;
328 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
329
330 ordered = btrfs_lookup_ordered_extent(inode, page_start);
331 if (ordered) {
332 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
333 unlock_page(page);
334 page_cache_release(page);
335 btrfs_start_ordered_extent(inode, ordered, 1);
336 btrfs_put_ordered_extent(ordered);
337 goto again;
338 }
339 set_page_extent_mapped(page);
340
341 /*
342 * this makes sure page_mkwrite is called on the
343 * page if it is dirtied again later
344 */
345 clear_page_dirty_for_io(page);
346
347 btrfs_set_extent_delalloc(inode, page_start, page_end);
348
349 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
350 set_page_dirty(page);
351 unlock_page(page);
352 page_cache_release(page);
353 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
354 }
355
356out_unlock:
357 mutex_unlock(&inode->i_mutex);
358 return 0;
359}
360
361/*
362 * Called inside transaction, so use GFP_NOFS
363 */
364
365static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
366{
367 u64 new_size;
368 u64 old_size;
369 u64 devid = 1;
370 struct btrfs_ioctl_vol_args *vol_args;
371 struct btrfs_trans_handle *trans;
372 struct btrfs_device *device = NULL;
373 char *sizestr;
374 char *devstr = NULL;
375 int ret = 0;
376 int namelen;
377 int mod = 0;
378
379 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
380
381 if (!vol_args)
382 return -ENOMEM;
383
384 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
385 ret = -EFAULT;
386 goto out;
387 }
388
389 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
390 namelen = strlen(vol_args->name);
391
392 mutex_lock(&root->fs_info->volume_mutex);
393 sizestr = vol_args->name;
394 devstr = strchr(sizestr, ':');
395 if (devstr) {
396 char *end;
397 sizestr = devstr + 1;
398 *devstr = '\0';
399 devstr = vol_args->name;
400 devid = simple_strtoull(devstr, &end, 10);
401 printk(KERN_INFO "resizing devid %llu\n", devid);
402 }
403 device = btrfs_find_device(root, devid, NULL);
404 if (!device) {
405 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
406 ret = -EINVAL;
407 goto out_unlock;
408 }
409 if (!strcmp(sizestr, "max"))
410 new_size = device->bdev->bd_inode->i_size;
411 else {
412 if (sizestr[0] == '-') {
413 mod = -1;
414 sizestr++;
415 } else if (sizestr[0] == '+') {
416 mod = 1;
417 sizestr++;
418 }
419 new_size = btrfs_parse_size(sizestr);
420 if (new_size == 0) {
421 ret = -EINVAL;
422 goto out_unlock;
423 }
424 }
425
426 old_size = device->total_bytes;
427
428 if (mod < 0) {
429 if (new_size > old_size) {
430 ret = -EINVAL;
431 goto out_unlock;
432 }
433 new_size = old_size - new_size;
434 } else if (mod > 0) {
435 new_size = old_size + new_size;
436 }
437
438 if (new_size < 256 * 1024 * 1024) {
439 ret = -EINVAL;
440 goto out_unlock;
441 }
442 if (new_size > device->bdev->bd_inode->i_size) {
443 ret = -EFBIG;
444 goto out_unlock;
445 }
446
447 do_div(new_size, root->sectorsize);
448 new_size *= root->sectorsize;
449
450 printk(KERN_INFO "new size for %s is %llu\n",
451 device->name, (unsigned long long)new_size);
452
453 if (new_size > old_size) {
454 trans = btrfs_start_transaction(root, 1);
455 ret = btrfs_grow_device(trans, device, new_size);
456 btrfs_commit_transaction(trans, root);
457 } else {
458 ret = btrfs_shrink_device(device, new_size);
459 }
460
461out_unlock:
462 mutex_unlock(&root->fs_info->volume_mutex);
463out:
464 kfree(vol_args);
465 return ret;
466}
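
The userspace half of a resize is just the vol_args string, in the grammar
parsed above: an optional "devid:" prefix, then "max" or a size with an
optional leading + or -. A hedged sketch assuming the ioctl.h from later in
this patch is on the include path; whether suffixed sizes such as "1g" parse
depends on btrfs_parse_size(), which is not shown here:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"

	int main(int argc, char **argv)
	{
		struct btrfs_ioctl_vol_args args;
		int fd;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <mountpoint> <spec>\n",
				argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		memset(&args, 0, sizeof(args));
		/* e.g. "1:+1g" to grow devid 1, or "max" */
		strncpy(args.name, argv[2], BTRFS_PATH_NAME_MAX);
		if (ioctl(fd, BTRFS_IOC_RESIZE, &args) < 0) {
			perror("BTRFS_IOC_RESIZE");
			return 1;
		}
		return 0;
	}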
467
468static noinline int btrfs_ioctl_snap_create(struct file *file,
469 void __user *arg)
470{
471 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
472 struct btrfs_ioctl_vol_args *vol_args;
473 struct btrfs_dir_item *di;
474 struct btrfs_path *path;
475 u64 root_dirid;
476 int namelen;
477 int ret;
478
479 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
480
481 if (!vol_args)
482 return -ENOMEM;
483
484 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
485 ret = -EFAULT;
486 goto out;
487 }
488
489 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
490 namelen = strlen(vol_args->name);
491 if (strchr(vol_args->name, '/')) {
492 ret = -EINVAL;
493 goto out;
494 }
495
496 path = btrfs_alloc_path();
497 if (!path) {
498 ret = -ENOMEM;
499 goto out;
500 }
501
502 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
503 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
504 path, root_dirid,
505 vol_args->name, namelen, 0);
506 btrfs_free_path(path);
507
508 if (di && !IS_ERR(di)) {
509 ret = -EEXIST;
510 goto out;
511 }
512
513 if (IS_ERR(di)) {
514 ret = PTR_ERR(di);
515 goto out;
516 }
517
518 if (root == root->fs_info->tree_root) {
519 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
520 file->f_path.dentry->d_inode->i_mode,
521 namelen);
522 } else {
523 ret = create_snapshot(root, vol_args->name, namelen);
524 }
525
526out:
527 kfree(vol_args);
528 return ret;
529}
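
From userspace this is one ioctl on an open directory; per the dispatch
above, an fd in the tree root creates a subvolume and an fd in any other
root snapshots that root. A sketch against the ioctl.h below:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"

	int main(int argc, char **argv)
	{
		struct btrfs_ioctl_vol_args args;
		int fd;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <dir> <name>\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		memset(&args, 0, sizeof(args));
		strncpy(args.name, argv[2], BTRFS_PATH_NAME_MAX);
		if (ioctl(fd, BTRFS_IOC_SNAP_CREATE, &args) < 0) {
			perror("BTRFS_IOC_SNAP_CREATE");
			return 1;
		}
		return 0;
	}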
530
531static int btrfs_ioctl_defrag(struct file *file)
532{
533 struct inode *inode = fdentry(file)->d_inode;
534 struct btrfs_root *root = BTRFS_I(inode)->root;
535
536 switch (inode->i_mode & S_IFMT) {
537 case S_IFDIR:
538 btrfs_defrag_root(root, 0);
539 btrfs_defrag_root(root->fs_info->extent_root, 0);
540 break;
541 case S_IFREG:
542 btrfs_defrag_file(file);
543 break;
544 }
545
546 return 0;
547}
548
549long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
550{
551 struct btrfs_ioctl_vol_args *vol_args;
552 int ret;
553
554 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
555
556 if (!vol_args)
557 return -ENOMEM;
558
559 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
560 ret = -EFAULT;
561 goto out;
562 }
563 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
564 ret = btrfs_init_new_device(root, vol_args->name);
565
566out:
567 kfree(vol_args);
568 return ret;
569}
570
571long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
572{
573 struct btrfs_ioctl_vol_args *vol_args;
574 int ret;
575
576 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
577
578 if (!vol_args)
579 return -ENOMEM;
580
581 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
582 ret = -EFAULT;
583 goto out;
584 }
585 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
586 ret = btrfs_rm_device(root, vol_args->name);
587
588out:
589 kfree(vol_args);
590 return ret;
591}
592
593long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
594{
595 struct inode *inode = fdentry(file)->d_inode;
596 struct btrfs_root *root = BTRFS_I(inode)->root;
597 struct file *src_file;
598 struct inode *src;
599 struct btrfs_trans_handle *trans;
600 struct btrfs_path *path;
601 struct extent_buffer *leaf;
602 char *buf;
603 struct btrfs_key key;
604 u32 nritems;
605 int slot;
606 int ret;
607
608 src_file = fget(src_fd);
609 if (!src_file)
610 return -EBADF;
611 src = src_file->f_dentry->d_inode;
612
613 ret = -EISDIR;
614 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
615 goto out_fput;
616
617 ret = -EXDEV;
618 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
619 goto out_fput;
620
621 ret = -ENOMEM;
622 buf = vmalloc(btrfs_level_size(root, 0));
623 if (!buf)
624 goto out_fput;
625
626 path = btrfs_alloc_path();
627 if (!path) {
628 vfree(buf);
629 goto out_fput;
630 }
631 path->reada = 2;
632
633 if (inode < src) {
634 mutex_lock(&inode->i_mutex);
635 mutex_lock(&src->i_mutex);
636 } else {
637 mutex_lock(&src->i_mutex);
638 mutex_lock(&inode->i_mutex);
639 }
640
641 ret = -ENOTEMPTY;
642 if (inode->i_size)
643 goto out_unlock;
644
645 /* do any pending delalloc/csum calc on src, one way or
646 another, and lock file content */
647 while (1) {
648 struct btrfs_ordered_extent *ordered;
649 lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
650 ordered = btrfs_lookup_first_ordered_extent(src, (u64)-1);
651 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
652 break;
653 unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
654 if (ordered)
655 btrfs_put_ordered_extent(ordered);
656 btrfs_wait_ordered_range(src, 0, (u64)-1);
657 }
658
659 trans = btrfs_start_transaction(root, 1);
660 BUG_ON(!trans);
661
662 key.objectid = src->i_ino;
663 key.type = BTRFS_EXTENT_DATA_KEY;
664 key.offset = 0;
665
666 while (1) {
667 /*
668 * note the key will change type as we walk through the
669 * tree.
670 */
671 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
672 if (ret < 0)
673 goto out;
674
675 nritems = btrfs_header_nritems(path->nodes[0]);
676 if (path->slots[0] >= nritems) {
677 ret = btrfs_next_leaf(root, path);
678 if (ret < 0)
679 goto out;
680 if (ret > 0)
681 break;
682 nritems = btrfs_header_nritems(path->nodes[0]);
683 }
684 leaf = path->nodes[0];
685 slot = path->slots[0];
686
687 btrfs_item_key_to_cpu(leaf, &key, slot);
688 if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
689 key.objectid != src->i_ino)
690 break;
691
692 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
693 btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
694 u32 size;
695 struct btrfs_key new_key;
696
697 size = btrfs_item_size_nr(leaf, slot);
698 read_extent_buffer(leaf, buf,
699 btrfs_item_ptr_offset(leaf, slot),
700 size);
701 btrfs_release_path(root, path);
702
703 memcpy(&new_key, &key, sizeof(new_key));
704 new_key.objectid = inode->i_ino;
705 ret = btrfs_insert_empty_item(trans, root, path,
706 &new_key, size);
707 if (ret)
708 goto out;
709
710 leaf = path->nodes[0];
711 slot = path->slots[0];
712 write_extent_buffer(leaf, buf,
713 btrfs_item_ptr_offset(leaf, slot),
714 size);
715 btrfs_mark_buffer_dirty(leaf);
716 }
717
718 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
719 struct btrfs_file_extent_item *extent;
720 int found_type;
721
722 extent = btrfs_item_ptr(leaf, slot,
723 struct btrfs_file_extent_item);
724 found_type = btrfs_file_extent_type(leaf, extent);
725 if (found_type == BTRFS_FILE_EXTENT_REG) {
726 u64 ds = btrfs_file_extent_disk_bytenr(leaf,
727 extent);
728 u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
729 extent);
730 /* ds == 0 means there's a hole */
731 if (ds != 0) {
732 ret = btrfs_inc_extent_ref(trans, root,
733 ds, dl, leaf->start,
734 root->root_key.objectid,
735 trans->transid,
736 inode->i_ino);
737 BUG_ON(ret);
738 }
739 }
740 }
741 btrfs_release_path(root, path);
742 key.offset++;
743 }
744 ret = 0;
745out:
746 btrfs_release_path(root, path);
747 if (ret == 0) {
748 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
749 inode_set_bytes(inode, inode_get_bytes(src));
750 btrfs_i_size_write(inode, src->i_size);
751 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
752 ret = btrfs_update_inode(trans, root, inode);
753 }
754 btrfs_end_transaction(trans, root);
755 unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
756 if (ret)
757 vmtruncate(inode, 0);
758out_unlock:
759 mutex_unlock(&src->i_mutex);
760 mutex_unlock(&inode->i_mutex);
761 vfree(buf);
762 btrfs_free_path(path);
763out_fput:
764 fput(src_file);
765 return ret;
766}
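
The caller passes the source fd as the ioctl argument; the destination must
be an empty regular file on the same root, or the checks above return
-EISDIR, -EXDEV or -ENOTEMPTY. A sketch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"

	int main(int argc, char **argv)
	{
		int src, dst;

		if (argc != 3)
			return 1;
		src = open(argv[1], O_RDONLY);
		dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
		if (src < 0 || dst < 0)
			return 1;
		/* the third ioctl argument is the source fd itself */
		if (ioctl(dst, BTRFS_IOC_CLONE, src) < 0) {
			perror("BTRFS_IOC_CLONE");
			return 1;
		}
		return 0;
	}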
767
768/*
769 * there are many ways the trans_start and trans_end ioctls can lead
770 * to deadlocks. They should only be used by applications that
771 * basically own the machine, and have a very in depth understanding
772 * of all the possible deadlocks and enospc problems.
773 */
774long btrfs_ioctl_trans_start(struct file *file)
775{
776 struct inode *inode = fdentry(file)->d_inode;
777 struct btrfs_root *root = BTRFS_I(inode)->root;
778 struct btrfs_trans_handle *trans;
779 int ret = 0;
780
781 if (!capable(CAP_SYS_ADMIN))
782 return -EPERM;
783
784 if (file->private_data) {
785 ret = -EINPROGRESS;
786 goto out;
787 }
788
789 mutex_lock(&root->fs_info->trans_mutex);
790 root->fs_info->open_ioctl_trans++;
791 mutex_unlock(&root->fs_info->trans_mutex);
792
793 trans = btrfs_start_ioctl_transaction(root, 0);
794 if (trans)
795 file->private_data = trans;
796 else
797 ret = -ENOMEM;
798 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
799out:
800 return ret;
801}
802
803/*
804 * there are many ways the trans_start and trans_end ioctls can lead
805 * to deadlocks. They should only be used by applications that
806 * basically own the machine, and have a very in depth understanding
807 * of all the possible deadlocks and enospc problems.
808 */
809long btrfs_ioctl_trans_end(struct file *file)
810{
811 struct inode *inode = fdentry(file)->d_inode;
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 int ret = 0;
815
816 trans = file->private_data;
817 if (!trans) {
818 ret = -EINVAL;
819 goto out;
820 }
821 btrfs_end_transaction(trans, root);
822 file->private_data = NULL;
823
824 mutex_lock(&root->fs_info->trans_mutex);
825 root->fs_info->open_ioctl_trans--;
826 mutex_unlock(&root->fs_info->trans_mutex);
827
828out:
829 return ret;
830}
831
832long btrfs_ioctl(struct file *file, unsigned int
833 cmd, unsigned long arg)
834{
835 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
836
837 switch (cmd) {
838 case BTRFS_IOC_SNAP_CREATE:
839 return btrfs_ioctl_snap_create(file, (void __user *)arg);
840 case BTRFS_IOC_DEFRAG:
841 return btrfs_ioctl_defrag(file);
842 case BTRFS_IOC_RESIZE:
843 return btrfs_ioctl_resize(root, (void __user *)arg);
844 case BTRFS_IOC_ADD_DEV:
845 return btrfs_ioctl_add_dev(root, (void __user *)arg);
846 case BTRFS_IOC_RM_DEV:
847 return btrfs_ioctl_rm_dev(root, (void __user *)arg);
848 case BTRFS_IOC_BALANCE:
849 return btrfs_balance(root->fs_info->dev_root);
850 case BTRFS_IOC_CLONE:
851 return btrfs_ioctl_clone(file, arg);
852 case BTRFS_IOC_TRANS_START:
853 return btrfs_ioctl_trans_start(file);
854 case BTRFS_IOC_TRANS_END:
855 return btrfs_ioctl_trans_end(file);
856 case BTRFS_IOC_SYNC:
857 btrfs_start_delalloc_inodes(root);
858 btrfs_sync_fs(file->f_dentry->d_sb, 1);
859 return 0;
860 }
861
862 return -ENOTTY;
863}
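
The argument-less commands compose into a crude flush-and-defrag tool. A
sketch: for a regular file the DEFRAG handler ignores its argument, and SYNC
flushes delalloc before committing, as dispatched above:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"

	int main(int argc, char **argv)
	{
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDWR);
		if (fd < 0)
			return 1;
		if (ioctl(fd, BTRFS_IOC_DEFRAG, NULL) < 0)
			perror("BTRFS_IOC_DEFRAG");
		if (ioctl(fd, BTRFS_IOC_SYNC, 0) < 0)
			perror("BTRFS_IOC_SYNC");
		return 0;
	}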
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..85ed35a775b1
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4095
26
27struct btrfs_ioctl_vol_args {
28 char name[BTRFS_PATH_NAME_MAX + 1];
29};
30
31#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
32 struct btrfs_ioctl_vol_args)
33#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
34 struct btrfs_ioctl_vol_args)
35#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
36 struct btrfs_ioctl_vol_args)
37#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
38 struct btrfs_ioctl_vol_args)
39/* trans start and trans end are dangerous, and only for
40 * use by applications that know how to avoid the
41 * resulting deadlocks
42 */
43#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
44#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
45#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
46
47#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
48#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
49 struct btrfs_ioctl_vol_args)
50#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
51 struct btrfs_ioctl_vol_args)
52#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
53 struct btrfs_ioctl_vol_args)
54
55#endif
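The _IO/_IOW numbers above are the whole user-space ABI; snapshot creation, for instance, passes a btrfs_ioctl_vol_args carrying the snapshot name through an fd on the mounted filesystem. A hedged stand-alone sketch (the mount point and snapshot name are assumptions):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define BTRFS_IOCTL_MAGIC	0x94
#define BTRFS_PATH_NAME_MAX	4095
struct btrfs_ioctl_vol_args {
	char name[BTRFS_PATH_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
				   struct btrfs_ioctl_vol_args)

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);	/* assumed mount point */

	memset(&args, 0, sizeof(args));
	strncpy(args.name, "snap1", BTRFS_PATH_NAME_MAX);
	if (fd < 0 || ioctl(fd, BTRFS_IOC_SNAP_CREATE, &args) < 0) {
		perror("snapshot create");
		return 1;
	}
	close(fd);
	return 0;
}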
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..e30aa6e2958f
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per-buffer mutex in an extent buffer. This uses adaptive
30 * spinning, and the spin time is not tuned very extensively. The spinning
31 * does make a big difference in almost every workload, but spinning for
32 * the right amount of time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37int btrfs_tree_lock(struct extent_buffer *eb)
38{
39 int i;
40
41 if (mutex_trylock(&eb->mutex))
42 return 0;
43 for (i = 0; i < 512; i++) {
44 cpu_relax();
45 if (mutex_trylock(&eb->mutex))
46 return 0;
47 }
48 cpu_relax();
49 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
50 return 0;
51}
52
53int btrfs_try_tree_lock(struct extent_buffer *eb)
54{
55 return mutex_trylock(&eb->mutex);
56}
57
58int btrfs_tree_unlock(struct extent_buffer *eb)
59{
60 mutex_unlock(&eb->mutex);
61 return 0;
62}
63
64int btrfs_tree_locked(struct extent_buffer *eb)
65{
66 return mutex_is_locked(&eb->mutex);
67}
68
69/*
70 * btrfs_search_slot uses this to decide if it should drop its locks
71 * before doing something expensive like allocating free blocks for cow.
72 */
73int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
74{
75 int i;
76 struct extent_buffer *eb;
77 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
78 eb = path->nodes[i];
79 if (!eb)
80 break;
81 smp_mb();
82 if (!list_empty(&eb->mutex.wait_list))
83 return 1;
84 }
85 return 0;
86}
87
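The spin-then-block shape of btrfs_tree_lock is not specific to kernel mutexes; here is a stand-alone pthreads analogue of the same idea (the 512 spin count copied from above, otherwise untuned):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

static void adaptive_lock(pthread_mutex_t *mtx)
{
	int i;

	/* fast path: poll for a while on the assumption the holder is in
	 * a short btree-search-like section and will release soon */
	for (i = 0; i < 512; i++)
		if (pthread_mutex_trylock(mtx) == 0)
			return;
	/* holder is in something expensive; give up and sleep */
	pthread_mutex_lock(mtx);
}

int main(void)
{
	adaptive_lock(&m);
	printf("lock acquired\n");
	pthread_mutex_unlock(&m);
	return 0;
}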
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..2eb6caba57c2
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,727 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or the node already in the
37 * tree that overlaps the range being inserted
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42	struct rb_node **p = &root->rb_node;
43	struct rb_node *parent = NULL;
44	struct btrfs_ordered_extent *entry;
45
46	while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70	struct rb_node *n = root->rb_node;
71	struct rb_node *prev = NULL;
72	struct rb_node *test;
73	struct btrfs_ordered_extent *entry;
74	struct btrfs_ordered_extent *prev_entry = NULL;
75
76	while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91	while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105	while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * find the first ordered struct that contains this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, int nocow)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->inode = inode;
184 if (nocow)
185 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
186
187 /* one ref for the tree */
188 atomic_set(&entry->refs, 1);
189 init_waitqueue_head(&entry->wait);
190 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list);
192
193 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node);
195 if (node) {
196		printk(KERN_WARNING "dup entry from add_ordered_extent\n");
197 BUG();
198 }
199 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
200 entry_end(entry) - 1, GFP_NOFS);
201
202 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
203 list_add_tail(&entry->root_extent_list,
204 &BTRFS_I(inode)->root->fs_info->ordered_extents);
205 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
206
207 mutex_unlock(&tree->mutex);
208 BUG_ON(node);
209 return 0;
210}
211
212/*
213 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
214 * when an ordered extent is finished. If the list covers more than one
215 * ordered extent, it is split across multiple extents.
216 */
217int btrfs_add_ordered_sum(struct inode *inode,
218 struct btrfs_ordered_extent *entry,
219 struct btrfs_ordered_sum *sum)
220{
221 struct btrfs_ordered_inode_tree *tree;
222
223 tree = &BTRFS_I(inode)->ordered_tree;
224 mutex_lock(&tree->mutex);
225 list_add_tail(&sum->list, &entry->list);
226 mutex_unlock(&tree->mutex);
227 return 0;
228}
229
230/*
231 * this is used to account for finished IO across a given range
232 * of the file. The IO should not span ordered extents. If
233 * a given ordered_extent is completely done, 1 is returned, otherwise
234 * 0.
235 *
236 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
237 * to make sure this function only returns 1 once for a given ordered extent.
238 */
239int btrfs_dec_test_ordered_pending(struct inode *inode,
240 u64 file_offset, u64 io_size)
241{
242 struct btrfs_ordered_inode_tree *tree;
243 struct rb_node *node;
244 struct btrfs_ordered_extent *entry;
245 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
246 int ret;
247
248 tree = &BTRFS_I(inode)->ordered_tree;
249 mutex_lock(&tree->mutex);
250 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
251 GFP_NOFS);
252 node = tree_search(tree, file_offset);
253 if (!node) {
254 ret = 1;
255 goto out;
256 }
257
258 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
259 if (!offset_in_entry(entry, file_offset)) {
260 ret = 1;
261 goto out;
262 }
263
264 ret = test_range_bit(io_tree, entry->file_offset,
265 entry->file_offset + entry->len - 1,
266 EXTENT_ORDERED, 0);
267 if (ret == 0)
268 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
269out:
270 mutex_unlock(&tree->mutex);
271 return ret == 0;
272}
273
274/*
275 * used to drop a reference on an ordered extent. This will free
276 * the extent if the last reference is dropped
277 */
278int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
279{
280 struct list_head *cur;
281 struct btrfs_ordered_sum *sum;
282
283 if (atomic_dec_and_test(&entry->refs)) {
284		while (!list_empty(&entry->list)) {
285 cur = entry->list.next;
286 sum = list_entry(cur, struct btrfs_ordered_sum, list);
287 list_del(&sum->list);
288 kfree(sum);
289 }
290 kfree(entry);
291 }
292 return 0;
293}
294
295/*
296 * remove an ordered extent from the tree. No references are dropped,
297 * but any waiters on this extent are woken up.
298 */
299int btrfs_remove_ordered_extent(struct inode *inode,
300 struct btrfs_ordered_extent *entry)
301{
302 struct btrfs_ordered_inode_tree *tree;
303 struct rb_node *node;
304
305 tree = &BTRFS_I(inode)->ordered_tree;
306 mutex_lock(&tree->mutex);
307 node = &entry->rb_node;
308 rb_erase(node, &tree->tree);
309 tree->last = NULL;
310 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
311
312 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
313 list_del_init(&entry->root_extent_list);
314 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
315
316 mutex_unlock(&tree->mutex);
317 wake_up(&entry->wait);
318 return 0;
319}
320
321/*
322 * wait for all the ordered extents in a root. This is done when balancing
323 * space between drives.
324 */
325int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
326{
327 struct list_head splice;
328 struct list_head *cur;
329 struct btrfs_ordered_extent *ordered;
330 struct inode *inode;
331
332 INIT_LIST_HEAD(&splice);
333
334 spin_lock(&root->fs_info->ordered_extent_lock);
335 list_splice_init(&root->fs_info->ordered_extents, &splice);
336 while (!list_empty(&splice)) {
337 cur = splice.next;
338 ordered = list_entry(cur, struct btrfs_ordered_extent,
339 root_extent_list);
340 if (nocow_only &&
341 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
392 if (wait)
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395}
396
397/*
398 * Used to wait on ordered extents across a large range of bytes.
399 */
400int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
401{
402 u64 end;
403 u64 orig_end;
404 u64 wait_end;
405 struct btrfs_ordered_extent *ordered;
406
407 if (start + len < start) {
408 orig_end = INT_LIMIT(loff_t);
409 } else {
410 orig_end = start + len - 1;
411 if (orig_end > INT_LIMIT(loff_t))
412 orig_end = INT_LIMIT(loff_t);
413 }
414 wait_end = orig_end;
415again:
416 /* start IO across the range first to instantiate any delalloc
417 * extents
418 */
419 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
420
421 btrfs_wait_on_page_writeback_range(inode->i_mapping,
422 start >> PAGE_CACHE_SHIFT,
423 orig_end >> PAGE_CACHE_SHIFT);
424
425 end = orig_end;
426	while (1) {
427 ordered = btrfs_lookup_first_ordered_extent(inode, end);
428 if (!ordered) {
429 break;
430 }
431 if (ordered->file_offset > orig_end) {
432 btrfs_put_ordered_extent(ordered);
433 break;
434 }
435 if (ordered->file_offset + ordered->len < start) {
436 btrfs_put_ordered_extent(ordered);
437 break;
438 }
439 btrfs_start_ordered_extent(inode, ordered, 1);
440 end = ordered->file_offset;
441 btrfs_put_ordered_extent(ordered);
442 if (end == 0 || end == start)
443 break;
444 end--;
445 }
446 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
447 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
448		printk(KERN_ERR "inode %lu still ordered or delalloc after wait "
449 "%llu %llu\n", inode->i_ino,
450 (unsigned long long)start,
451 (unsigned long long)orig_end);
452 goto again;
453 }
454 return 0;
455}
456
457/*
458 * find an ordered extent corresponding to file_offset. return NULL if
459 * nothing is found, otherwise take a reference on the extent and return it
460 */
461struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
462 u64 file_offset)
463{
464 struct btrfs_ordered_inode_tree *tree;
465 struct rb_node *node;
466 struct btrfs_ordered_extent *entry = NULL;
467
468 tree = &BTRFS_I(inode)->ordered_tree;
469 mutex_lock(&tree->mutex);
470 node = tree_search(tree, file_offset);
471 if (!node)
472 goto out;
473
474 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
475 if (!offset_in_entry(entry, file_offset))
476 entry = NULL;
477 if (entry)
478 atomic_inc(&entry->refs);
479out:
480 mutex_unlock(&tree->mutex);
481 return entry;
482}
483
484/*
485 * look up and return the first ordered extent at or before 'file_offset'.
486 * NULL is returned if none is found
487 */
488struct btrfs_ordered_extent *
489btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
490{
491 struct btrfs_ordered_inode_tree *tree;
492 struct rb_node *node;
493 struct btrfs_ordered_extent *entry = NULL;
494
495 tree = &BTRFS_I(inode)->ordered_tree;
496 mutex_lock(&tree->mutex);
497 node = tree_search(tree, file_offset);
498 if (!node)
499 goto out;
500
501 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
502 atomic_inc(&entry->refs);
503out:
504 mutex_unlock(&tree->mutex);
505 return entry;
506}
507
508/*
509 * After an extent is done, call this to conditionally update the on disk
510 * i_size. i_size is updated to cover any fully written part of the file.
511 */
512int btrfs_ordered_update_i_size(struct inode *inode,
513 struct btrfs_ordered_extent *ordered)
514{
515 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
516 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
517 u64 disk_i_size;
518 u64 new_i_size;
519 u64 i_size_test;
520 struct rb_node *node;
521 struct btrfs_ordered_extent *test;
522
523 mutex_lock(&tree->mutex);
524 disk_i_size = BTRFS_I(inode)->disk_i_size;
525
526 /*
527 * if the disk i_size is already at the inode->i_size, or
528 * this ordered extent is inside the disk i_size, we're done
529 */
530 if (disk_i_size >= inode->i_size ||
531 ordered->file_offset + ordered->len <= disk_i_size) {
532 goto out;
533 }
534
535 /*
536 * we can't update the disk_i_size if there are delalloc bytes
537 * between disk_i_size and this ordered extent
538 */
539 if (test_range_bit(io_tree, disk_i_size,
540 ordered->file_offset + ordered->len - 1,
541 EXTENT_DELALLOC, 0)) {
542 goto out;
543 }
544 /*
545 * walk backward from this ordered extent to disk_i_size.
546 * if we find an ordered extent then we can't update disk i_size
547 * yet
548 */
549 node = &ordered->rb_node;
550	while (1) {
551 node = rb_prev(node);
552 if (!node)
553 break;
554 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
555 if (test->file_offset + test->len <= disk_i_size)
556 break;
557 if (test->file_offset >= inode->i_size)
558 break;
559 if (test->file_offset >= disk_i_size)
560 goto out;
561 }
562 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
563
564 /*
565 * at this point, we know we can safely update i_size to at least
566 * the offset from this ordered extent. But, we need to
567 * walk forward and see if ios from higher up in the file have
568 * finished.
569 */
570 node = rb_next(&ordered->rb_node);
571 i_size_test = 0;
572 if (node) {
573 /*
574		 * check whether IO might have finished in the gap
575		 * between our ordered extent and the next one.
576 */
577 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
578 if (test->file_offset > entry_end(ordered)) {
579 i_size_test = test->file_offset;
580 }
581 } else {
582 i_size_test = i_size_read(inode);
583 }
584
585 /*
586 * i_size_test is the end of a region after this ordered
587 * extent where there are no ordered extents. As long as there
588 * are no delalloc bytes in this area, it is safe to update
589 * disk_i_size to the end of the region.
590 */
591 if (i_size_test > entry_end(ordered) &&
592 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
593 EXTENT_DELALLOC, 0)) {
594 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
595 }
596 BTRFS_I(inode)->disk_i_size = new_i_size;
597out:
598 mutex_unlock(&tree->mutex);
599 return 0;
600}
601
602/*
603 * search the ordered extents for one corresponding to 'offset' and
604 * try to find a checksum. This is used because we allow pages to
605 * be reclaimed before their checksum is actually put into the btree
606 */
607int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
608{
609 struct btrfs_ordered_sum *ordered_sum;
610 struct btrfs_sector_sum *sector_sums;
611 struct btrfs_ordered_extent *ordered;
612 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
613 struct list_head *cur;
614 unsigned long num_sectors;
615 unsigned long i;
616 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
617 int ret = 1;
618
619 ordered = btrfs_lookup_ordered_extent(inode, offset);
620 if (!ordered)
621 return 1;
622
623 mutex_lock(&tree->mutex);
624 list_for_each_prev(cur, &ordered->list) {
625 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
626 if (offset >= ordered_sum->file_offset) {
627 num_sectors = ordered_sum->len / sectorsize;
628 sector_sums = ordered_sum->sums;
629 for (i = 0; i < num_sectors; i++) {
630 if (sector_sums[i].offset == offset) {
631 *sum = sector_sums[i].sum;
632 ret = 0;
633 goto out;
634 }
635 }
636 }
637 }
638out:
639 mutex_unlock(&tree->mutex);
640 btrfs_put_ordered_extent(ordered);
641 return ret;
642}
643
644
645/**
646 * taken from mm/filemap.c because it isn't exported
647 *
648 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
649 * @mapping: address space structure to write
650 * @start: offset in bytes where the range starts
651 * @end: offset in bytes where the range ends (inclusive)
652 * @sync_mode: enable synchronous operation
653 *
654 * Start writeback against all of a mapping's dirty pages that lie
655 * within the byte offsets <start, end> inclusive.
656 *
657 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
658 * opposed to a regular memory cleansing writeback. The difference between
659 * these two operations is that if a dirty page/buffer is encountered, it must
660 * be waited upon, and not just skipped over.
661 */
662int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
663 loff_t end, int sync_mode)
664{
665 struct writeback_control wbc = {
666 .sync_mode = sync_mode,
667 .nr_to_write = mapping->nrpages * 2,
668 .range_start = start,
669 .range_end = end,
670 .for_writepages = 1,
671 };
672 return btrfs_writepages(mapping, &wbc);
673}
674
675/**
676 * taken from mm/filemap.c because it isn't exported
677 *
678 * wait_on_page_writeback_range - wait for writeback to complete
679 * @mapping: target address_space
680 * @start: beginning page index
681 * @end: ending page index
682 *
683 * Wait for writeback to complete against pages indexed by start->end
684 * inclusive
685 */
686int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
687 pgoff_t start, pgoff_t end)
688{
689 struct pagevec pvec;
690 int nr_pages;
691 int ret = 0;
692 pgoff_t index;
693
694 if (end < start)
695 return 0;
696
697 pagevec_init(&pvec, 0);
698 index = start;
699 while ((index <= end) &&
700 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
701 PAGECACHE_TAG_WRITEBACK,
702 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
703 unsigned i;
704
705 for (i = 0; i < nr_pages; i++) {
706 struct page *page = pvec.pages[i];
707
708 /* until radix tree lookup accepts end_index */
709 if (page->index > end)
710 continue;
711
712 wait_on_page_writeback(page);
713 if (PageError(page))
714 ret = -EIO;
715 }
716 pagevec_release(&pvec);
717 cond_resched();
718 }
719
720 /* Check for outstanding write errors */
721 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
722 ret = -ENOSPC;
723 if (test_and_clear_bit(AS_EIO, &mapping->flags))
724 ret = -EIO;
725
726 return ret;
727}
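The containing-or-first-lesser semantics of __tree_search and tree_search above can be sanity-checked against a flat array; a stand-alone sketch with made-up extents:

#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t off, len; };

/* entries sorted by offset, mirroring the rbtree ordering */
static const struct ext exts[] = { { 0, 100 }, { 200, 50 }, { 400, 10 } };
#define NR (sizeof(exts) / sizeof(exts[0]))

static int search(uint64_t off)
{
	int prev = -1;
	size_t i;

	for (i = 0; i < NR; i++) {
		if (off >= exts[i].off && off < exts[i].off + exts[i].len)
			return i;	/* offset inside this entry */
		if (exts[i].off < off)
			prev = i;	/* best "first lesser" so far */
	}
	return prev;
}

int main(void)
{
	/* prints "0 0 1": inside the first extent, a gap falls back to
	 * the first, inside the second */
	printf("%d %d %d\n", search(50), search(150), search(210));
	return 0;
}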
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..f50f8870a144
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,149 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 u64 offset;
37 u32 sum;
38};
39
40struct btrfs_ordered_sum {
41 u64 file_offset;
42 /*
43 * this is the length in bytes covered by the sums array below.
44 * But, the sums array may not be contiguous in the file.
45 */
46 unsigned long len;
47 struct list_head list;
48 /* last field is a variable length array of btrfs_sector_sums */
49 struct btrfs_sector_sum sums[];
50};
51
52/*
53 * bits for the flags field:
54 *
55 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
56 * It is used to make sure metadata is inserted into the tree only once
57 * per extent.
58 *
59 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
60 * rbtree, just before waking any waiters. It is used to indicate the
61 * IO is done and any metadata is inserted into the tree.
62 */
63#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
64
65#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
66
67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
68
69struct btrfs_ordered_extent {
70 /* logical offset in the file */
71 u64 file_offset;
72
73 /* disk byte number */
74 u64 start;
75
76 /* length of the extent in bytes */
77 u64 len;
78
79 /* flags (described above) */
80 unsigned long flags;
81
82 /* reference count */
83 atomic_t refs;
84
85 /* the inode we belong to */
86 struct inode *inode;
87
88 /* list of checksums for insertion when the extent io is done */
89 struct list_head list;
90
91 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
92 wait_queue_head_t wait;
93
94 /* our friendly rbtree entry */
95 struct rb_node rb_node;
96
97 /* a per root list of all the pending ordered extents */
98 struct list_head root_extent_list;
99};
100
101
102/*
103 * calculates the total size you need to allocate for an ordered sum
104 * structure spanning 'bytes' in the file
105 */
106static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
107 unsigned long bytes)
108{
109 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
110 root->sectorsize;
111 num_sectors++;
112 return sizeof(struct btrfs_ordered_sum) +
113 num_sectors * sizeof(struct btrfs_sector_sum);
114}
115
116static inline void
117btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
118{
119 mutex_init(&t->mutex);
120 t->tree.rb_node = NULL;
121 t->last = NULL;
122}
123
124int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
125int btrfs_remove_ordered_extent(struct inode *inode,
126 struct btrfs_ordered_extent *entry);
127int btrfs_dec_test_ordered_pending(struct inode *inode,
128 u64 file_offset, u64 io_size);
129int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
130 u64 start, u64 len, int nocow);
131int btrfs_add_ordered_sum(struct inode *inode,
132 struct btrfs_ordered_extent *entry,
133 struct btrfs_ordered_sum *sum);
134struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
135 u64 file_offset);
136void btrfs_start_ordered_extent(struct inode *inode,
137 struct btrfs_ordered_extent *entry, int wait);
138int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
139struct btrfs_ordered_extent *
140btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset);
141int btrfs_ordered_update_i_size(struct inode *inode,
142 struct btrfs_ordered_extent *ordered);
143int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
144int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
145 pgoff_t start, pgoff_t end);
146int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
147 loff_t end, int sync_mode);
148int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
149#endif
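btrfs_ordered_sum_size above reserves one btrfs_sector_sum more than the rounded-up sector count; a stand-alone check of that arithmetic (4k sectorsize assumed, struct fields simplified):

#include <stdio.h>

/* simplified stand-in for struct btrfs_sector_sum (u64 + u32, padded) */
struct sector_sum { unsigned long long offset; unsigned int sum; };

/* mirrors the inline above: bytes rounded up to sectors, plus one */
static unsigned long sum_entries(unsigned long bytes, unsigned long sectorsize)
{
	return (bytes + sectorsize - 1) / sectorsize + 1;
}

int main(void)
{
	/* 5000 bytes at 4k sectors -> 2 sectors -> 3 entries reserved */
	printf("%lu entries, %lu bytes of sums\n",
	       sum_entries(5000, 4096),
	       sum_entries(5000, 4096) *
	       (unsigned long)sizeof(struct sector_sum));
	return 0;
}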
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
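These two helpers are the storage half of crash-safe unlink: an orphan item keyed (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY, objectid) pins an inode that was unlinked while still open, so mount-time recovery can finish the delete after a crash. A hedged sketch of the intended call order (kernel context; the wrapper names and error handling here are assumptions, not part of this patch):

/* on unlink of an inode that still has users: record it (sketch only) */
static int unlink_keep_orphan(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, struct inode *inode)
{
	return btrfs_insert_orphan_item(trans, root, inode->i_ino);
}

/* on the final iput, once the truncate is done: drop the record */
static int last_close_cleanup(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, struct inode *inode)
{
	return btrfs_del_orphan_item(trans, root, inode->i_ino);
}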
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..bd9ab3e9a7f2
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,200 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
28 (unsigned long long)btrfs_chunk_length(eb, chunk),
29 (unsigned long long)btrfs_chunk_owner(eb, chunk),
30 (unsigned long long)btrfs_chunk_type(eb, chunk),
31 num_stripes);
32	for (i = 0; i < num_stripes; i++) {
33 printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
34 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
35 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
36 }
37}
38static void print_dev_item(struct extent_buffer *eb,
39 struct btrfs_dev_item *dev_item)
40{
41 printk("\t\tdev item devid %llu "
42 "total_bytes %llu bytes used %Lu\n",
43 (unsigned long long)btrfs_device_id(eb, dev_item),
44 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
45 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
46}
47void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
48{
49 int i;
50 u32 nr = btrfs_header_nritems(l);
51 struct btrfs_item *item;
52 struct btrfs_extent_item *ei;
53 struct btrfs_root_item *ri;
54 struct btrfs_dir_item *di;
55 struct btrfs_inode_item *ii;
56 struct btrfs_block_group_item *bi;
57 struct btrfs_file_extent_item *fi;
58 struct btrfs_key key;
59 struct btrfs_key found_key;
60 struct btrfs_extent_ref *ref;
61 struct btrfs_dev_extent *dev_extent;
62 u32 type;
63
64 printk("leaf %llu total ptrs %d free space %d\n",
65 (unsigned long long)btrfs_header_bytenr(l), nr,
66 btrfs_leaf_free_space(root, l));
67	for (i = 0; i < nr; i++) {
68 item = btrfs_item_nr(l, i);
69 btrfs_item_key_to_cpu(l, &key, i);
70 type = btrfs_key_type(&key);
71 printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
72 i,
73 (unsigned long long)key.objectid, type,
74 (unsigned long long)key.offset,
75 btrfs_item_offset(l, item), btrfs_item_size(l, item));
76 switch (type) {
77 case BTRFS_INODE_ITEM_KEY:
78 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
79 printk("\t\tinode generation %llu size %llu mode %o\n",
80 (unsigned long long)btrfs_inode_generation(l, ii),
81 (unsigned long long)btrfs_inode_size(l, ii),
82 btrfs_inode_mode(l, ii));
83 break;
84 case BTRFS_DIR_ITEM_KEY:
85 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
86 btrfs_dir_item_key_to_cpu(l, di, &found_key);
87 printk("\t\tdir oid %llu type %u\n",
88 (unsigned long long)found_key.objectid,
89 btrfs_dir_type(l, di));
90 break;
91 case BTRFS_ROOT_ITEM_KEY:
92 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
93 printk("\t\troot data bytenr %llu refs %u\n",
94 (unsigned long long)btrfs_disk_root_bytenr(l, ri),
95 btrfs_disk_root_refs(l, ri));
96 break;
97 case BTRFS_EXTENT_ITEM_KEY:
98 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
99 printk("\t\textent data refs %u\n",
100 btrfs_extent_refs(l, ei));
101 break;
102 case BTRFS_EXTENT_REF_KEY:
103 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
104 printk("\t\textent back ref root %llu gen %llu "
105 "owner %llu num_refs %lu\n",
106 (unsigned long long)btrfs_ref_root(l, ref),
107 (unsigned long long)btrfs_ref_generation(l, ref),
108 (unsigned long long)btrfs_ref_objectid(l, ref),
109 (unsigned long)btrfs_ref_num_refs(l, ref));
110 break;
111
112 case BTRFS_EXTENT_DATA_KEY:
113 fi = btrfs_item_ptr(l, i,
114 struct btrfs_file_extent_item);
115 if (btrfs_file_extent_type(l, fi) ==
116 BTRFS_FILE_EXTENT_INLINE) {
117 printk("\t\tinline extent data size %u\n",
118 btrfs_file_extent_inline_len(l, item));
119 break;
120 }
121 printk("\t\textent data disk bytenr %llu nr %llu\n",
122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
124 printk("\t\textent data offset %llu nr %llu\n",
125 (unsigned long long)btrfs_file_extent_offset(l, fi),
126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
127 break;
128 case BTRFS_BLOCK_GROUP_ITEM_KEY:
129 bi = btrfs_item_ptr(l, i,
130 struct btrfs_block_group_item);
131 printk("\t\tblock group used %llu\n",
132 (unsigned long long)btrfs_disk_block_group_used(l, bi));
133 break;
134 case BTRFS_CHUNK_ITEM_KEY:
135 print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
136 break;
137 case BTRFS_DEV_ITEM_KEY:
138 print_dev_item(l, btrfs_item_ptr(l, i,
139 struct btrfs_dev_item));
140 break;
141 case BTRFS_DEV_EXTENT_KEY:
142 dev_extent = btrfs_item_ptr(l, i,
143 struct btrfs_dev_extent);
144 printk("\t\tdev extent chunk_tree %llu\n"
145 "\t\tchunk objectid %llu chunk offset %llu "
146 "length %llu\n",
147 (unsigned long long)
148 btrfs_dev_extent_chunk_tree(l, dev_extent),
149 (unsigned long long)
150 btrfs_dev_extent_chunk_objectid(l, dev_extent),
151 (unsigned long long)
152 btrfs_dev_extent_chunk_offset(l, dev_extent),
153 (unsigned long long)
154 btrfs_dev_extent_length(l, dev_extent));
155		}
156 }
157}
158
159void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
160{
161 int i; u32 nr;
162 struct btrfs_key key;
163 int level;
164
165 if (!c)
166 return;
167 nr = btrfs_header_nritems(c);
168 level = btrfs_header_level(c);
169 if (level == 0) {
170 btrfs_print_leaf(root, c);
171 return;
172 }
173 printk("node %llu level %d total ptrs %d free spc %u\n",
174 (unsigned long long)btrfs_header_bytenr(c),
175 btrfs_header_level(c), nr,
176 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
177 for (i = 0; i < nr; i++) {
178 btrfs_node_key_to_cpu(c, &key, i);
179 printk("\tkey %d (%llu %u %llu) block %llu\n",
180 i,
181 (unsigned long long)key.objectid,
182 key.type,
183 (unsigned long long)key.offset,
184 (unsigned long long)btrfs_node_blockptr(c, i));
185 }
186 for (i = 0; i < nr; i++) {
187 struct extent_buffer *next = read_tree_block(root,
188 btrfs_node_blockptr(c, i),
189 btrfs_level_size(root, level - 1),
190 btrfs_node_ptr_generation(c, i));
191 if (btrfs_is_leaf(next) &&
192 btrfs_header_level(c) != 1)
193 BUG();
194 if (btrfs_header_level(next) !=
195		    btrfs_header_level(c) - 1)
196 BUG();
197 btrfs_print_tree(root, next);
198 free_extent_buffer(next);
199 }
200}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..a50ebb67055d
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77	struct rb_node **p = &root->rb_node;
78	struct rb_node *parent = NULL;
79	struct btrfs_leaf_ref *entry;
80
81	while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101	struct rb_node *n = root->rb_node;
102	struct btrfs_leaf_ref *entry;
103
104	while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130	while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled-in leaf ref struct to the cache. -EEXIST is
183 * returned if a ref for this bytenr is already in the tree.
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
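Both removal paths above rely on xchg(&ref->in_tree, 0) so that exactly one of two racing removers performs the rb_erase; the same idiom in stand-alone C11 (names made up):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int in_tree = 1;

/* whoever swaps the 1 out owns the removal; the loser sees 0 and backs off */
static int try_remove(const char *who)
{
	if (!atomic_exchange(&in_tree, 0)) {
		printf("%s: lost the race, skipping\n", who);
		return 0;
	}
	printf("%s: performing the erase\n", who);
	return 1;
}

int main(void)
{
	try_remove("remover A");	/* wins */
	try_remove("remover B");	/* backs off */
	return 0;
}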
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
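btrfs_leaf_ref_size above is the standard flexible-array-member sizing; a stand-alone illustration with stripped-down struct definitions (field layout simplified):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct extent_info { unsigned long long bytenr, num_bytes; };
struct leaf_ref {
	int nritems;
	struct extent_info extents[];	/* flexible array member */
};

int main(void)
{
	int nr = 4;
	size_t size = sizeof(struct leaf_ref) +
		      nr * sizeof(struct extent_info);
	/* one allocation covers the header plus nr trailing extent_infos,
	 * exactly what btrfs_leaf_ref_size computes */
	struct leaf_ref *ref = malloc(size);

	if (!ref)
		return 1;
	memset(ref, 0, size);
	ref->nritems = nr;
	printf("allocated %zu bytes\n", size);
	free(ref);
	return 0;
}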
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..eb7f7655e9d5
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,277 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'.
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. 0 is returned if something was found,
74 * 1 if not, and < 0 on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = (u8)-1;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans,
118		      struct btrfs_root *root, struct btrfs_key *key,
119		      struct btrfs_root_item *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135		printk(KERN_CRIT "unable to update root key %Lu %u %Lu\n",
136 key->objectid, key->type, key->offset);
137 BUG_ON(1);
138 }
139
140 l = path->nodes[0];
141 slot = path->slots[0];
142 ptr = btrfs_item_ptr_offset(l, slot);
143 write_extent_buffer(l, item, ptr, sizeof(*item));
144 btrfs_mark_buffer_dirty(path->nodes[0]);
145out:
146 btrfs_release_path(root, path);
147 btrfs_free_path(path);
148 return ret;
149}
150
151int btrfs_insert_root(struct btrfs_trans_handle *trans,
152		      struct btrfs_root *root, struct btrfs_key *key,
153		      struct btrfs_root_item *item)
154{
155 int ret;
156 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
157 return ret;
158}
159
160/*
161 * at mount time we want to find all the old transaction snapshots that were
162 * in the process of being deleted if we crashed. This is any root item with
163 * an offset lower than the latest root. They need to be queued for deletion
164 * to finish what was happening when we crashed.
165 */
166int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
167 struct btrfs_root *latest)
168{
169 struct btrfs_root *dead_root;
170 struct btrfs_item *item;
171 struct btrfs_root_item *ri;
172 struct btrfs_key key;
173 struct btrfs_key found_key;
174 struct btrfs_path *path;
175 int ret;
176 u32 nritems;
177 struct extent_buffer *leaf;
178 int slot;
179
180 key.objectid = objectid;
181 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
182 key.offset = 0;
183 path = btrfs_alloc_path();
184 if (!path)
185 return -ENOMEM;
186
187again:
188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
189 if (ret < 0)
190 goto err;
191	while (1) {
192 leaf = path->nodes[0];
193 nritems = btrfs_header_nritems(leaf);
194 slot = path->slots[0];
195 if (slot >= nritems) {
196 ret = btrfs_next_leaf(root, path);
197 if (ret)
198 break;
199 leaf = path->nodes[0];
200 nritems = btrfs_header_nritems(leaf);
201 slot = path->slots[0];
202 }
203 item = btrfs_item_nr(leaf, slot);
204 btrfs_item_key_to_cpu(leaf, &key, slot);
205 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
206 goto next;
207
208 if (key.objectid < objectid)
209 goto next;
210
211 if (key.objectid > objectid)
212 break;
213
214 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
215 if (btrfs_disk_root_refs(leaf, ri) != 0)
216 goto next;
217
218 memcpy(&found_key, &key, sizeof(key));
219 key.offset++;
220 btrfs_release_path(root, path);
221 dead_root =
222 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
223 &found_key);
224 if (IS_ERR(dead_root)) {
225 ret = PTR_ERR(dead_root);
226 goto err;
227 }
228
229 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
230 ret = btrfs_add_dead_reloc_root(dead_root);
231 else
232 ret = btrfs_add_dead_root(dead_root, latest);
233 if (ret)
234 goto err;
235 goto again;
236next:
237 slot++;
238 path->slots[0]++;
239 }
240 ret = 0;
241err:
242 btrfs_free_path(path);
243 return ret;
244}
245
246/* drop the root item for 'key' from 'root' */
247int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
248 struct btrfs_key *key)
249{
250 struct btrfs_path *path;
251 int ret;
252 u32 refs;
253 struct btrfs_root_item *ri;
254 struct extent_buffer *leaf;
255
256 path = btrfs_alloc_path();
257 BUG_ON(!path);
258 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
259 if (ret < 0)
260 goto out;
261 if (ret) {
262 btrfs_print_leaf(root, path->nodes[0]);
263 printk("failed to del %Lu %u %Lu\n",
264 key->objectid, key->type, key->offset);
265 }
266 BUG_ON(ret != 0);
267 leaf = path->nodes[0];
268 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
269
270 refs = btrfs_disk_root_refs(leaf, ri);
271 BUG_ON(refs != 0);
272 ret = btrfs_del_item(trans, root, path);
273out:
274 btrfs_release_path(root, path);
275 btrfs_free_path(path);
276 return ret;
277}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..cdedbe144d45
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind an #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file defines the macros and then #includes ctree.h, which results
28 * in cpp creating the functions here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 */
40
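/*
 * Illustrative expansion (a sketch, not the generated code): assuming
 * ctree.h contains an invocation such as
 *
 *	BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
 *
 * the template below stamps out a reader whose slow path behaves
 * roughly like:
 *
 *	u32 btrfs_disk_root_refs(struct extent_buffer *eb,
 *				 struct btrfs_root_item *s)
 *	{
 *		__le32 res;
 *		read_eb_member(eb, s, struct btrfs_root_item, refs, &res);
 *		return le32_to_cpu(res);
 *	}
 *
 * plus a matching btrfs_set_disk_root_refs(), with the mapped-buffer
 * fast path layered on top of both.
 */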
41#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
42u##bits btrfs_##name(struct extent_buffer *eb, \
43 type *s) \
44{ \
45 unsigned long part_offset = (unsigned long)s; \
46 unsigned long offset = part_offset + offsetof(type, member); \
47 type *p; \
48 /* ugly, but we want the fast path here */ \
49 if (eb->map_token && offset >= eb->map_start && \
50 offset + sizeof(((type *)0)->member) <= eb->map_start + \
51 eb->map_len) { \
52 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
53 return le##bits##_to_cpu(p->member); \
54 } \
55 { \
56 int err; \
57 char *map_token; \
58 char *kaddr; \
59 int unmap_on_exit = (eb->map_token == NULL); \
60 unsigned long map_start; \
61 unsigned long map_len; \
62 __le##bits res; \
63 err = map_extent_buffer(eb, offset, \
64 sizeof(((type *)0)->member), \
65 &map_token, &kaddr, \
66 &map_start, &map_len, KM_USER1); \
67 if (err) { \
68 read_eb_member(eb, s, type, member, &res); \
69 return le##bits##_to_cpu(res); \
70 } \
71 p = (type *)(kaddr + part_offset - map_start); \
72 res = le##bits##_to_cpu(p->member); \
73 if (unmap_on_exit) \
74 unmap_extent_buffer(eb, map_token, KM_USER1); \
75 return res; \
76 } \
77} \
78void btrfs_set_##name(struct extent_buffer *eb, \
79 type *s, u##bits val) \
80{ \
81 unsigned long part_offset = (unsigned long)s; \
82 unsigned long offset = part_offset + offsetof(type, member); \
83 type *p; \
84 /* ugly, but we want the fast path here */ \
85 if (eb->map_token && offset >= eb->map_start && \
86 offset + sizeof(((type *)0)->member) <= eb->map_start + \
87 eb->map_len) { \
88 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
89 p->member = cpu_to_le##bits(val); \
90 return; \
91 } \
92 { \
93 int err; \
94 char *map_token; \
95 char *kaddr; \
96 int unmap_on_exit = (eb->map_token == NULL); \
97 unsigned long map_start; \
98 unsigned long map_len; \
99 err = map_extent_buffer(eb, offset, \
100 sizeof(((type *)0)->member), \
101 &map_token, &kaddr, \
102 &map_start, &map_len, KM_USER1); \
103 if (err) { \
104 val = cpu_to_le##bits(val); \
105 write_eb_member(eb, s, type, member, &val); \
106 return; \
107 } \
108 p = (type *)(kaddr + part_offset - map_start); \
109 p->member = cpu_to_le##bits(val); \
110 if (unmap_on_exit) \
111 unmap_extent_buffer(eb, map_token, KM_USER1); \
112 } \
113}
114
115#include "ctree.h"
116
117void btrfs_node_key(struct extent_buffer *eb,
118 struct btrfs_disk_key *disk_key, int nr)
119{
120 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
121 if (eb->map_token && ptr >= eb->map_start &&
122 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
123 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
124 sizeof(*disk_key));
125 return;
126 } else if (eb->map_token) {
127 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
128 eb->map_token = NULL;
129 }
130 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
131 struct btrfs_key_ptr, key, disk_key);
132}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..2e6039825b7b
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,659 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "xattr.h"
47#include "volumes.h"
48#include "version.h"
49#include "export.h"
50
51#define BTRFS_SUPER_MAGIC 0x9123683E
52
53static struct super_operations btrfs_super_ops;
54
55static void btrfs_put_super (struct super_block * sb)
56{
57 struct btrfs_root *root = btrfs_sb(sb);
58 struct btrfs_fs_info *fs = root->fs_info;
59 int ret;
60
61 ret = close_ctree(root);
62 if (ret) {
63 printk(KERN_ERR "btrfs: close_ctree returned %d\n", ret);
64 }
65 btrfs_sysfs_del_super(fs);
66 sb->s_fs_info = NULL;
67}
68
69enum {
70 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
71 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
72 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err,
73};
74
75static match_table_t tokens = {
76 {Opt_degraded, "degraded"},
77 {Opt_subvol, "subvol=%s"},
78 {Opt_device, "device=%s"},
79 {Opt_nodatasum, "nodatasum"},
80 {Opt_nodatacow, "nodatacow"},
81 {Opt_nobarrier, "nobarrier"},
82 {Opt_max_extent, "max_extent=%s"},
83 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"},
86 {Opt_ssd, "ssd"},
87 {Opt_noacl, "noacl"},
88 {Opt_err, NULL},
89};
90
91u64 btrfs_parse_size(char *str)
92{
93 u64 res;
94 int mult = 1;
95 char *end;
96 char last;
97
98 res = simple_strtoul(str, &end, 10);
99
100 last = end[0];
101 if (isalpha(last)) {
102 last = tolower(last);
103 switch (last) {
104 case 'g':
105 mult *= 1024; /* fall through */
106 case 'm':
107 mult *= 1024; /* fall through */
108 case 'k':
109 mult *= 1024;
110 }
111 res = res * mult;
112 }
113 return res;
114}
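
/*
 * Worked example (illustrative): "64k" parses to 64 << 10 = 65536, and
 * "10m" falls through the 'm' and 'k' cases, multiplying by 1024 twice
 * for 10485760, while a bare "4096" is returned unchanged. The missing
 * breaks above are what let each larger suffix accumulate the smaller
 * multipliers.
 */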
115
116/*
117 * Regular mount options parser. Everything that is needed only when
118 * reading in a new superblock is parsed here.
119 */
120int btrfs_parse_options(struct btrfs_root *root, char *options)
121{
122 struct btrfs_fs_info *info = root->fs_info;
123 substring_t args[MAX_OPT_ARGS];
124 char *p, *num;
125 int intarg;
126
127 if (!options)
128 return 0;
129
130 /*
131 * strsep changes the string, duplicate it because parse_options
132 * gets called twice
133 */
134 options = kstrdup(options, GFP_NOFS);
135 if (!options)
136 return -ENOMEM;
137
138
139 while ((p = strsep(&options, ",")) != NULL) {
140 int token;
141 if (!*p)
142 continue;
143
144 token = match_token(p, tokens, args);
145 switch (token) {
146 case Opt_degraded:
147 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
148 btrfs_set_opt(info->mount_opt, DEGRADED);
149 break;
150 case Opt_subvol:
151 case Opt_device:
152 /*
153 * These are parsed by btrfs_parse_early_options
154 * and can be happily ignored here.
155 */
156 break;
157 case Opt_nodatasum:
158 printk(KERN_INFO "btrfs: setting nodatacsum\n");
159 btrfs_set_opt(info->mount_opt, NODATASUM);
160 break;
161 case Opt_nodatacow:
162 printk(KERN_INFO "btrfs: setting nodatacow\n");
163 btrfs_set_opt(info->mount_opt, NODATACOW);
164 btrfs_set_opt(info->mount_opt, NODATASUM);
165 break;
166 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
168 btrfs_set_opt(info->mount_opt, SSD);
169 break;
170 case Opt_nobarrier:
171 printk(KERN_INFO "btrfs: turning off barriers\n");
172 btrfs_set_opt(info->mount_opt, NOBARRIER);
173 break;
174 case Opt_thread_pool:
175 intarg = 0;
176 match_int(&args[0], &intarg);
177 if (intarg) {
178 info->thread_pool_size = intarg;
179 printk(KERN_INFO "btrfs: thread pool %d\n",
180 info->thread_pool_size);
181 }
182 break;
183 case Opt_max_extent:
184 num = match_strdup(&args[0]);
185 if (num) {
186 info->max_extent = btrfs_parse_size(num);
187 kfree(num);
188
189 info->max_extent = max_t(u64,
190 info->max_extent, root->sectorsize);
191 printk(KERN_INFO "btrfs: max_extent at %llu\n",
192 info->max_extent);
193 }
194 break;
195 case Opt_max_inline:
196 num = match_strdup(&args[0]);
197 if (num) {
198 info->max_inline = btrfs_parse_size(num);
199 kfree(num);
200
201 if (info->max_inline) {
202 info->max_inline = max_t(u64,
203 info->max_inline,
204 root->sectorsize);
205 }
206 printk(KERN_INFO "btrfs: max_inline at %llu\n",
207 info->max_inline);
208 }
209 break;
210 case Opt_alloc_start:
211 num = match_strdup(&args[0]);
212 if (num) {
213 info->alloc_start = btrfs_parse_size(num);
214 kfree(num);
215 printk(KERN_INFO
216 "btrfs: allocations start at %llu\n",
217 info->alloc_start);
218 }
219 break;
220 case Opt_noacl:
221 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
222 break;
223 default:
224 break;
225 }
226 }
227 kfree(options);
228 return 0;
229}
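
/*
 * Usage sketch (hypothetical device and values): a mount like
 *
 *	mount -t btrfs -o degraded,ssd,max_inline=8k /dev/sdb /mnt
 *
 * arrives here as the string "degraded,ssd,max_inline=8k"; strsep()
 * splits it on commas and match_token() maps the pieces to
 * Opt_degraded, Opt_ssd and Opt_max_inline, with args[0] holding "8k"
 * for btrfs_parse_size() above.
 */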
230
231/*
232 * Parse mount options that are required early in the mount process.
233 *
234 * All other options will be parsed much later in the mount process and
235 * only when we need to allocate a new super block.
236 */
237static int btrfs_parse_early_options(const char *options, int flags,
238 void *holder, char **subvol_name,
239 struct btrfs_fs_devices **fs_devices)
240{
241 substring_t args[MAX_OPT_ARGS];
242 char *opts, *p;
243 int error = 0;
244
245 if (!options)
246 goto out;
247
248 /*
249 * strsep changes the string, duplicate it because parse_options
250 * gets called twice
251 */
252 opts = kstrdup(options, GFP_KERNEL);
253 if (!opts)
254 return -ENOMEM;
255
256 while ((p = strsep(&opts, ",")) != NULL) {
257 int token;
258 if (!*p)
259 continue;
260
261 token = match_token(p, tokens, args);
262 switch (token) {
263 case Opt_subvol:
264 *subvol_name = match_strdup(&args[0]);
265 break;
266 case Opt_device:
267 error = btrfs_scan_one_device(match_strdup(&args[0]),
268 flags, holder, fs_devices);
269 if (error)
270 goto out_free_opts;
271 break;
272 default:
273 break;
274 }
275 }
276
277 out_free_opts:
278 kfree(opts);
279 out:
280 /*
281 * If no subvolume name is specified we use the default one. Allocate
282 * a copy of the string "default" here so that code later in the
283 * mount path doesn't care if it's the default volume or another one.
284 */
285 if (!*subvol_name) {
286 *subvol_name = kstrdup("default", GFP_KERNEL);
287 if (!*subvol_name)
288 return -ENOMEM;
289 }
290 return error;
291}
292
293static int btrfs_fill_super(struct super_block * sb,
294 struct btrfs_fs_devices *fs_devices,
295 void * data, int silent)
296{
297 struct inode * inode;
298 struct dentry * root_dentry;
299 struct btrfs_super_block *disk_super;
300 struct btrfs_root *tree_root;
301 struct btrfs_inode *bi;
302 int err;
303
304 sb->s_maxbytes = MAX_LFS_FILESIZE;
305 sb->s_magic = BTRFS_SUPER_MAGIC;
306 sb->s_op = &btrfs_super_ops;
307 sb->s_export_op = &btrfs_export_ops;
308 sb->s_xattr = btrfs_xattr_handlers;
309 sb->s_time_gran = 1;
310 sb->s_flags |= MS_POSIXACL;
311
312 tree_root = open_ctree(sb, fs_devices, (char *)data);
313
314 if (IS_ERR(tree_root)) {
315 printk("btrfs: open_ctree failed\n");
316 return PTR_ERR(tree_root);
317 }
318 sb->s_fs_info = tree_root;
319 disk_super = &tree_root->fs_info->super_copy;
320 inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
321 tree_root);
322 if (!inode) {
323 err = -ENOMEM;
324 goto fail_close;
325 }
326
327 bi = BTRFS_I(inode);
328 bi->location.objectid = inode->i_ino;
329 bi->location.offset = 0;
330 bi->root = tree_root;
331 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
332
333 if (inode->i_state & I_NEW) {
334 btrfs_read_locked_inode(inode);
335 unlock_new_inode(inode);
336 }
337
338 root_dentry = d_alloc_root(inode);
339 if (!root_dentry) {
340 iput(inode);
341 err = -ENOMEM;
342 goto fail_close;
343 }
344
345 /* this does the super kobj at the same time */
346 err = btrfs_sysfs_add_super(tree_root->fs_info);
347 if (err)
348 goto fail_close;
349
350 sb->s_root = root_dentry;
351
352 save_mount_options(sb, data);
353 return 0;
354
355fail_close:
356 close_ctree(tree_root);
357 return err;
358}
359
360int btrfs_sync_fs(struct super_block *sb, int wait)
361{
362 struct btrfs_trans_handle *trans;
363 struct btrfs_root *root;
364 int ret;
365 root = btrfs_sb(sb);
366
367 sb->s_dirt = 0;
368 if (!wait) {
369 filemap_flush(root->fs_info->btree_inode->i_mapping);
370 return 0;
371 }
372 btrfs_clean_old_snapshots(root);
373 trans = btrfs_start_transaction(root, 1);
374 ret = btrfs_commit_transaction(trans, root);
375 sb->s_dirt = 0;
376 return ret;
377}
378
379static void btrfs_write_super(struct super_block *sb)
380{
381 sb->s_dirt = 0;
382}
383
384static int btrfs_test_super(struct super_block *s, void *data)
385{
386 struct btrfs_fs_devices *test_fs_devices = data;
387 struct btrfs_root *root = btrfs_sb(s);
388
389 return root->fs_info->fs_devices == test_fs_devices;
390}
391
392/*
393 * Find a superblock for the given device / mount point.
394 *
395 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
396 * for multiple device setup. Make sure to keep it in sync.
397 */
398static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
399 const char *dev_name, void *data, struct vfsmount *mnt)
400{
401 char *subvol_name = NULL;
402 struct block_device *bdev = NULL;
403 struct super_block *s;
404 struct dentry *root;
405 struct btrfs_fs_devices *fs_devices = NULL;
406 int error = 0;
407
408 error = btrfs_parse_early_options(data, flags, fs_type,
409 &subvol_name, &fs_devices);
410 if (error)
411 goto error;
412
413 error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
414 if (error)
415 goto error_free_subvol_name;
416
417 error = btrfs_open_devices(fs_devices, flags, fs_type);
418 if (error)
419 goto error_free_subvol_name;
420
421 bdev = fs_devices->latest_bdev;
422 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
423 if (IS_ERR(s))
424 goto error_s;
425
426 if (s->s_root) {
427 if ((flags ^ s->s_flags) & MS_RDONLY) {
428 up_write(&s->s_umount);
429 deactivate_super(s);
430 error = -EBUSY;
431 goto error_bdev;
432 }
433
434 } else {
435 char b[BDEVNAME_SIZE];
436
437 s->s_flags = flags;
438 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
439 error = btrfs_fill_super(s, fs_devices, data,
440 flags & MS_SILENT ? 1 : 0);
441 if (error) {
442 up_write(&s->s_umount);
443 deactivate_super(s);
444 goto error;
445 }
446
447 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
448 s->s_flags |= MS_ACTIVE;
449 }
450
451 if (!strcmp(subvol_name, "."))
452 root = dget(s->s_root);
453 else {
454 mutex_lock(&s->s_root->d_inode->i_mutex);
455 root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
456 mutex_unlock(&s->s_root->d_inode->i_mutex);
457 if (IS_ERR(root)) {
458 up_write(&s->s_umount);
459 deactivate_super(s);
460 error = PTR_ERR(root);
461 goto error;
462 }
463 if (!root->d_inode) {
464 dput(root);
465 up_write(&s->s_umount);
466 deactivate_super(s);
467 error = -ENXIO;
468 goto error;
469 }
470 }
471
472 mnt->mnt_sb = s;
473 mnt->mnt_root = root;
474
475 kfree(subvol_name);
476 return 0;
477
478error_s:
479 error = PTR_ERR(s);
480error_bdev:
481 btrfs_close_devices(fs_devices);
482error_free_subvol_name:
483 kfree(subvol_name);
484error:
485 return error;
486}
487
488static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
489{
490 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
491 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
492 int bits = dentry->d_sb->s_blocksize_bits;
493 __be32 *fsid = (__be32 *)root->fs_info->fsid;
494
495 buf->f_namelen = BTRFS_NAME_LEN;
496 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
497 buf->f_bfree = buf->f_blocks -
498 (btrfs_super_bytes_used(disk_super) >> bits);
499 buf->f_bavail = buf->f_bfree;
500 buf->f_bsize = dentry->d_sb->s_blocksize;
501 buf->f_type = BTRFS_SUPER_MAGIC;
502 /* We treat it as constant endianness (it doesn't matter _which_)
503 * because we want the fsid to come out the same whether mounted
504 * on a big-endian or little-endian host */
505 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
506 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
507 /* Mask in the root object ID too, to disambiguate subvols */
508 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
509 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
510
511 return 0;
512}
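
/*
 * Worked example (illustrative fsid): for the 16 bytes
 * aa bb cc dd ee ff 00 11 22 33 44 55 66 77 88 99, the folding above
 * yields val[0] = 0xaabbccdd ^ 0x22334455 and
 * val[1] = 0xeeff0011 ^ 0x66778899 on any host, because each word is
 * read as big-endian before the XOR; the subvolume's root objectid is
 * then mixed into both halves.
 */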
513
514static struct file_system_type btrfs_fs_type = {
515 .owner = THIS_MODULE,
516 .name = "btrfs",
517 .get_sb = btrfs_get_sb,
518 .kill_sb = kill_anon_super,
519 .fs_flags = FS_REQUIRES_DEV,
520};
521
522/*
523 * used by btrfsctl to scan devices when no FS is mounted
524 */
525static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
526 unsigned long arg)
527{
528 struct btrfs_ioctl_vol_args *vol;
529 struct btrfs_fs_devices *fs_devices;
530 int ret = 0;
531 int len;
532
533 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
534 if (!vol || copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
535 ret = vol ? -EFAULT : -ENOMEM; /* kfree(NULL) below is safe */
536 goto out;
537 }
538 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
539 switch (cmd) {
540 case BTRFS_IOC_SCAN_DEV:
541 ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
542 &btrfs_fs_type, &fs_devices);
543 break;
544 }
545out:
546 kfree(vol);
547 return ret;
548}
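
/*
 * Userspace sketch (illustrative, error handling omitted) of how a
 * tool such as btrfsctl can reach this handler through the
 * "btrfs-control" misc device registered below:
 *
 *	struct btrfs_ioctl_vol_args args;
 *	int fd = open("/dev/btrfs-control", O_RDWR);
 *
 *	strncpy(args.name, "/dev/sdb", BTRFS_PATH_NAME_MAX);
 *	ioctl(fd, BTRFS_IOC_SCAN_DEV, &args);
 *	close(fd);
 */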
549
550static void btrfs_write_super_lockfs(struct super_block *sb)
551{
552 struct btrfs_root *root = btrfs_sb(sb);
553 mutex_lock(&root->fs_info->transaction_kthread_mutex);
554 mutex_lock(&root->fs_info->cleaner_mutex);
555}
556
557static void btrfs_unlockfs(struct super_block *sb)
558{
559 struct btrfs_root *root = btrfs_sb(sb);
560 mutex_unlock(&root->fs_info->cleaner_mutex);
561 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
562}
563
564static struct super_operations btrfs_super_ops = {
565 .delete_inode = btrfs_delete_inode,
566 .put_super = btrfs_put_super,
567 .write_super = btrfs_write_super,
568 .sync_fs = btrfs_sync_fs,
569 .show_options = generic_show_options,
570 .write_inode = btrfs_write_inode,
571 .dirty_inode = btrfs_dirty_inode,
572 .alloc_inode = btrfs_alloc_inode,
573 .destroy_inode = btrfs_destroy_inode,
574 .statfs = btrfs_statfs,
575 .write_super_lockfs = btrfs_write_super_lockfs,
576 .unlockfs = btrfs_unlockfs,
577};
578
579static const struct file_operations btrfs_ctl_fops = {
580 .unlocked_ioctl = btrfs_control_ioctl,
581 .compat_ioctl = btrfs_control_ioctl,
582 .owner = THIS_MODULE,
583};
584
585static struct miscdevice btrfs_misc = {
586 .minor = MISC_DYNAMIC_MINOR,
587 .name = "btrfs-control",
588 .fops = &btrfs_ctl_fops
589};
590
591static int btrfs_interface_init(void)
592{
593 return misc_register(&btrfs_misc);
594}
595
596void btrfs_interface_exit(void)
597{
598 if (misc_deregister(&btrfs_misc) < 0)
599 printk(KERN_ERR "btrfs: misc_deregister failed for control device\n");
600}
601
602static int __init init_btrfs_fs(void)
603{
604 int err;
605
606 err = btrfs_init_sysfs();
607 if (err)
608 return err;
609
610 err = btrfs_init_cachep();
611 if (err)
612 goto free_sysfs;
613
614 err = extent_io_init();
615 if (err)
616 goto free_cachep;
617
618 err = extent_map_init();
619 if (err)
620 goto free_extent_io;
621
622 err = btrfs_interface_init();
623 if (err)
624 goto free_extent_map;
625 err = register_filesystem(&btrfs_fs_type);
626 if (err)
627 goto unregister_ioctl;
628
629 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
630 return 0;
631
632unregister_ioctl:
633 btrfs_interface_exit();
634free_extent_map:
635 extent_map_exit();
636free_extent_io:
637 extent_io_exit();
638free_cachep:
639 btrfs_destroy_cachep();
640free_sysfs:
641 btrfs_exit_sysfs();
642 return err;
643}
644
645static void __exit exit_btrfs_fs(void)
646{
647 btrfs_destroy_cachep();
648 extent_map_exit();
649 extent_io_exit();
650 btrfs_interface_exit();
651 unregister_filesystem(&btrfs_fs_type);
652 btrfs_exit_sysfs();
653 btrfs_cleanup_fs_uuids();
654}
655
656module_init(init_btrfs_fs)
657module_exit(exit_btrfs_fs)
658
659MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..300076e66765
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,268 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
71
72ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
73ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
74
75static struct attribute *btrfs_root_attrs[] = {
76 &btrfs_root_attr_blocks_used.attr,
77 &btrfs_root_attr_block_limit.attr,
78 NULL,
79};
80
81/* this is for super attrs (actual full fs) */
82struct btrfs_super_attr {
83 struct attribute attr;
84 ssize_t (*show)(struct btrfs_fs_info *, char *);
85 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
86};
87
88#define SUPER_ATTR(name, mode, show, store) \
89static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
90
91SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
92SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
93SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
94
95static struct attribute *btrfs_super_attrs[] = {
96 &btrfs_super_attr_blocks_used.attr,
97 &btrfs_super_attr_total_blocks.attr,
98 &btrfs_super_attr_blocksize.attr,
99 NULL,
100};
101
102static ssize_t btrfs_super_attr_show(struct kobject *kobj,
103 struct attribute *attr, char *buf)
104{
105 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
106 super_kobj);
107 struct btrfs_super_attr *a = container_of(attr,
108 struct btrfs_super_attr,
109 attr);
110
111 return a->show ? a->show(fs, buf) : 0;
112}
113
114static ssize_t btrfs_super_attr_store(struct kobject *kobj,
115 struct attribute *attr,
116 const char *buf, size_t len)
117{
118 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
119 super_kobj);
120 struct btrfs_super_attr *a = container_of(attr,
121 struct btrfs_super_attr,
122 attr);
123
124 return a->store ? a->store(fs, buf, len) : 0;
125}
126
127static ssize_t btrfs_root_attr_show(struct kobject *kobj,
128 struct attribute *attr, char *buf)
129{
130 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
131 root_kobj);
132 struct btrfs_root_attr *a = container_of(attr,
133 struct btrfs_root_attr,
134 attr);
135
136 return a->show ? a->show(root, buf) : 0;
137}
138
139static ssize_t btrfs_root_attr_store(struct kobject *kobj,
140 struct attribute *attr,
141 const char *buf, size_t len)
142{
143 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
144 root_kobj);
145 struct btrfs_root_attr *a = container_of(attr,
146 struct btrfs_root_attr,
147 attr);
148 return a->store ? a->store(root, buf, len) : 0;
149}
150
151static void btrfs_super_release(struct kobject *kobj)
152{
153 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
154 super_kobj);
155 complete(&fs->kobj_unregister);
156}
157
158static void btrfs_root_release(struct kobject *kobj)
159{
160 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
161 root_kobj);
162 complete(&root->kobj_unregister);
163}
164
165static struct sysfs_ops btrfs_super_attr_ops = {
166 .show = btrfs_super_attr_show,
167 .store = btrfs_super_attr_store,
168};
169
170static struct sysfs_ops btrfs_root_attr_ops = {
171 .show = btrfs_root_attr_show,
172 .store = btrfs_root_attr_store,
173};
174
175static struct kobj_type btrfs_root_ktype = {
176 .default_attrs = btrfs_root_attrs,
177 .sysfs_ops = &btrfs_root_attr_ops,
178 .release = btrfs_root_release,
179};
180
181static struct kobj_type btrfs_super_ktype = {
182 .default_attrs = btrfs_super_attrs,
183 .sysfs_ops = &btrfs_super_attr_ops,
184 .release = btrfs_super_release,
185};
186
187/* /sys/fs/btrfs/ entry */
188static struct kset *btrfs_kset;
189
190int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
191{
192 int error;
193 char *name;
194 char c;
195 int len = strlen(fs->sb->s_id) + 1;
196 int i;
197
198 name = kmalloc(len, GFP_NOFS);
199 if (!name) {
200 error = -ENOMEM;
201 goto fail;
202 }
203
204 for (i = 0; i < len; i++) {
205 c = fs->sb->s_id[i];
206 if (c == '/' || c == '\\')
207 c = '!';
208 name[i] = c;
209 }
210 name[len - 1] = '\0';
211
212 fs->super_kobj.kset = btrfs_kset;
213 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
214 NULL, "%s", name);
215 if (error)
216 goto fail;
217
218 kfree(name);
219 return 0;
220
221fail:
222 kfree(name);
223 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
224 return error;
225}
226
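/*
 * Resulting layout (illustrative, for a filesystem whose s_id is sda):
 *
 *	/sys/fs/btrfs/sda/blocks_used
 *	/sys/fs/btrfs/sda/total_blocks
 *	/sys/fs/btrfs/sda/blocksize
 *
 * with one directory per registered subvolume root added under the
 * same kobject by btrfs_sysfs_add_root() below, e.g.
 * /sys/fs/btrfs/sda/<root name>/blocks_used.
 */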
227int btrfs_sysfs_add_root(struct btrfs_root *root)
228{
229 int error;
230
231 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
232 &root->fs_info->super_kobj,
233 "%s", root->name);
234 if (error)
235 goto fail;
236
237 return 0;
238
239fail:
240 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
241 return error;
242}
243
244void btrfs_sysfs_del_root(struct btrfs_root *root)
245{
246 kobject_put(&root->root_kobj);
247 wait_for_completion(&root->kobj_unregister);
248}
249
250void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
251{
252 kobject_put(&fs->super_kobj);
253 wait_for_completion(&fs->kobj_unregister);
254}
255
256int btrfs_init_sysfs(void)
257{
258 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
259 if (!btrfs_kset)
260 return -ENOMEM;
261 return 0;
262}
263
264void btrfs_exit_sysfs(void)
265{
266 kset_unregister(btrfs_kset);
267}
268
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..5ecc24d634a2
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1023 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include "ctree.h"
24#include "disk-io.h"
25#include "transaction.h"
26#include "locking.h"
27#include "ref-cache.h"
28#include "tree-log.h"
29
30static int total_trans = 0;
31extern struct kmem_cache *btrfs_trans_handle_cachep;
32extern struct kmem_cache *btrfs_transaction_cachep;
33
34#define BTRFS_ROOT_TRANS_TAG 0
35
36static noinline void put_transaction(struct btrfs_transaction *transaction)
37{
38 WARN_ON(transaction->use_count == 0);
39 transaction->use_count--;
40 if (transaction->use_count == 0) {
41 WARN_ON(total_trans == 0);
42 total_trans--;
43 list_del_init(&transaction->list);
44 memset(transaction, 0, sizeof(*transaction));
45 kmem_cache_free(btrfs_transaction_cachep, transaction);
46 }
47}
48
49/*
50 * either allocate a new transaction or hop into the existing one
51 */
52static noinline int join_transaction(struct btrfs_root *root)
53{
54 struct btrfs_transaction *cur_trans;
55 cur_trans = root->fs_info->running_transaction;
56 if (!cur_trans) {
57 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
58 GFP_NOFS);
59 total_trans++;
60 BUG_ON(!cur_trans);
61 root->fs_info->generation++;
62 root->fs_info->last_alloc = 0;
63 root->fs_info->last_data_alloc = 0;
64 cur_trans->num_writers = 1;
65 cur_trans->num_joined = 0;
66 cur_trans->transid = root->fs_info->generation;
67 init_waitqueue_head(&cur_trans->writer_wait);
68 init_waitqueue_head(&cur_trans->commit_wait);
69 cur_trans->in_commit = 0;
70 cur_trans->blocked = 0;
71 cur_trans->use_count = 1;
72 cur_trans->commit_done = 0;
73 cur_trans->start_time = get_seconds();
74 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
75 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
76 extent_io_tree_init(&cur_trans->dirty_pages,
77 root->fs_info->btree_inode->i_mapping,
78 GFP_NOFS);
79 spin_lock(&root->fs_info->new_trans_lock);
80 root->fs_info->running_transaction = cur_trans;
81 spin_unlock(&root->fs_info->new_trans_lock);
82 } else {
83 cur_trans->num_writers++;
84 cur_trans->num_joined++;
85 }
86
87 return 0;
88}
89
90/*
91 * this does all the record keeping required to make sure that a
92 * reference counted root is properly recorded in a given transaction.
93 * This ensures that the old root from before we joined the transaction
94 * is deleted when the transaction commits
95 */
96noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
97{
98 struct btrfs_dirty_root *dirty;
99 u64 running_trans_id = root->fs_info->running_transaction->transid;
100 if (root->ref_cows && root->last_trans < running_trans_id) {
101 WARN_ON(root == root->fs_info->extent_root);
102 if (root->root_item.refs != 0) {
103 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
104 (unsigned long)root->root_key.objectid,
105 BTRFS_ROOT_TRANS_TAG);
106
107 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
108 BUG_ON(!dirty);
109 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
110 BUG_ON(!dirty->root);
111 dirty->latest_root = root;
112 INIT_LIST_HEAD(&dirty->list);
113
114 root->commit_root = btrfs_root_node(root);
115
116 memcpy(dirty->root, root, sizeof(*root));
117 spin_lock_init(&dirty->root->node_lock);
118 spin_lock_init(&dirty->root->list_lock);
119 mutex_init(&dirty->root->objectid_mutex);
120 mutex_init(&dirty->root->log_mutex);
121 INIT_LIST_HEAD(&dirty->root->dead_list);
122 dirty->root->node = root->commit_root;
123 dirty->root->commit_root = NULL;
124
125 spin_lock(&root->list_lock);
126 list_add(&dirty->root->dead_list, &root->dead_list);
127 spin_unlock(&root->list_lock);
128
129 root->dirty_root = dirty;
130 } else {
131 WARN_ON(1);
132 }
133 root->last_trans = running_trans_id;
134 }
135 return 0;
136}
137
138/* wait for commit against the current transaction to become unblocked.
139 * When this is done, it is safe to start a new transaction, but the current
140 * transaction might not be fully on disk.
141 */
142static void wait_current_trans(struct btrfs_root *root)
143{
144 struct btrfs_transaction *cur_trans;
145
146 cur_trans = root->fs_info->running_transaction;
147 if (cur_trans && cur_trans->blocked) {
148 DEFINE_WAIT(wait);
149 cur_trans->use_count++;
150 while(1) {
151 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
152 TASK_UNINTERRUPTIBLE);
153 if (cur_trans->blocked) {
154 mutex_unlock(&root->fs_info->trans_mutex);
155 schedule();
156 mutex_lock(&root->fs_info->trans_mutex);
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 } else {
160 finish_wait(&root->fs_info->transaction_wait,
161 &wait);
162 break;
163 }
164 }
165 put_transaction(cur_trans);
166 }
167}
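
/*
 * wait_current_trans() above is the standard prepare_to_wait()/
 * schedule()/finish_wait() idiom: the task is queued on
 * transaction_wait before cur_trans->blocked is re-checked, so a
 * wakeup arriving between the check and the sleep cannot be lost.
 */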
168
169static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
170 int num_blocks, int wait)
171{
172 struct btrfs_trans_handle *h =
173 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
174 int ret;
175
176 mutex_lock(&root->fs_info->trans_mutex);
177 if (!root->fs_info->log_root_recovering &&
178 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
179 wait_current_trans(root);
180 ret = join_transaction(root);
181 BUG_ON(ret);
182
183 btrfs_record_root_in_trans(root);
184 h->transid = root->fs_info->running_transaction->transid;
185 h->transaction = root->fs_info->running_transaction;
186 h->blocks_reserved = num_blocks;
187 h->blocks_used = 0;
188 h->block_group = NULL;
189 h->alloc_exclude_nr = 0;
190 h->alloc_exclude_start = 0;
191 root->fs_info->running_transaction->use_count++;
192 mutex_unlock(&root->fs_info->trans_mutex);
193 return h;
194}
195
196struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
197 int num_blocks)
198{
199 return start_transaction(root, num_blocks, 1);
200}
201struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
202 int num_blocks)
203{
204 return start_transaction(root, num_blocks, 0);
205}
206
207struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
208 int num_blocks)
209{
210 return start_transaction(r, num_blocks, 2);
211}
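
/*
 * Typical caller pattern (sketch; names from earlier in this series):
 * tree modifications are bracketed by a handle, e.g.
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	ret = btrfs_update_root(trans, tree_root, &key, &root_item);
 *	btrfs_end_transaction(trans, root);
 *
 * where the second argument is the block count recorded as
 * blocks_reserved on the handle.
 */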
212
213/* wait for a transaction commit to be fully complete */
214static noinline int wait_for_commit(struct btrfs_root *root,
215 struct btrfs_transaction *commit)
216{
217 DEFINE_WAIT(wait);
218 mutex_lock(&root->fs_info->trans_mutex);
219 while(!commit->commit_done) {
220 prepare_to_wait(&commit->commit_wait, &wait,
221 TASK_UNINTERRUPTIBLE);
222 if (commit->commit_done)
223 break;
224 mutex_unlock(&root->fs_info->trans_mutex);
225 schedule();
226 mutex_lock(&root->fs_info->trans_mutex);
227 }
228 mutex_unlock(&root->fs_info->trans_mutex);
229 finish_wait(&commit->commit_wait, &wait);
230 return 0;
231}
232
233/*
234 * rate limit against the drop_snapshot code. This helps to slow down
235 * new operations if the drop_snapshot code isn't able to keep up.
236 */
237static void throttle_on_drops(struct btrfs_root *root)
238{
239 struct btrfs_fs_info *info = root->fs_info;
240 int harder_count = 0;
241
242harder:
243 if (atomic_read(&info->throttles)) {
244 DEFINE_WAIT(wait);
245 int thr;
246 thr = atomic_read(&info->throttle_gen);
247
248 do {
249 prepare_to_wait(&info->transaction_throttle,
250 &wait, TASK_UNINTERRUPTIBLE);
251 if (!atomic_read(&info->throttles)) {
252 finish_wait(&info->transaction_throttle, &wait);
253 break;
254 }
255 schedule();
256 finish_wait(&info->transaction_throttle, &wait);
257 } while (thr == atomic_read(&info->throttle_gen));
258 harder_count++;
259
260 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
261 harder_count < 2)
262 goto harder;
263
264 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
265 harder_count < 10)
266 goto harder;
267
268 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
269 harder_count < 20)
270 goto harder;
271 }
272}
273
274void btrfs_throttle(struct btrfs_root *root)
275{
276 mutex_lock(&root->fs_info->trans_mutex);
277 if (!root->fs_info->open_ioctl_trans)
278 wait_current_trans(root);
279 mutex_unlock(&root->fs_info->trans_mutex);
280
281 throttle_on_drops(root);
282}
283
284static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
285 struct btrfs_root *root, int throttle)
286{
287 struct btrfs_transaction *cur_trans;
288 struct btrfs_fs_info *info = root->fs_info;
289
290 mutex_lock(&info->trans_mutex);
291 cur_trans = info->running_transaction;
292 WARN_ON(cur_trans != trans->transaction);
293 WARN_ON(cur_trans->num_writers < 1);
294 cur_trans->num_writers--;
295
296 if (waitqueue_active(&cur_trans->writer_wait))
297 wake_up(&cur_trans->writer_wait);
298 put_transaction(cur_trans);
299 mutex_unlock(&info->trans_mutex);
300 memset(trans, 0, sizeof(*trans));
301 kmem_cache_free(btrfs_trans_handle_cachep, trans);
302
303 if (throttle)
304 throttle_on_drops(root);
305
306 return 0;
307}
308
309int btrfs_end_transaction(struct btrfs_trans_handle *trans,
310 struct btrfs_root *root)
311{
312 return __btrfs_end_transaction(trans, root, 0);
313}
314
315int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
316 struct btrfs_root *root)
317{
318 return __btrfs_end_transaction(trans, root, 1);
319}
320
321/*
322 * when btree blocks are allocated, they have some corresponding bits set for
323 * them in one of two extent_io trees. This is used to make sure all of
324 * those extents are on disk for transaction or log commit
325 */
326int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
327 struct extent_io_tree *dirty_pages)
328{
329 int ret;
330 int err = 0;
331 int werr = 0;
332 struct page *page;
333 struct inode *btree_inode = root->fs_info->btree_inode;
334 u64 start = 0;
335 u64 end;
336 unsigned long index;
337
338 while(1) {
339 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
340 EXTENT_DIRTY);
341 if (ret)
342 break;
343 while(start <= end) {
344 cond_resched();
345
346 index = start >> PAGE_CACHE_SHIFT;
347 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
348 page = find_get_page(btree_inode->i_mapping, index);
349 if (!page)
350 continue;
351
352 btree_lock_page_hook(page);
353 if (!page->mapping) {
354 unlock_page(page);
355 page_cache_release(page);
356 continue;
357 }
358
359 if (PageWriteback(page)) {
360 if (PageDirty(page))
361 wait_on_page_writeback(page);
362 else {
363 unlock_page(page);
364 page_cache_release(page);
365 continue;
366 }
367 }
368 err = write_one_page(page, 0);
369 if (err)
370 werr = err;
371 page_cache_release(page);
372 }
373 }
374 while(1) {
375 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
376 EXTENT_DIRTY);
377 if (ret)
378 break;
379
380 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
381 while(start <= end) {
382 index = start >> PAGE_CACHE_SHIFT;
383 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
384 page = find_get_page(btree_inode->i_mapping, index);
385 if (!page)
386 continue;
387 if (PageDirty(page)) {
388 btree_lock_page_hook(page);
389 wait_on_page_writeback(page);
390 err = write_one_page(page, 0);
391 if (err)
392 werr = err;
393 }
394 wait_on_page_writeback(page);
395 page_cache_release(page);
396 cond_resched();
397 }
398 }
399 if (err)
400 werr = err;
401 return werr;
402}
403
404int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
405 struct btrfs_root *root)
406{
407 if (!trans || !trans->transaction) {
408 struct inode *btree_inode;
409 btree_inode = root->fs_info->btree_inode;
410 return filemap_write_and_wait(btree_inode->i_mapping);
411 }
412 return btrfs_write_and_wait_marked_extents(root,
413 &trans->transaction->dirty_pages);
414}
415
416/*
417 * this is used to update the root pointer in the tree of tree roots.
418 *
419 * But, in the case of the extent allocation tree, updating the root
420 * pointer may allocate blocks which may change the root of the extent
421 * allocation tree.
422 *
423 * So, this loops and repeats and makes sure the cowonly root didn't
424 * change while the root pointer was being updated in the metadata.
425 */
426static int update_cowonly_root(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root)
428{
429 int ret;
430 u64 old_root_bytenr;
431 struct btrfs_root *tree_root = root->fs_info->tree_root;
432
433 btrfs_write_dirty_block_groups(trans, root);
434 while(1) {
435 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
436 if (old_root_bytenr == root->node->start)
437 break;
438 btrfs_set_root_bytenr(&root->root_item,
439 root->node->start);
440 btrfs_set_root_level(&root->root_item,
441 btrfs_header_level(root->node));
442 ret = btrfs_update_root(trans, tree_root,
443 &root->root_key,
444 &root->root_item);
445 BUG_ON(ret);
446 btrfs_write_dirty_block_groups(trans, root);
447 }
448 return 0;
449}
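
/*
 * Example (illustrative): updating the extent root's pointer can COW a
 * block of the extent tree itself and move root->node, so the loop
 * above repeats the btrfs_update_root() call until the bytenr recorded
 * in the root item and root->node->start agree, i.e. until an update
 * pass causes no further movement.
 */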
450
451/*
452 * update all the cowonly tree roots on disk
453 */
454int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
455 struct btrfs_root *root)
456{
457 struct btrfs_fs_info *fs_info = root->fs_info;
458 struct list_head *next;
459
460 while(!list_empty(&fs_info->dirty_cowonly_roots)) {
461 next = fs_info->dirty_cowonly_roots.next;
462 list_del_init(next);
463 root = list_entry(next, struct btrfs_root, dirty_list);
464 update_cowonly_root(trans, root);
465 }
466 return 0;
467}
468
469/*
470 * dead roots are old snapshots that need to be deleted. This allocates
471 * a dirty root struct and adds it onto the list of dead roots waiting
472 * to be deleted
473 */
474int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
475{
476 struct btrfs_dirty_root *dirty;
477
478 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
479 if (!dirty)
480 return -ENOMEM;
481 dirty->root = root;
482 dirty->latest_root = latest;
483
484 mutex_lock(&root->fs_info->trans_mutex);
485 list_add(&dirty->list, &latest->fs_info->dead_roots);
486 mutex_unlock(&root->fs_info->trans_mutex);
487 return 0;
488}
489
490/*
491 * at transaction commit time we need to schedule the old roots for
492 * deletion via btrfs_drop_snapshot. This runs through all the
493 * reference counted roots that were modified in the current
494 * transaction and puts them into the drop list
495 */
496static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
497 struct radix_tree_root *radix,
498 struct list_head *list)
499{
500 struct btrfs_dirty_root *dirty;
501 struct btrfs_root *gang[8];
502 struct btrfs_root *root;
503 int i;
504 int ret;
505 int err = 0;
506 u32 refs;
507
508 while(1) {
509 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
510 ARRAY_SIZE(gang),
511 BTRFS_ROOT_TRANS_TAG);
512 if (ret == 0)
513 break;
514 for (i = 0; i < ret; i++) {
515 root = gang[i];
516 radix_tree_tag_clear(radix,
517 (unsigned long)root->root_key.objectid,
518 BTRFS_ROOT_TRANS_TAG);
519
520 BUG_ON(!root->ref_tree);
521 dirty = root->dirty_root;
522
523 btrfs_free_log(trans, root);
524 btrfs_free_reloc_root(root);
525
526 if (root->commit_root == root->node) {
527 WARN_ON(root->node->start !=
528 btrfs_root_bytenr(&root->root_item));
529
530 free_extent_buffer(root->commit_root);
531 root->commit_root = NULL;
532 root->dirty_root = NULL;
533
534 spin_lock(&root->list_lock);
535 list_del_init(&dirty->root->dead_list);
536 spin_unlock(&root->list_lock);
537
538 kfree(dirty->root);
539 kfree(dirty);
540
541 /* make sure to update the root on disk
542 * so we get any updates to the block used
543 * counts
544 */
545 err = btrfs_update_root(trans,
546 root->fs_info->tree_root,
547 &root->root_key,
548 &root->root_item);
549 continue;
550 }
551
552 memset(&root->root_item.drop_progress, 0,
553 sizeof(struct btrfs_disk_key));
554 root->root_item.drop_level = 0;
555 root->commit_root = NULL;
556 root->dirty_root = NULL;
557 root->root_key.offset = root->fs_info->generation;
558 btrfs_set_root_bytenr(&root->root_item,
559 root->node->start);
560 btrfs_set_root_level(&root->root_item,
561 btrfs_header_level(root->node));
562 err = btrfs_insert_root(trans, root->fs_info->tree_root,
563 &root->root_key,
564 &root->root_item);
565 if (err)
566 break;
567
568 refs = btrfs_root_refs(&dirty->root->root_item);
569 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
570 err = btrfs_update_root(trans, root->fs_info->tree_root,
571 &dirty->root->root_key,
572 &dirty->root->root_item);
573
574 BUG_ON(err);
575 if (refs == 1) {
576 list_add(&dirty->list, list);
577 } else {
578 WARN_ON(1);
579 free_extent_buffer(dirty->root->node);
580 kfree(dirty->root);
581 kfree(dirty);
582 }
583 }
584 }
585 return err;
586}
587
588/*
589 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
590 * otherwise every leaf in the btree is read and defragged.
591 */
592int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
593{
594 struct btrfs_fs_info *info = root->fs_info;
595 int ret;
596 struct btrfs_trans_handle *trans;
597 unsigned long nr;
598
599 smp_mb();
600 if (root->defrag_running)
601 return 0;
602 trans = btrfs_start_transaction(root, 1);
603 while (1) {
604 root->defrag_running = 1;
605 ret = btrfs_defrag_leaves(trans, root, cacheonly);
606 nr = trans->blocks_used;
607 btrfs_end_transaction(trans, root);
608 btrfs_btree_balance_dirty(info->tree_root, nr);
609 cond_resched();
610
611 trans = btrfs_start_transaction(root, 1);
612 if (root->fs_info->closing || ret != -EAGAIN)
613 break;
614 }
615 root->defrag_running = 0;
616 smp_mb();
617 btrfs_end_transaction(trans, root);
618 return 0;
619}
620
621/*
622 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
623 * all of them
624 */
625static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
626 struct list_head *list)
627{
628 struct btrfs_dirty_root *dirty;
629 struct btrfs_trans_handle *trans;
630 unsigned long nr;
631 u64 num_bytes;
632 u64 bytes_used;
633 u64 max_useless;
634 int ret = 0;
635 int err;
636
637 while(!list_empty(list)) {
638 struct btrfs_root *root;
639
640 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
641 list_del_init(&dirty->list);
642
643 num_bytes = btrfs_root_used(&dirty->root->root_item);
644 root = dirty->latest_root;
645 atomic_inc(&root->fs_info->throttles);
646
647 while(1) {
648 trans = btrfs_start_transaction(tree_root, 1);
649 mutex_lock(&root->fs_info->drop_mutex);
650 ret = btrfs_drop_snapshot(trans, dirty->root);
651 if (ret != -EAGAIN) {
652 break;
653 }
654 mutex_unlock(&root->fs_info->drop_mutex);
655
656 err = btrfs_update_root(trans,
657 tree_root,
658 &dirty->root->root_key,
659 &dirty->root->root_item);
660 if (err)
661 ret = err;
662 nr = trans->blocks_used;
663 ret = btrfs_end_transaction(trans, tree_root);
664 BUG_ON(ret);
665
666 btrfs_btree_balance_dirty(tree_root, nr);
667 cond_resched();
668 }
669 BUG_ON(ret);
670 atomic_dec(&root->fs_info->throttles);
671 wake_up(&root->fs_info->transaction_throttle);
672
673 mutex_lock(&root->fs_info->alloc_mutex);
674 num_bytes -= btrfs_root_used(&dirty->root->root_item);
675 bytes_used = btrfs_root_used(&root->root_item);
676 if (num_bytes) {
677 btrfs_record_root_in_trans(root);
678 btrfs_set_root_used(&root->root_item,
679 bytes_used - num_bytes);
680 }
681 mutex_unlock(&root->fs_info->alloc_mutex);
682
683 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
684 if (ret) {
685 BUG();
686 break;
687 }
688 mutex_unlock(&root->fs_info->drop_mutex);
689
690 spin_lock(&root->list_lock);
691 list_del_init(&dirty->root->dead_list);
692 if (!list_empty(&root->dead_list)) {
693 struct btrfs_root *oldest;
694 oldest = list_entry(root->dead_list.prev,
695 struct btrfs_root, dead_list);
696 max_useless = oldest->root_key.offset - 1;
697 } else {
698 max_useless = root->root_key.offset - 1;
699 }
700 spin_unlock(&root->list_lock);
701
702 nr = trans->blocks_used;
703 ret = btrfs_end_transaction(trans, tree_root);
704 BUG_ON(ret);
705
706 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
707 BUG_ON(ret);
708
709 free_extent_buffer(dirty->root->node);
710 kfree(dirty->root);
711 kfree(dirty);
712
713 btrfs_btree_balance_dirty(tree_root, nr);
714 cond_resched();
715 }
716 return ret;
717}
718
719/*
720 * new snapshots need to be created at a very specific time in the
721 * transaction commit. This does the actual creation
722 */
723static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
724 struct btrfs_fs_info *fs_info,
725 struct btrfs_pending_snapshot *pending)
726{
727 struct btrfs_key key;
728 struct btrfs_root_item *new_root_item;
729 struct btrfs_root *tree_root = fs_info->tree_root;
730 struct btrfs_root *root = pending->root;
731 struct extent_buffer *tmp;
732 struct extent_buffer *old;
733 int ret;
734 int namelen;
735 u64 objectid;
736
737 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
738 if (!new_root_item) {
739 ret = -ENOMEM;
740 goto fail;
741 }
742 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
743 if (ret)
744 goto fail;
745
746 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
747
748 key.objectid = objectid;
749 key.offset = trans->transid;
750 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
751
752 old = btrfs_lock_root_node(root);
753 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
754
755 btrfs_copy_root(trans, root, old, &tmp, objectid);
756 btrfs_tree_unlock(old);
757 free_extent_buffer(old);
758
759 btrfs_set_root_bytenr(new_root_item, tmp->start);
760 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
761 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
762 new_root_item);
763 btrfs_tree_unlock(tmp);
764 free_extent_buffer(tmp);
765 if (ret)
766 goto fail;
767
768 /*
769 * insert the directory item
770 */
771 key.offset = (u64)-1;
772 namelen = strlen(pending->name);
773 ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
774 pending->name, namelen,
775 root->fs_info->sb->s_root->d_inode->i_ino,
776 &key, BTRFS_FT_DIR, 0);
777
778 if (ret)
779 goto fail;
780
781 ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
782 pending->name, strlen(pending->name), objectid,
783 root->fs_info->sb->s_root->d_inode->i_ino, 0);
784
785 /* Invalidate existing dcache entry for new snapshot. */
786 btrfs_invalidate_dcache_root(root, pending->name, namelen);
787
788fail:
789 kfree(new_root_item);
790 return ret;
791}
792
793/*
794 * create all the snapshots we've scheduled for creation
795 */
796static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
797 struct btrfs_fs_info *fs_info)
798{
799 struct btrfs_pending_snapshot *pending;
800 struct list_head *head = &trans->transaction->pending_snapshots;
801 int ret;
802
803 while(!list_empty(head)) {
804 pending = list_entry(head->next,
805 struct btrfs_pending_snapshot, list);
806 ret = create_pending_snapshot(trans, fs_info, pending);
807 BUG_ON(ret);
808 list_del(&pending->list);
809 kfree(pending->name);
810 kfree(pending);
811 }
812 return 0;
813}
814
815int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
816 struct btrfs_root *root)
817{
818 unsigned long joined = 0;
819 unsigned long timeout = 1;
820 struct btrfs_transaction *cur_trans;
821 struct btrfs_transaction *prev_trans = NULL;
822 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
823 struct list_head dirty_fs_roots;
824 struct extent_io_tree *pinned_copy;
825 DEFINE_WAIT(wait);
826 int ret;
827
828 INIT_LIST_HEAD(&dirty_fs_roots);
829 mutex_lock(&root->fs_info->trans_mutex);
830 if (trans->transaction->in_commit) {
831 cur_trans = trans->transaction;
832 trans->transaction->use_count++;
833 mutex_unlock(&root->fs_info->trans_mutex);
834 btrfs_end_transaction(trans, root);
835
836 ret = wait_for_commit(root, cur_trans);
837 BUG_ON(ret);
838
839 mutex_lock(&root->fs_info->trans_mutex);
840 put_transaction(cur_trans);
841 mutex_unlock(&root->fs_info->trans_mutex);
842
843 return 0;
844 }
845
846 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
847 if (!pinned_copy)
848 return -ENOMEM;
849
850 extent_io_tree_init(pinned_copy,
851 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
852
853 trans->transaction->in_commit = 1;
854 trans->transaction->blocked = 1;
855 cur_trans = trans->transaction;
856 if (cur_trans->list.prev != &root->fs_info->trans_list) {
857 prev_trans = list_entry(cur_trans->list.prev,
858 struct btrfs_transaction, list);
859 if (!prev_trans->commit_done) {
860 prev_trans->use_count++;
861 mutex_unlock(&root->fs_info->trans_mutex);
862
863 wait_for_commit(root, prev_trans);
864
865 mutex_lock(&root->fs_info->trans_mutex);
866 put_transaction(prev_trans);
867 }
868 }
869
870 do {
871 int snap_pending = 0;
872 joined = cur_trans->num_joined;
873 if (!list_empty(&trans->transaction->pending_snapshots))
874 snap_pending = 1;
875
876 WARN_ON(cur_trans != trans->transaction);
877 prepare_to_wait(&cur_trans->writer_wait, &wait,
878 TASK_UNINTERRUPTIBLE);
879
880 if (cur_trans->num_writers > 1)
881 timeout = MAX_SCHEDULE_TIMEOUT;
882 else
883 timeout = 1;
884
885 mutex_unlock(&root->fs_info->trans_mutex);
886
887 if (snap_pending) {
888 ret = btrfs_wait_ordered_extents(root, 1);
889 BUG_ON(ret);
890 }
891
892 schedule_timeout(timeout);
893
894 mutex_lock(&root->fs_info->trans_mutex);
895 finish_wait(&cur_trans->writer_wait, &wait);
896 } while (cur_trans->num_writers > 1 ||
897 (cur_trans->num_joined != joined));
898
899 ret = create_pending_snapshots(trans, root->fs_info);
900 BUG_ON(ret);
901
902 WARN_ON(cur_trans != trans->transaction);
903
904 /* btrfs_commit_tree_roots is responsible for getting the
905 * various roots consistent with each other. Every pointer
906 * in the tree of tree roots has to point to the most up-to-date
907 * root for every subvolume and other tree. So, we have to keep
908 * the tree logging code from jumping in and changing any
909 * of the trees.
910 *
911 * At this point in the commit, there can't be any tree-log
912 * writers, but a little lower down we drop the trans mutex
913 * and let new people in. By holding the tree_log_mutex
914 * from now until after the super is written, we avoid races
915 * with the tree-log code.
916 */
917 mutex_lock(&root->fs_info->tree_log_mutex);
918 /*
919 * keep tree reloc code from adding new reloc trees
920 */
921 mutex_lock(&root->fs_info->tree_reloc_mutex);
922
923
924 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
925 &dirty_fs_roots);
926 BUG_ON(ret);
927
928 /* add_dirty_roots gets rid of all the tree log roots; it is now
929 * safe to free the root of the tree of log roots
930 */
931 btrfs_free_log_root_tree(trans, root->fs_info);
932
933 btrfs_free_reloc_mappings(root);
934
935 ret = btrfs_commit_tree_roots(trans, root);
936 BUG_ON(ret);
937
938 cur_trans = root->fs_info->running_transaction;
939 spin_lock(&root->fs_info->new_trans_lock);
940 root->fs_info->running_transaction = NULL;
941 spin_unlock(&root->fs_info->new_trans_lock);
942 btrfs_set_super_generation(&root->fs_info->super_copy,
943 cur_trans->transid);
944 btrfs_set_super_root(&root->fs_info->super_copy,
945 root->fs_info->tree_root->node->start);
946 btrfs_set_super_root_level(&root->fs_info->super_copy,
947 btrfs_header_level(root->fs_info->tree_root->node));
948
949 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
950 chunk_root->node->start);
951 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
952 btrfs_header_level(chunk_root->node));
953
954 if (!root->fs_info->log_root_recovering) {
955 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
956 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
957 }
958
959 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
960 sizeof(root->fs_info->super_copy));
961
962 btrfs_copy_pinned(root, pinned_copy);
963
964 trans->transaction->blocked = 0;
965 wake_up(&root->fs_info->transaction_throttle);
966 wake_up(&root->fs_info->transaction_wait);
967
968 mutex_unlock(&root->fs_info->trans_mutex);
969 ret = btrfs_write_and_wait_transaction(trans, root);
970 BUG_ON(ret);
971 write_ctree_super(trans, root);
972
973 /*
974 * the super is written, we can safely allow the tree-loggers
975 * to go about their business
976 */
977 mutex_unlock(&root->fs_info->tree_log_mutex);
978
979 btrfs_finish_extent_commit(trans, root, pinned_copy);
980 kfree(pinned_copy);
981
982 btrfs_drop_dead_reloc_roots(root);
983 mutex_unlock(&root->fs_info->tree_reloc_mutex);
984
985 mutex_lock(&root->fs_info->trans_mutex);
986
987 cur_trans->commit_done = 1;
988 root->fs_info->last_trans_committed = cur_trans->transid;
989 wake_up(&cur_trans->commit_wait);
990 put_transaction(cur_trans);
991 put_transaction(cur_trans);
992
993 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
994 if (root->fs_info->closing)
995 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
996
997 mutex_unlock(&root->fs_info->trans_mutex);
998 kmem_cache_free(btrfs_trans_handle_cachep, trans);
999
1000 if (root->fs_info->closing)
1001 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1002
1003 return ret;
1004}
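/*
 * Illustrative usage (not in the patch): the usual shape of a caller
 * around btrfs_commit_transaction() is
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	... make tree modifications through trans ...
 *	ret = btrfs_commit_transaction(trans, root);
 *
 * If another writer is already mid-commit, the in_commit branch at the
 * top of this function just ends the handle and waits for that commit
 * instead of starting a second one.
 */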
1005
1006/*
1007 * interface function to delete all the snapshots we have scheduled for deletion
1008 */
1009int btrfs_clean_old_snapshots(struct btrfs_root *root)
1010{
1011 struct list_head dirty_roots;
1012 INIT_LIST_HEAD(&dirty_roots);
1013again:
1014 mutex_lock(&root->fs_info->trans_mutex);
1015 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1016 mutex_unlock(&root->fs_info->trans_mutex);
1017
1018 if (!list_empty(&dirty_roots)) {
1019 drop_dirty_roots(root, &dirty_roots);
1020 goto again;
1021 }
1022 return 0;
1023}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..eef2cb7d7e78
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,104 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 struct btrfs_block_group_cache *block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct btrfs_root *root;
51 char *name;
52 struct list_head list;
53};
54
55struct btrfs_dirty_root {
56 struct list_head list;
57 struct btrfs_root *root;
58 struct btrfs_root *latest_root;
59};
60
61static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
62 struct inode *inode)
63{
64 trans->block_group = BTRFS_I(inode)->block_group;
65}
66
67static inline void btrfs_update_inode_block_group(
68 struct btrfs_trans_handle *trans,
69 struct inode *inode)
70{
71 BTRFS_I(inode)->block_group = trans->block_group;
72}
73
74static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
75 struct inode *inode)
76{
77 BTRFS_I(inode)->last_trans = trans->transaction->transid;
78}
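/*
 * Illustrative usage (not in the patch) of the inline helpers above in a
 * hypothetical inode-dirtying path:
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	btrfs_set_trans_block_group(trans, inode);
 *	... dirty the inode ...
 *	btrfs_update_inode_block_group(trans, inode);
 *	btrfs_set_inode_last_trans(trans, inode);
 *	btrfs_end_transaction(trans, root);
 */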
79
80int btrfs_end_transaction(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root);
82struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
83 int num_blocks);
84struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
87 int num_blocks);
88int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root);
90int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92
93int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
94int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
95int btrfs_clean_old_snapshots(struct btrfs_root *root);
96int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root);
98int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100void btrfs_throttle(struct btrfs_root *root);
101int btrfs_record_root_in_trans(struct btrfs_root *root);
102int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
103 struct extent_io_tree *dirty_pages);
104#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..6f57d0889b1e
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,149 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk; otherwise read all the leaves and try to get key order
28 * to better reflect disk order
29 */
30int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, int cache_only)
32{
33 struct btrfs_path *path = NULL;
34 struct btrfs_key key;
35 int ret = 0;
36 int wret;
37 int level;
38 int orig_level;
39 int is_extent = 0;
40 int next_key_ret = 0;
41 u64 last_ret = 0;
42 u64 min_trans = 0;
43
44 if (cache_only)
45 goto out;
46
47 if (root->fs_info->extent_root == root) {
48 /*
49 * there's recursion here right now in the tree locking,
50 * we can't defrag the extent root without deadlock
51 */
52 goto out;
53 }
54
55 if (root->ref_cows == 0 && !is_extent)
56 goto out;
57
58 if (btrfs_test_opt(root, SSD))
59 goto out;
60
61 path = btrfs_alloc_path();
62 if (!path)
63 return -ENOMEM;
64
65 level = btrfs_header_level(root->node);
66 orig_level = level;
67
68 if (level == 0)
69 goto out;
70
71 if (root->defrag_progress.objectid == 0) {
72 struct extent_buffer *root_node;
73 u32 nritems;
74
75 root_node = btrfs_lock_root_node(root);
76 nritems = btrfs_header_nritems(root_node);
77 root->defrag_max.objectid = 0;
78 /* from above we know this is not a leaf */
79 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
80 nritems - 1);
81 btrfs_tree_unlock(root_node);
82 free_extent_buffer(root_node);
83 memset(&key, 0, sizeof(key));
84 } else {
85 memcpy(&key, &root->defrag_progress, sizeof(key));
86 }
87
88 path->keep_locks = 1;
89 if (cache_only)
90 min_trans = root->defrag_trans_start;
91
92 ret = btrfs_search_forward(root, &key, NULL, path,
93 cache_only, min_trans);
94 if (ret < 0)
95 goto out;
96 if (ret > 0) {
97 ret = 0;
98 goto out;
99 }
100 btrfs_release_path(root, path);
101 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
102
103 if (wret < 0) {
104 ret = wret;
105 goto out;
106 }
107 if (!path->nodes[1]) {
108 ret = 0;
109 goto out;
110 }
111 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
112 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
113 min_trans);
114 ret = btrfs_realloc_node(trans, root,
115 path->nodes[1], 0,
116 cache_only, &last_ret,
117 &root->defrag_progress);
118 WARN_ON(ret && ret != -EAGAIN);
119 if (next_key_ret == 0) {
120 memcpy(&root->defrag_progress, &key, sizeof(key));
121 ret = -EAGAIN;
122 }
123
124 btrfs_release_path(root, path);
125 if (is_extent)
126 btrfs_extent_post_op(trans, root);
127out:
128 if (is_extent)
129 mutex_unlock(&root->fs_info->alloc_mutex);
130
131 if (path)
132 btrfs_free_path(path);
133 if (ret == -EAGAIN) {
134 if (root->defrag_max.objectid > root->defrag_progress.objectid)
135 goto done;
136 if (root->defrag_max.type > root->defrag_progress.type)
137 goto done;
138 if (root->defrag_max.offset > root->defrag_progress.offset)
139 goto done;
140 ret = 0;
141 }
142done:
143 if (ret != -EAGAIN) {
144 memset(&root->defrag_progress, 0,
145 sizeof(root->defrag_progress));
146 root->defrag_trans_start = trans->transid;
147 }
148 return ret;
149}
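/*
 * Illustrative sketch (not in the patch): since btrfs_defrag_leaves()
 * returns -EAGAIN with root->defrag_progress saved, a driver loop would
 * restart it in short transactions until the key range is exhausted,
 * roughly:
 *
 *	while (1) {
 *		trans = btrfs_start_transaction(root, 1);
 *		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 *		btrfs_end_transaction(trans, root);
 *		if (ret != -EAGAIN)
 *			break;
 *	}
 *
 * btrfs_defrag_root() in transaction.c plays roughly this role.
 */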
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..cf618cc8b34a
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2890 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26
27/* magic values for the inode_only field in btrfs_log_inode:
28 *
29 * LOG_INODE_ALL means to log everything
30 * LOG_INODE_EXISTS means to log just enough to recreate the inode
31 * during log replay
32 */
33#define LOG_INODE_ALL 0
34#define LOG_INODE_EXISTS 1
35
36/*
37 * stages for the tree walking. The first
38 * stage (0) is to only pin down the blocks we find;
39 * the second stage (1) is to make sure that all the inodes
40 * we find in the log are created in the subvolume.
41 *
42 * The last stage is to deal with directories and links and extents
43 * and all the other fun semantics
44 */
45#define LOG_WALK_PIN_ONLY 0
46#define LOG_WALK_REPLAY_INODES 1
47#define LOG_WALK_REPLAY_ALL 2
48
49static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
50 struct btrfs_root *root, struct inode *inode,
51 int inode_only);
52
53/*
54 * tree logging is a special write ahead log used to make sure that
55 * fsyncs and O_SYNCs can happen without doing full tree commits.
56 *
57 * Full tree commits are expensive because they require commonly
58 * modified blocks to be recowed, creating many dirty pages in the
59 * extent tree and a 4x-6x higher write load than ext3.
60 *
61 * Instead of doing a tree commit on every fsync, we use the
62 * key ranges and transaction ids to find items for a given file or directory
63 * that have changed in this transaction. Those items are copied into
64 * a special tree (one per subvolume root), that tree is written to disk
65 * and then the fsync is considered complete.
66 *
67 * After a crash, items are copied out of the log-tree back into the
68 * subvolume tree. Any file data extents found are recorded in the extent
69 * allocation tree, and the log-tree freed.
70 *
71 * The log tree is read three times: once to pin down all the extents it
72 * is using in ram, once to create all the inodes logged in the tree, and
73 * once to do all the other items.
74 */
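/*
 * Simplified sketch (illustrative, not the exact call chain): an fsync
 * satisfied from the tree log does roughly
 *
 *	start_log_trans(trans, root);
 *	__btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
 *	end_log_trans(root);
 *	... the sync path then writes just the log tree blocks ...
 *
 * and the expensive full commit is deferred to the normal transaction
 * commit schedule.
 */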
75
76/*
77 * btrfs_add_log_tree adds a new per-subvolume log tree into the
78 * tree of log tree roots. This must be called with a tree log transaction
79 * running (see start_log_trans).
80 */
81int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root)
83{
84 struct btrfs_key key;
85 struct btrfs_root_item root_item;
86 struct btrfs_inode_item *inode_item;
87 struct extent_buffer *leaf;
88 struct btrfs_root *new_root = root;
89 int ret;
90 u64 objectid = root->root_key.objectid;
91
92 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
93 BTRFS_TREE_LOG_OBJECTID,
94 trans->transid, 0, 0, 0);
95 if (IS_ERR(leaf)) {
96 ret = PTR_ERR(leaf);
97 return ret;
98 }
99
100 btrfs_set_header_nritems(leaf, 0);
101 btrfs_set_header_level(leaf, 0);
102 btrfs_set_header_bytenr(leaf, leaf->start);
103 btrfs_set_header_generation(leaf, trans->transid);
104 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
105
106 write_extent_buffer(leaf, root->fs_info->fsid,
107 (unsigned long)btrfs_header_fsid(leaf),
108 BTRFS_FSID_SIZE);
109 btrfs_mark_buffer_dirty(leaf);
110
111 inode_item = &root_item.inode;
112 memset(inode_item, 0, sizeof(*inode_item));
113 inode_item->generation = cpu_to_le64(1);
114 inode_item->size = cpu_to_le64(3);
115 inode_item->nlink = cpu_to_le32(1);
116 inode_item->nbytes = cpu_to_le64(root->leafsize);
117 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
118
119 btrfs_set_root_bytenr(&root_item, leaf->start);
120 btrfs_set_root_level(&root_item, 0);
121 btrfs_set_root_refs(&root_item, 0);
122 btrfs_set_root_used(&root_item, 0);
123
124 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
125 root_item.drop_level = 0;
126
127 btrfs_tree_unlock(leaf);
128 free_extent_buffer(leaf);
129 leaf = NULL;
130
131 btrfs_set_root_dirid(&root_item, 0);
132
133 key.objectid = BTRFS_TREE_LOG_OBJECTID;
134 key.offset = objectid;
135 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
136 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
137 &root_item);
138 if (ret)
139 goto fail;
140
141 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
142 &key);
143 BUG_ON(!new_root);
144
145 WARN_ON(root->log_root);
146 root->log_root = new_root;
147
148 /*
149 * log trees do not get reference counted because they go away
150 * before a real commit is actually done. They do store pointers
151 * to file data extents, and those reference counts still get
152 * updated (along with back refs to the log tree).
153 */
154 new_root->ref_cows = 0;
155 new_root->last_trans = trans->transid;
156fail:
157 return ret;
158}
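/*
 * Illustrative layout note (not in the patch): the key built above means
 * each per-subvolume log root lives in the log root tree at
 *
 *	(BTRFS_TREE_LOG_OBJECTID, BTRFS_ROOT_ITEM_KEY, subvolume objectid)
 *
 * so a single log root tree carries one log per subvolume fsynced during
 * this transaction.
 */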
159
160/*
161 * start a sub transaction and setup the log tree
162 * this increments the log tree writer count to make the people
163 * syncing the tree wait for us to finish
164 */
165static int start_log_trans(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root)
167{
168 int ret;
169 mutex_lock(&root->fs_info->tree_log_mutex);
170 if (!root->fs_info->log_root_tree) {
171 ret = btrfs_init_log_root_tree(trans, root->fs_info);
172 BUG_ON(ret);
173 }
174 if (!root->log_root) {
175 ret = btrfs_add_log_tree(trans, root);
176 BUG_ON(ret);
177 }
178 atomic_inc(&root->fs_info->tree_log_writers);
179 root->fs_info->tree_log_batch++;
180 mutex_unlock(&root->fs_info->tree_log_mutex);
181 return 0;
182}
183
184/*
185 * returns 0 if there was a log transaction running and we were able
186 * to join, or returns -ENOENT if there were no transactions
187 * in progress
188 */
189static int join_running_log_trans(struct btrfs_root *root)
190{
191 int ret = -ENOENT;
192
193 smp_mb();
194 if (!root->log_root)
195 return -ENOENT;
196
197 mutex_lock(&root->fs_info->tree_log_mutex);
198 if (root->log_root) {
199 ret = 0;
200 atomic_inc(&root->fs_info->tree_log_writers);
201 root->fs_info->tree_log_batch++;
202 }
203 mutex_unlock(&root->fs_info->tree_log_mutex);
204 return ret;
205}
206
207/*
208 * indicate we're done making changes to the log tree
209 * and wake up anyone waiting to do a sync
210 */
211static int end_log_trans(struct btrfs_root *root)
212{
213 atomic_dec(&root->fs_info->tree_log_writers);
214 smp_mb();
215 if (waitqueue_active(&root->fs_info->tree_log_wait))
216 wake_up(&root->fs_info->tree_log_wait);
217 return 0;
218}
219
220
221/*
222 * the walk control struct is used to pass state down the chain when
223 * processing the log tree. The stage field tells us which part
224 * of the log tree processing we are currently doing. The others
225 * are state fields used for that specific part
226 */
227struct walk_control {
228 /* should we free the extent on disk when done? This is used
229 * at transaction commit time while freeing a log tree
230 */
231 int free;
232
233 /* should we write out the extent buffer? This is used
234 * while flushing the log tree to disk during a sync
235 */
236 int write;
237
238 /* should we wait for the extent buffer io to finish? Also used
239 * while flushing the log tree to disk for a sync
240 */
241 int wait;
242
243 /* pin only walk, we record which extents on disk belong to the
244 * log trees
245 */
246 int pin;
247
248 /* what stage of the replay code we're currently in */
249 int stage;
250
251 /* the root we are currently replaying */
252 struct btrfs_root *replay_dest;
253
254 /* the trans handle for the current replay */
255 struct btrfs_trans_handle *trans;
256
257 /* the function that gets used to process blocks we find in the
258 * tree. Note the extent_buffer might not be up to date when it is
259 * passed in, and it must be checked or read if you need the data
260 * inside it
261 */
262 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen);
264};
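/*
 * Illustrative setup (not in the patch): a pin-only pass over a log tree
 * would configure the walk roughly as
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.process_func = process_one_buffer,
 *	};
 *
 * while the flush-for-sync pass sets .write (and then .wait) with the
 * same process_func; see process_one_buffer() below.
 */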
265
266/*
267 * process_func used to pin down extents, write them or wait on them
268 */
269static int process_one_buffer(struct btrfs_root *log,
270 struct extent_buffer *eb,
271 struct walk_control *wc, u64 gen)
272{
273 if (wc->pin) {
274 mutex_lock(&log->fs_info->alloc_mutex);
275 btrfs_update_pinned_extents(log->fs_info->extent_root,
276 eb->start, eb->len, 1);
277 mutex_unlock(&log->fs_info->alloc_mutex);
278 }
279
280 if (btrfs_buffer_uptodate(eb, gen)) {
281 if (wc->write)
282 btrfs_write_tree_block(eb);
283 if (wc->wait)
284 btrfs_wait_tree_block_writeback(eb);
285 }
286 return 0;
287}
288
289/*
290 * Item overwrite used by replay and tree logging. eb, slot and key all refer
291 * to the src data we are copying out.
292 *
293 * root is the tree we are copying into, and path is a scratch
294 * path for use in this function (it should be released on entry and
295 * will be released on exit).
296 *
297 * If the key is already in the destination tree the existing item is
298 * overwritten. If the existing item isn't big enough, it is extended.
299 * If it is too large, it is truncated.
300 *
301 * If the key isn't in the destination yet, a new item is inserted.
302 */
303static noinline int overwrite_item(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root,
305 struct btrfs_path *path,
306 struct extent_buffer *eb, int slot,
307 struct btrfs_key *key)
308{
309 int ret;
310 u32 item_size;
311 u64 saved_i_size = 0;
312 int save_old_i_size = 0;
313 unsigned long src_ptr;
314 unsigned long dst_ptr;
315 int overwrite_root = 0;
316
317 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
318 overwrite_root = 1;
319
320 item_size = btrfs_item_size_nr(eb, slot);
321 src_ptr = btrfs_item_ptr_offset(eb, slot);
322
323 /* look for the key in the destination tree */
324 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
325 if (ret == 0) {
326 char *src_copy;
327 char *dst_copy;
328 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
329 path->slots[0]);
330 if (dst_size != item_size)
331 goto insert;
332
333 if (item_size == 0) {
334 btrfs_release_path(root, path);
335 return 0;
336 }
337 dst_copy = kmalloc(item_size, GFP_NOFS);
338 src_copy = kmalloc(item_size, GFP_NOFS);
339
340 read_extent_buffer(eb, src_copy, src_ptr, item_size);
341
342 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
343 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
344 item_size);
345 ret = memcmp(dst_copy, src_copy, item_size);
346
347 kfree(dst_copy);
348 kfree(src_copy);
349 /*
350 * they have the same contents; just return. This saves
351 * us from cowing blocks in the destination tree and doing
352 * extra writes that may not have been done by a previous
353 * sync
354 */
355 if (ret == 0) {
356 btrfs_release_path(root, path);
357 return 0;
358 }
359
360 }
361insert:
362 btrfs_release_path(root, path);
363 /* try to insert the key into the destination tree */
364 ret = btrfs_insert_empty_item(trans, root, path,
365 key, item_size);
366
367 /* make sure any existing item is the correct size */
368 if (ret == -EEXIST) {
369 u32 found_size;
370 found_size = btrfs_item_size_nr(path->nodes[0],
371 path->slots[0]);
372 if (found_size > item_size) {
373 btrfs_truncate_item(trans, root, path, item_size, 1);
374 } else if (found_size < item_size) {
375 ret = btrfs_del_item(trans, root,
376 path);
377 BUG_ON(ret);
378
379 btrfs_release_path(root, path);
380 ret = btrfs_insert_empty_item(trans,
381 root, path, key, item_size);
382 BUG_ON(ret);
383 }
384 } else if (ret) {
385 BUG();
386 }
387 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
388 path->slots[0]);
389
390 /* don't overwrite an existing inode if the generation number
391 * was logged as zero. This is done when the tree logging code
392 * is just logging an inode to make sure it exists after recovery.
393 *
394 * Also, don't overwrite i_size on directories during replay.
395 * log replay inserts and removes directory items based on the
396 * state of the tree found in the subvolume, and i_size is modified
397 * as it goes
398 */
399 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
400 struct btrfs_inode_item *src_item;
401 struct btrfs_inode_item *dst_item;
402
403 src_item = (struct btrfs_inode_item *)src_ptr;
404 dst_item = (struct btrfs_inode_item *)dst_ptr;
405
406 if (btrfs_inode_generation(eb, src_item) == 0)
407 goto no_copy;
408
409 if (overwrite_root &&
410 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
411 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
412 save_old_i_size = 1;
413 saved_i_size = btrfs_inode_size(path->nodes[0],
414 dst_item);
415 }
416 }
417
418 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
419 src_ptr, item_size);
420
421 if (save_old_i_size) {
422 struct btrfs_inode_item *dst_item;
423 dst_item = (struct btrfs_inode_item *)dst_ptr;
424 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
425 }
426
427 /* make sure the generation is filled in */
428 if (key->type == BTRFS_INODE_ITEM_KEY) {
429 struct btrfs_inode_item *dst_item;
430 dst_item = (struct btrfs_inode_item *)dst_ptr;
431 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
432 btrfs_set_inode_generation(path->nodes[0], dst_item,
433 trans->transid);
434 }
435 }
436
437 if (overwrite_root &&
438 key->type == BTRFS_EXTENT_DATA_KEY) {
439 int extent_type;
440 struct btrfs_file_extent_item *fi;
441
442 fi = (struct btrfs_file_extent_item *)dst_ptr;
443 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
444 if (extent_type == BTRFS_FILE_EXTENT_REG) {
445 struct btrfs_key ins;
446 ins.objectid = btrfs_file_extent_disk_bytenr(
447 path->nodes[0], fi);
448 ins.offset = btrfs_file_extent_disk_num_bytes(
449 path->nodes[0], fi);
450 ins.type = BTRFS_EXTENT_ITEM_KEY;
451
452 /*
453 * is this extent already allocated in the extent
454 * allocation tree? If so, just add a reference
455 */
456 ret = btrfs_lookup_extent(root, ins.objectid,
457 ins.offset);
458 if (ret == 0) {
459 ret = btrfs_inc_extent_ref(trans, root,
460 ins.objectid, ins.offset,
461 path->nodes[0]->start,
462 root->root_key.objectid,
463 trans->transid, key->objectid);
464 } else {
465 /*
466 * insert the extent pointer in the extent
467 * allocation tree
468 */
469 ret = btrfs_alloc_logged_extent(trans, root,
470 path->nodes[0]->start,
471 root->root_key.objectid,
472 trans->transid, key->objectid,
473 &ins);
474 BUG_ON(ret);
475 }
476 }
477 }
478no_copy:
479 btrfs_mark_buffer_dirty(path->nodes[0]);
480 btrfs_release_path(root, path);
481 return 0;
482}
483
484/*
485 * simple helper to read an inode off the disk from a given root
486 * This can only be called for subvolume roots and not for the log
487 */
488static noinline struct inode *read_one_inode(struct btrfs_root *root,
489 u64 objectid)
490{
491 struct inode *inode;
492 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
493 if (inode->i_state & I_NEW) {
494 BTRFS_I(inode)->root = root;
495 BTRFS_I(inode)->location.objectid = objectid;
496 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
497 BTRFS_I(inode)->location.offset = 0;
498 btrfs_read_locked_inode(inode);
499 unlock_new_inode(inode);
500 }
501
502 if (is_bad_inode(inode)) {
503 iput(inode);
504 inode = NULL;
505 }
506 return inode;
507}
508
509/* replays a single extent in 'eb' at 'slot' with 'key' into the
510 * subvolume 'root'. path is released on entry and should be released
511 * on exit.
512 *
513 * extents in the log tree have not been allocated out of the extent
514 * tree yet. So, this completes the allocation, taking a reference
515 * as required if the extent already exists or creating a new extent
516 * if it isn't in the extent allocation tree yet.
517 *
518 * The extent is inserted into the file, dropping any existing extents
519 * from the file that overlap the new one.
520 */
521static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
522 struct btrfs_root *root,
523 struct btrfs_path *path,
524 struct extent_buffer *eb, int slot,
525 struct btrfs_key *key)
526{
527 int found_type;
528 u64 mask = root->sectorsize - 1;
529 u64 extent_end;
530 u64 alloc_hint;
531 u64 start = key->offset;
532 struct btrfs_file_extent_item *item;
533 struct inode *inode = NULL;
534 unsigned long size;
535 int ret = 0;
536
537 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
538 found_type = btrfs_file_extent_type(eb, item);
539
540 if (found_type == BTRFS_FILE_EXTENT_REG)
541 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
543 size = btrfs_file_extent_inline_len(eb,
544 btrfs_item_nr(eb, slot));
545 extent_end = (start + size + mask) & ~mask;
546 } else {
547 ret = 0;
548 goto out;
549 }
550
551 inode = read_one_inode(root, key->objectid);
552 if (!inode) {
553 ret = -EIO;
554 goto out;
555 }
556
557 /*
558 * first check to see if we already have this extent in the
559 * file. This must be done before btrfs_drop_extents runs,
560 * so we don't try to drop this extent.
561 */
562 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
563 start, 0);
564
565 if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
566 struct btrfs_file_extent_item cmp1;
567 struct btrfs_file_extent_item cmp2;
568 struct btrfs_file_extent_item *existing;
569 struct extent_buffer *leaf;
570
571 leaf = path->nodes[0];
572 existing = btrfs_item_ptr(leaf, path->slots[0],
573 struct btrfs_file_extent_item);
574
575 read_extent_buffer(eb, &cmp1, (unsigned long)item,
576 sizeof(cmp1));
577 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
578 sizeof(cmp2));
579
580 /*
581 * we already have a pointer to this exact extent,
582 * we don't have to do anything
583 */
584 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
585 btrfs_release_path(root, path);
586 goto out;
587 }
588 }
589 btrfs_release_path(root, path);
590
591 /* drop any overlapping extents */
592 ret = btrfs_drop_extents(trans, root, inode,
593 start, extent_end, start, &alloc_hint);
594 BUG_ON(ret);
595
596 /* insert the extent */
597 ret = overwrite_item(trans, root, path, eb, slot, key);
598 BUG_ON(ret);
599
600 /* btrfs_drop_extents changes i_bytes & i_blocks, update them here */
601 inode_add_bytes(inode, extent_end - start);
602 btrfs_update_inode(trans, root, inode);
603out:
604 if (inode)
605 iput(inode);
606 return ret;
607}
608
609/*
610 * when cleaning up conflicts between the directory names in the
611 * subvolume, directory names in the log and directory names in the
612 * inode back references, we may have to unlink inodes from directories.
613 *
614 * This is a helper function to do the unlink of a specific directory
615 * item
616 */
617static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
618 struct btrfs_root *root,
619 struct btrfs_path *path,
620 struct inode *dir,
621 struct btrfs_dir_item *di)
622{
623 struct inode *inode;
624 char *name;
625 int name_len;
626 struct extent_buffer *leaf;
627 struct btrfs_key location;
628 int ret;
629
630 leaf = path->nodes[0];
631
632 btrfs_dir_item_key_to_cpu(leaf, di, &location);
633 name_len = btrfs_dir_name_len(leaf, di);
634 name = kmalloc(name_len, GFP_NOFS);
635 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
636 btrfs_release_path(root, path);
637
638 inode = read_one_inode(root, location.objectid);
639 BUG_ON(!inode);
640
641 btrfs_inc_nlink(inode);
642 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
643 kfree(name);
644
645 iput(inode);
646 return ret;
647}
648
649/*
650 * helper function to see if a given name and sequence number found
651 * in an inode back reference are already in a directory and correctly
652 * point to this inode
653 */
654static noinline int inode_in_dir(struct btrfs_root *root,
655 struct btrfs_path *path,
656 u64 dirid, u64 objectid, u64 index,
657 const char *name, int name_len)
658{
659 struct btrfs_dir_item *di;
660 struct btrfs_key location;
661 int match = 0;
662
663 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
664 index, name, name_len, 0);
665 if (di && !IS_ERR(di)) {
666 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
667 if (location.objectid != objectid)
668 goto out;
669 } else
670 goto out;
671 btrfs_release_path(root, path);
672
673 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
674 if (di && !IS_ERR(di)) {
675 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
676 if (location.objectid != objectid)
677 goto out;
678 } else
679 goto out;
680 match = 1;
681out:
682 btrfs_release_path(root, path);
683 return match;
684}
685
686/*
687 * helper function to check a log tree for a named back reference in
688 * an inode. This is used to decide if a back reference that is
689 * found in the subvolume conflicts with what we find in the log.
690 *
691 * inode backreferences may have multiple refs in a single item.
692 * During replay we process one reference at a time, and we don't
693 * want to delete valid links to a file from the subvolume if that
694 * link is also in the log.
695 */
696static noinline int backref_in_log(struct btrfs_root *log,
697 struct btrfs_key *key,
698 char *name, int namelen)
699{
700 struct btrfs_path *path;
701 struct btrfs_inode_ref *ref;
702 unsigned long ptr;
703 unsigned long ptr_end;
704 unsigned long name_ptr;
705 int found_name_len;
706 int item_size;
707 int ret;
708 int match = 0;
709
710 path = btrfs_alloc_path();
711 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
712 if (ret != 0)
713 goto out;
714
715 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
716 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
717 ptr_end = ptr + item_size;
718 while (ptr < ptr_end) {
719 ref = (struct btrfs_inode_ref *)ptr;
720 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
721 if (found_name_len == namelen) {
722 name_ptr = (unsigned long)(ref + 1);
723 ret = memcmp_extent_buffer(path->nodes[0], name,
724 name_ptr, namelen);
725 if (ret == 0) {
726 match = 1;
727 goto out;
728 }
729 }
730 ptr = (unsigned long)(ref + 1) + found_name_len;
731 }
732out:
733 btrfs_free_path(path);
734 return match;
735}
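/*
 * Illustrative layout (not in the patch): an INODE_REF item packs
 * (struct btrfs_inode_ref, name bytes) pairs back to back; for two
 * hypothetical links "a" and "bb" it would look like
 *
 *	[ref index=5 name_len=1]['a'][ref index=9 name_len=2]['b']['b']
 *
 * which is why the scan above steps ptr by sizeof(*ref) + found_name_len.
 */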
736
737
738/*
739 * replay one inode back reference item found in the log tree.
740 * eb, slot and key refer to the buffer and key found in the log tree.
741 * root is the destination we are replaying into, and path is for temp
742 * use by this function. (it should be released on return).
743 */
744static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
745 struct btrfs_root *root,
746 struct btrfs_root *log,
747 struct btrfs_path *path,
748 struct extent_buffer *eb, int slot,
749 struct btrfs_key *key)
750{
751 struct inode *dir;
752 int ret;
753 struct btrfs_key location;
754 struct btrfs_inode_ref *ref;
755 struct btrfs_dir_item *di;
756 struct inode *inode;
757 char *name;
758 int namelen;
759 unsigned long ref_ptr;
760 unsigned long ref_end;
761
762 location.objectid = key->objectid;
763 location.type = BTRFS_INODE_ITEM_KEY;
764 location.offset = 0;
765
766 /*
767 * it is possible that we didn't log all the parent directories
768 * for a given inode. If we don't find the dir, just don't
769 * copy the back ref in. The link count fixup code will take
770 * care of the rest
771 */
772 dir = read_one_inode(root, key->offset);
773 if (!dir)
774 return -ENOENT;
775
776 inode = read_one_inode(root, key->objectid);
777 BUG_ON(!inode);
778
779 ref_ptr = btrfs_item_ptr_offset(eb, slot);
780 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
781
782again:
783 ref = (struct btrfs_inode_ref *)ref_ptr;
784
785 namelen = btrfs_inode_ref_name_len(eb, ref);
786 name = kmalloc(namelen, GFP_NOFS);
787 BUG_ON(!name);
788
789 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
790
791 /* if we already have a perfect match, we're done */
792 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
793 btrfs_inode_ref_index(eb, ref),
794 name, namelen)) {
795 goto out;
796 }
797
798 /*
799 * look for a conflicting back reference in the metadata.
800 * if we find one we have to unlink that name of the file
801 * before we add our new link. Later on, we overwrite any
802 * existing back reference, and we don't want to create
803 * dangling pointers in the directory.
804 */
805conflict_again:
806 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
807 if (ret == 0) {
808 char *victim_name;
809 int victim_name_len;
810 struct btrfs_inode_ref *victim_ref;
811 unsigned long ptr;
812 unsigned long ptr_end;
813 struct extent_buffer *leaf = path->nodes[0];
814
815 /* are we trying to overwrite a back ref for the root directory?
816 * If so, just jump out, we're done
817 */
818 if (key->objectid == key->offset)
819 goto out_nowrite;
820
821 /* check all the names in this back reference to see
822 * if they are in the log. If so, we allow them to stay;
823 * otherwise they must be unlinked as a conflict
824 */
825 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
826 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
827 while (ptr < ptr_end) {
828 victim_ref = (struct btrfs_inode_ref *)ptr;
829 victim_name_len = btrfs_inode_ref_name_len(leaf,
830 victim_ref);
831 victim_name = kmalloc(victim_name_len, GFP_NOFS);
832 BUG_ON(!victim_name);
833
834 read_extent_buffer(leaf, victim_name,
835 (unsigned long)(victim_ref + 1),
836 victim_name_len);
837
838 if (!backref_in_log(log, key, victim_name,
839 victim_name_len)) {
840 btrfs_inc_nlink(inode);
841 btrfs_release_path(root, path);
842 ret = btrfs_unlink_inode(trans, root, dir,
843 inode, victim_name,
844 victim_name_len);
845 kfree(victim_name);
846 btrfs_release_path(root, path);
847 goto conflict_again;
848 }
849 kfree(victim_name);
850 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
851 }
852 BUG_ON(ret);
853 }
854 btrfs_release_path(root, path);
855
856 /* look for a conflicting sequence number */
857 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
858 btrfs_inode_ref_index(eb, ref),
859 name, namelen, 0);
860 if (di && !IS_ERR(di)) {
861 ret = drop_one_dir_item(trans, root, path, dir, di);
862 BUG_ON(ret);
863 }
864 btrfs_release_path(root, path);
865
866
867 /* look for a conflicting name */
868 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
869 name, namelen, 0);
870 if (di && !IS_ERR(di)) {
871 ret = drop_one_dir_item(trans, root, path, dir, di);
872 BUG_ON(ret);
873 }
874 btrfs_release_path(root, path);
875
876 /* insert our name */
877 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
878 btrfs_inode_ref_index(eb, ref));
879 BUG_ON(ret);
880
881 btrfs_update_inode(trans, root, inode);
882
883out:
884 ref_ptr = (unsigned long)(ref + 1) + namelen;
885 kfree(name);
886 if (ref_ptr < ref_end)
887 goto again;
888
889 /* finally write the back reference in the inode */
890 ret = overwrite_item(trans, root, path, eb, slot, key);
891 BUG_ON(ret);
892
893out_nowrite:
894 btrfs_release_path(root, path);
895 iput(dir);
896 iput(inode);
897 return 0;
898}
899
900/*
901 * replay one csum item from the log tree into the subvolume 'root'
902 * eb, slot and key all refer to the log tree
903 * path is for temp use by this function and should be released on return
904 *
905 * This copies the checksums out of the log tree and inserts them into
906 * the subvolume. Any existing checksums for this range in the file
907 * are overwritten, and new items are added where required.
908 *
909 * We keep this simple by reusing the btrfs_ordered_sum code from
910 * the data=ordered mode. This basically means making a copy
911 * of all the checksums in ram, which we have to do anyway for kmap
912 * rules.
913 *
914 * The copy is then sent down to btrfs_csum_file_blocks, which
915 * does all the hard work of finding existing items in the file
916 * or adding new ones.
917 */
918static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
919 struct btrfs_root *root,
920 struct btrfs_path *path,
921 struct extent_buffer *eb, int slot,
922 struct btrfs_key *key)
923{
924 int ret;
925 u32 item_size = btrfs_item_size_nr(eb, slot);
926 u64 cur_offset;
927 unsigned long file_bytes;
928 struct btrfs_ordered_sum *sums;
929 struct btrfs_sector_sum *sector_sum;
930 struct inode *inode;
931 unsigned long ptr;
932
933 file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
934 inode = read_one_inode(root, key->objectid);
935 if (!inode)
936 return -EIO;
937
938
939 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
940 if (!sums) {
941 iput(inode);
942 return -ENOMEM;
943 }
944
945 INIT_LIST_HEAD(&sums->list);
946 sums->len = file_bytes;
947 sums->file_offset = key->offset;
948
949 /*
950 * copy all the sums into the ordered sum struct
951 */
952 sector_sum = sums->sums;
953 cur_offset = key->offset;
954 ptr = btrfs_item_ptr_offset(eb, slot);
955 while (item_size > 0) {
956 sector_sum->offset = cur_offset;
957 read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
958 sector_sum++;
959 item_size -= BTRFS_CRC32_SIZE;
960 ptr += BTRFS_CRC32_SIZE;
961 cur_offset += root->sectorsize;
962 }
963
964 /* let btrfs_csum_file_blocks add them into the file */
965 ret = btrfs_csum_file_blocks(trans, root, inode, sums);
966 BUG_ON(ret);
967 kfree(sums);
968 iput(inode);
969
970 return 0;
971}
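/*
 * Worked example (illustrative): with 4K sectors and crc32c checksums
 * (BTRFS_CRC32_SIZE == 4), a 128 byte csum item holds 128 / 4 = 32 sums,
 * so file_bytes above works out to 32 * 4096 = 128K of file data covered
 * by this single log item.
 */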
972/*
973 * There are a few corners where the link count of the file can't
974 * be properly maintained during replay. So, instead of adding
975 * lots of complexity to the log code, we just scan the backrefs
976 * for any file that has been through replay.
977 *
978 * The scan will update the link count on the inode to reflect the
979 * number of back refs found. If it goes down to zero, the iput
980 * will free the inode.
981 */
982static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
983 struct btrfs_root *root,
984 struct inode *inode)
985{
986 struct btrfs_path *path;
987 int ret;
988 struct btrfs_key key;
989 u64 nlink = 0;
990 unsigned long ptr;
991 unsigned long ptr_end;
992 int name_len;
993
994 key.objectid = inode->i_ino;
995 key.type = BTRFS_INODE_REF_KEY;
996 key.offset = (u64)-1;
997
998 path = btrfs_alloc_path();
999
1000 while (1) {
1001 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1002 if (ret < 0)
1003 break;
1004 if (ret > 0) {
1005 if (path->slots[0] == 0)
1006 break;
1007 path->slots[0]--;
1008 }
1009 btrfs_item_key_to_cpu(path->nodes[0], &key,
1010 path->slots[0]);
1011 if (key.objectid != inode->i_ino ||
1012 key.type != BTRFS_INODE_REF_KEY)
1013 break;
1014 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1015 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1016 path->slots[0]);
1017 while (ptr < ptr_end) {
1018 struct btrfs_inode_ref *ref;
1019
1020 ref = (struct btrfs_inode_ref *)ptr;
1021 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1022 ref);
1023 ptr = (unsigned long)(ref + 1) + name_len;
1024 nlink++;
1025 }
1026
1027 if (key.offset == 0)
1028 break;
1029 key.offset--;
1030 btrfs_release_path(root, path);
1031 }
1032 btrfs_free_path(path);
1033 if (nlink != inode->i_nlink) {
1034 inode->i_nlink = nlink;
1035 btrfs_update_inode(trans, root, inode);
1036 }
1037 BTRFS_I(inode)->index_cnt = (u64)-1;
1038
1039 return 0;
1040}
1041
1042static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1043 struct btrfs_root *root,
1044 struct btrfs_path *path)
1045{
1046 int ret;
1047 struct btrfs_key key;
1048 struct inode *inode;
1049
1050 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1051 key.type = BTRFS_ORPHAN_ITEM_KEY;
1052 key.offset = (u64)-1;
1053 while (1) {
1054 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1055 if (ret < 0)
1056 break;
1057
1058 if (ret == 1) {
1059 if (path->slots[0] == 0)
1060 break;
1061 path->slots[0]--;
1062 }
1063
1064 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1065 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1066 key.type != BTRFS_ORPHAN_ITEM_KEY)
1067 break;
1068
1069 ret = btrfs_del_item(trans, root, path);
1070 BUG_ON(ret);
1071
1072 btrfs_release_path(root, path);
1073 inode = read_one_inode(root, key.offset);
1074 BUG_ON(!inode);
1075
1076 ret = fixup_inode_link_count(trans, root, inode);
1077 BUG_ON(ret);
1078
1079 iput(inode);
1080
1081 if (key.offset == 0)
1082 break;
1083 key.offset--;
1084 }
1085 btrfs_release_path(root, path);
1086 return 0;
1087}
1088
1089
1090/*
1091 * record a given inode in the fixup dir so we can check its link
1092 * count when replay is done. The link count is incremented here
1093 * so the inode won't go away until we check it
1094 */
1095static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1096 struct btrfs_root *root,
1097 struct btrfs_path *path,
1098 u64 objectid)
1099{
1100 struct btrfs_key key;
1101 int ret = 0;
1102 struct inode *inode;
1103
1104 inode = read_one_inode(root, objectid);
1105 BUG_ON(!inode);
1106
1107 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1108 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1109 key.offset = objectid;
1110
1111 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1112
1113 btrfs_release_path(root, path);
1114 if (ret == 0) {
1115 btrfs_inc_nlink(inode);
1116 btrfs_update_inode(trans, root, inode);
1117 } else if (ret == -EEXIST) {
1118 ret = 0;
1119 } else {
1120 BUG();
1121 }
1122 iput(inode);
1123
1124 return ret;
1125}
1126
1127/*
1128 * when replaying the log for a directory, we only insert names
1129 * for inodes that actually exist. This means an fsync on a directory
1130 * does not implicitly fsync all the new files in it
1131 */
1132static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1133 struct btrfs_root *root,
1134 struct btrfs_path *path,
1135 u64 dirid, u64 index,
1136 char *name, int name_len, u8 type,
1137 struct btrfs_key *location)
1138{
1139 struct inode *inode;
1140 struct inode *dir;
1141 int ret;
1142
1143 inode = read_one_inode(root, location->objectid);
1144 if (!inode)
1145 return -ENOENT;
1146
1147 dir = read_one_inode(root, dirid);
1148 if (!dir) {
1149 iput(inode);
1150 return -EIO;
1151 }
1152 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1153
1154 /* FIXME, put inode into FIXUP list */
1155
1156 iput(inode);
1157 iput(dir);
1158 return ret;
1159}
1160
1161/*
1162 * take a single entry in a log directory item and replay it into
1163 * the subvolume.
1164 *
1165 * if a conflicting item exists in the subdirectory already,
1166 * the inode it points to is unlinked and put into the link count
1167 * fix up tree.
1168 *
1169 * If a name from the log points to a file or directory that does
1170 * not exist in the FS, it is skipped. fsyncs on directories
1171 * do not force down inodes inside that directory, just changes to the
1172 * names or unlinks in a directory.
1173 */
1174static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1175 struct btrfs_root *root,
1176 struct btrfs_path *path,
1177 struct extent_buffer *eb,
1178 struct btrfs_dir_item *di,
1179 struct btrfs_key *key)
1180{
1181 char *name;
1182 int name_len;
1183 struct btrfs_dir_item *dst_di;
1184 struct btrfs_key found_key;
1185 struct btrfs_key log_key;
1186 struct inode *dir;
1187 u8 log_type;
1188 int exists;
1189 int ret;
1190
1191 dir = read_one_inode(root, key->objectid);
1192 BUG_ON(!dir);
1193
1194 name_len = btrfs_dir_name_len(eb, di);
1195 name = kmalloc(name_len, GFP_NOFS);
1196 log_type = btrfs_dir_type(eb, di);
1197 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1198 name_len);
1199
1200 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1201 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1202 if (exists == 0)
1203 exists = 1;
1204 else
1205 exists = 0;
1206 btrfs_release_path(root, path);
1207
1208 if (key->type == BTRFS_DIR_ITEM_KEY) {
1209 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1210 name, name_len, 1);
1211 }
1212 else if (key->type == BTRFS_DIR_INDEX_KEY) {
1213 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1214 key->objectid,
1215 key->offset, name,
1216 name_len, 1);
1217 } else {
1218 BUG();
1219 }
1220 if (!dst_di || IS_ERR(dst_di)) {
1221 /* we need a sequence number to insert, so we only
1222 * do inserts for the BTRFS_DIR_INDEX_KEY types
1223 */
1224 if (key->type != BTRFS_DIR_INDEX_KEY)
1225 goto out;
1226 goto insert;
1227 }
1228
1229 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1230 /* the existing item matches the logged item */
1231 if (found_key.objectid == log_key.objectid &&
1232 found_key.type == log_key.type &&
1233 found_key.offset == log_key.offset &&
1234 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1235 goto out;
1236 }
1237
1238 /*
1239 * don't drop the conflicting directory entry if the inode
1240 * for the new entry doesn't exist
1241 */
1242 if (!exists)
1243 goto out;
1244
1245 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1246 BUG_ON(ret);
1247
1248 if (key->type == BTRFS_DIR_INDEX_KEY)
1249 goto insert;
1250out:
1251 btrfs_release_path(root, path);
1252 kfree(name);
1253 iput(dir);
1254 return 0;
1255
1256insert:
1257 btrfs_release_path(root, path);
1258 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1259 name, name_len, log_type, &log_key);
1260
1261 if (ret && ret != -ENOENT)
1262 BUG();
1263 goto out;
1264}
1265
1266/*
1267 * find all the names in a directory item and reconcile them into
1268 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1269 * one name in a directory item, but the same code gets used for
1270 * both directory index types
1271 */
1272static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1273 struct btrfs_root *root,
1274 struct btrfs_path *path,
1275 struct extent_buffer *eb, int slot,
1276 struct btrfs_key *key)
1277{
1278 int ret;
1279 u32 item_size = btrfs_item_size_nr(eb, slot);
1280 struct btrfs_dir_item *di;
1281 int name_len;
1282 unsigned long ptr;
1283 unsigned long ptr_end;
1284
1285 ptr = btrfs_item_ptr_offset(eb, slot);
1286 ptr_end = ptr + item_size;
1287 while (ptr < ptr_end) {
1288 di = (struct btrfs_dir_item *)ptr;
1289 name_len = btrfs_dir_name_len(eb, di);
1290 ret = replay_one_name(trans, root, path, eb, di, key);
1291 BUG_ON(ret);
1292 ptr = (unsigned long)(di + 1);
1293 ptr += name_len;
1294 }
1295 return 0;
1296}
1297
1298/*
1299 * directory replay has two parts. There are the standard directory
1300 * items in the log copied from the subvolume, and range items
1301 * created in the log while the subvolume was logged.
1302 *
1303 * The range items tell us which parts of the key space the log
1304 * is authoritative for. During replay, if a key in the subvolume
1305 * directory is in a logged range item, but not actually in the log
1306 * that means it was deleted from the directory before the fsync
1307 * and should be removed.
1308 */
1309static noinline int find_dir_range(struct btrfs_root *root,
1310 struct btrfs_path *path,
1311 u64 dirid, int key_type,
1312 u64 *start_ret, u64 *end_ret)
1313{
1314 struct btrfs_key key;
1315 u64 found_end;
1316 struct btrfs_dir_log_item *item;
1317 int ret;
1318 int nritems;
1319
1320 if (*start_ret == (u64)-1)
1321 return 1;
1322
1323 key.objectid = dirid;
1324 key.type = key_type;
1325 key.offset = *start_ret;
1326
1327 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1328 if (ret < 0)
1329 goto out;
1330 if (ret > 0) {
1331 if (path->slots[0] == 0)
1332 goto out;
1333 path->slots[0]--;
1334 }
1335 if (ret != 0)
1336 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1337
1338 if (key.type != key_type || key.objectid != dirid) {
1339 ret = 1;
1340 goto next;
1341 }
1342 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1343 struct btrfs_dir_log_item);
1344 found_end = btrfs_dir_log_end(path->nodes[0], item);
1345
1346 if (*start_ret >= key.offset && *start_ret <= found_end) {
1347 ret = 0;
1348 *start_ret = key.offset;
1349 *end_ret = found_end;
1350 goto out;
1351 }
1352 ret = 1;
1353next:
1354 /* check the next slot in the tree to see if it is a valid item */
1355 nritems = btrfs_header_nritems(path->nodes[0]);
1356 if (path->slots[0] >= nritems) {
1357 ret = btrfs_next_leaf(root, path);
1358 if (ret)
1359 goto out;
1360 } else {
1361 path->slots[0]++;
1362 }
1363
1364 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1365
1366 if (key.type != key_type || key.objectid != dirid) {
1367 ret = 1;
1368 goto out;
1369 }
1370 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1371 struct btrfs_dir_log_item);
1372 found_end = btrfs_dir_log_end(path->nodes[0], item);
1373 *start_ret = key.offset;
1374 *end_ret = found_end;
1375 ret = 0;
1376out:
1377 btrfs_release_path(root, path);
1378 return ret;
1379}
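/*
 * Worked example (illustrative): suppose the log holds dir_log items for
 * this dirid covering offsets [0, 100] and [200, (u64)-1]. A subvolume
 * entry at offset 150 falls in neither range, so the log says nothing
 * about it and it is left alone; an entry at offset 50 with no matching
 * log item was deleted before the fsync, and the caller below unlinks it.
 */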
1380
1381/*
1382 * this looks for a given directory item in the log. If the directory
1383 * item is not in the log, the item is removed and the inode it points
1384 * to is unlinked
1385 */
1386static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1387 struct btrfs_root *root,
1388 struct btrfs_root *log,
1389 struct btrfs_path *path,
1390 struct btrfs_path *log_path,
1391 struct inode *dir,
1392 struct btrfs_key *dir_key)
1393{
1394 int ret;
1395 struct extent_buffer *eb;
1396 int slot;
1397 u32 item_size;
1398 struct btrfs_dir_item *di;
1399 struct btrfs_dir_item *log_di;
1400 int name_len;
1401 unsigned long ptr;
1402 unsigned long ptr_end;
1403 char *name;
1404 struct inode *inode;
1405 struct btrfs_key location;
1406
1407again:
1408 eb = path->nodes[0];
1409 slot = path->slots[0];
1410 item_size = btrfs_item_size_nr(eb, slot);
1411 ptr = btrfs_item_ptr_offset(eb, slot);
1412 ptr_end = ptr + item_size;
1413 while (ptr < ptr_end) {
1414 di = (struct btrfs_dir_item *)ptr;
1415 name_len = btrfs_dir_name_len(eb, di);
1416 name = kmalloc(name_len, GFP_NOFS);
1417 if (!name) {
1418 ret = -ENOMEM;
1419 goto out;
1420 }
1421 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1422 name_len);
1423 log_di = NULL;
1424 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1425 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1426 dir_key->objectid,
1427 name, name_len, 0);
1428 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1429 log_di = btrfs_lookup_dir_index_item(trans, log,
1430 log_path,
1431 dir_key->objectid,
1432 dir_key->offset,
1433 name, name_len, 0);
1434 }
1435 if (!log_di || IS_ERR(log_di)) {
1436 btrfs_dir_item_key_to_cpu(eb, di, &location);
1437 btrfs_release_path(root, path);
1438 btrfs_release_path(log, log_path);
1439 inode = read_one_inode(root, location.objectid);
1440 BUG_ON(!inode);
1441
1442 ret = link_to_fixup_dir(trans, root,
1443 path, location.objectid);
1444 BUG_ON(ret);
1445 btrfs_inc_nlink(inode);
1446 ret = btrfs_unlink_inode(trans, root, dir, inode,
1447 name, name_len);
1448 BUG_ON(ret);
1449 kfree(name);
1450 iput(inode);
1451
1452 /* there might still be more names under this key;
1453 * check and repeat if required
1454 */
1455 ret = btrfs_search_slot(NULL, root, dir_key, path,
1456 0, 0);
1457 if (ret == 0)
1458 goto again;
1459 ret = 0;
1460 goto out;
1461 }
1462 btrfs_release_path(log, log_path);
1463 kfree(name);
1464
1465 ptr = (unsigned long)(di + 1);
1466 ptr += name_len;
1467 }
1468 ret = 0;
1469out:
1470 btrfs_release_path(root, path);
1471 btrfs_release_path(log, log_path);
1472 return ret;
1473}
1474
1475/*
1476 * deletion replay happens before we copy any new directory items
1477 * out of the log or out of backreferences from inodes. It
1478 * scans the log to find ranges of keys that the log is authoritative for,
1479 * and then scans the directory to find items in those ranges that are
1480 * not present in the log.
1481 *
1482 * Anything we don't find in the log is unlinked and removed from the
1483 * directory.
1484 */
1485static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1486 struct btrfs_root *root,
1487 struct btrfs_root *log,
1488 struct btrfs_path *path,
1489 u64 dirid)
1490{
1491 u64 range_start;
1492 u64 range_end;
1493 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1494 int ret = 0;
1495 struct btrfs_key dir_key;
1496 struct btrfs_key found_key;
1497 struct btrfs_path *log_path;
1498 struct inode *dir;
1499
1500 dir_key.objectid = dirid;
1501 dir_key.type = BTRFS_DIR_ITEM_KEY;
1502 log_path = btrfs_alloc_path();
1503 if (!log_path)
1504 return -ENOMEM;
1505
1506 dir = read_one_inode(root, dirid);
1507 /* it isn't an error if the inode isn't there; that can happen
1508 * because we replay the deletes before we copy in the inode item
1509 * from the log
1510 */
1511 if (!dir) {
1512 btrfs_free_path(log_path);
1513 return 0;
1514 }
1515again:
1516 range_start = 0;
1517 range_end = 0;
1518 while(1) {
1519 ret = find_dir_range(log, path, dirid, key_type,
1520 &range_start, &range_end);
1521 if (ret != 0)
1522 break;
1523
1524 dir_key.offset = range_start;
1525 while(1) {
1526 int nritems;
1527 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1528 0, 0);
1529 if (ret < 0)
1530 goto out;
1531
1532 nritems = btrfs_header_nritems(path->nodes[0]);
1533 if (path->slots[0] >= nritems) {
1534 ret = btrfs_next_leaf(root, path);
1535 if (ret)
1536 break;
1537 }
1538 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1539 path->slots[0]);
1540 if (found_key.objectid != dirid ||
1541 found_key.type != dir_key.type)
1542 goto next_type;
1543
1544 if (found_key.offset > range_end)
1545 break;
1546
1547 ret = check_item_in_log(trans, root, log, path,
1548 log_path, dir, &found_key);
1549 BUG_ON(ret);
1550 if (found_key.offset == (u64)-1)
1551 break;
1552 dir_key.offset = found_key.offset + 1;
1553 }
1554 btrfs_release_path(root, path);
1555 if (range_end == (u64)-1)
1556 break;
1557 range_start = range_end + 1;
1558 }
1559
1560next_type:
1561 ret = 0;
1562 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1563 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1564 dir_key.type = BTRFS_DIR_INDEX_KEY;
1565 btrfs_release_path(root, path);
1566 goto again;
1567 }
1568out:
1569 btrfs_release_path(root, path);
1570 btrfs_free_path(log_path);
1571 iput(dir);
1572 return ret;
1573}
1574
1575/*
1576 * the process_func used to replay items from the log tree. This
1577 * gets called in two different stages. The first stage just looks
1578 * for inodes and makes sure they are all copied into the subvolume.
1579 *
1580 * The second stage copies all the other item types from the log into
1581 * the subvolume. The two stage approach is slower, but gets rid of
1582 * lots of complexity around inodes referencing other inodes that exist
1583 * only in the log (references come from either directory items or inode
1584 * back refs).
1585 */
1586static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1587 struct walk_control *wc, u64 gen)
1588{
1589 int nritems;
1590 struct btrfs_path *path;
1591 struct btrfs_root *root = wc->replay_dest;
1592 struct btrfs_key key;
1593 u32 item_size;
1594 int level;
1595 int i;
1596 int ret;
1597
1598 btrfs_read_buffer(eb, gen);
1599
1600 level = btrfs_header_level(eb);
1601
1602 if (level != 0)
1603 return 0;
1604
1605 path = btrfs_alloc_path();
1606 BUG_ON(!path);
1607
1608 nritems = btrfs_header_nritems(eb);
1609 for (i = 0; i < nritems; i++) {
1610 btrfs_item_key_to_cpu(eb, &key, i);
1611 item_size = btrfs_item_size_nr(eb, i);
1612
1613 /* inode keys are done during the first stage */
1614 if (key.type == BTRFS_INODE_ITEM_KEY &&
1615 wc->stage == LOG_WALK_REPLAY_INODES) {
1616 struct inode *inode;
1617 struct btrfs_inode_item *inode_item;
1618 u32 mode;
1619
1620 inode_item = btrfs_item_ptr(eb, i,
1621 struct btrfs_inode_item);
1622 mode = btrfs_inode_mode(eb, inode_item);
1623 if (S_ISDIR(mode)) {
1624 ret = replay_dir_deletes(wc->trans,
1625 root, log, path, key.objectid);
1626 BUG_ON(ret);
1627 }
1628 ret = overwrite_item(wc->trans, root, path,
1629 eb, i, &key);
1630 BUG_ON(ret);
1631
1632 /* for regular files, truncate away
1633 * extents past the new EOF
1634 */
1635 if (S_ISREG(mode)) {
1636 inode = read_one_inode(root,
1637 key.objectid);
1638 BUG_ON(!inode);
1639
1640 ret = btrfs_truncate_inode_items(wc->trans,
1641 root, inode, inode->i_size,
1642 BTRFS_EXTENT_DATA_KEY);
1643 BUG_ON(ret);
1644 iput(inode);
1645 }
1646 ret = link_to_fixup_dir(wc->trans, root,
1647 path, key.objectid);
1648 BUG_ON(ret);
1649 }
1650 if (wc->stage < LOG_WALK_REPLAY_ALL)
1651 continue;
1652
1653 /* these keys are simply copied */
1654 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1655 ret = overwrite_item(wc->trans, root, path,
1656 eb, i, &key);
1657 BUG_ON(ret);
1658 } else if (key.type == BTRFS_INODE_REF_KEY) {
1659 ret = add_inode_ref(wc->trans, root, log, path,
1660 eb, i, &key);
1661 BUG_ON(ret && ret != -ENOENT);
1662 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1663 ret = replay_one_extent(wc->trans, root, path,
1664 eb, i, &key);
1665 BUG_ON(ret);
1666 } else if (key.type == BTRFS_CSUM_ITEM_KEY) {
1667 ret = replay_one_csum(wc->trans, root, path,
1668 eb, i, &key);
1669 BUG_ON(ret);
1670 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1671 key.type == BTRFS_DIR_INDEX_KEY) {
1672 ret = replay_one_dir_item(wc->trans, root, path,
1673 eb, i, &key);
1674 BUG_ON(ret);
1675 }
1676 }
1677 btrfs_free_path(path);
1678 return 0;
1679}
1680
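/*
 * walk down the log tree from the current spot in 'path', handing every
 * block we find to wc->process_func.  Blocks referenced from level 1
 * nodes are processed without being added to the path; when wc->free is
 * set, processed blocks are also cleaned, waited on, and their reserved
 * extents returned to the allocator.  Once a node runs out of slots it
 * is processed itself and *level is bumped up by one.
 */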
1681static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
1682 struct btrfs_root *root,
1683 struct btrfs_path *path, int *level,
1684 struct walk_control *wc)
1685{
1686 u64 root_owner;
1687 u64 root_gen;
1688 u64 bytenr;
1689 u64 ptr_gen;
1690 struct extent_buffer *next;
1691 struct extent_buffer *cur;
1692 struct extent_buffer *parent;
1693 u32 blocksize;
1694 int ret = 0;
1695
1696 WARN_ON(*level < 0);
1697 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1698
1699 while(*level > 0) {
1700 WARN_ON(*level < 0);
1701 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1702 cur = path->nodes[*level];
1703
1704 if (btrfs_header_level(cur) != *level)
1705 WARN_ON(1);
1706
1707 if (path->slots[*level] >=
1708 btrfs_header_nritems(cur))
1709 break;
1710
1711 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1712 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1713 blocksize = btrfs_level_size(root, *level - 1);
1714
1715 parent = path->nodes[*level];
1716 root_owner = btrfs_header_owner(parent);
1717 root_gen = btrfs_header_generation(parent);
1718
1719 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1720
1721 wc->process_func(root, next, wc, ptr_gen);
1722
1723 if (*level == 1) {
1724 path->slots[*level]++;
1725 if (wc->free) {
1726 btrfs_read_buffer(next, ptr_gen);
1727
1728 btrfs_tree_lock(next);
1729 clean_tree_block(trans, root, next);
1730 btrfs_wait_tree_block_writeback(next);
1731 btrfs_tree_unlock(next);
1732
1733 ret = btrfs_drop_leaf_ref(trans, root, next);
1734 BUG_ON(ret);
1735
1736 WARN_ON(root_owner !=
1737 BTRFS_TREE_LOG_OBJECTID);
1738 ret = btrfs_free_reserved_extent(root,
1739 bytenr, blocksize);
1740 BUG_ON(ret);
1741 }
1742 free_extent_buffer(next);
1743 continue;
1744 }
1745 btrfs_read_buffer(next, ptr_gen);
1746
1747 WARN_ON(*level <= 0);
1748 if (path->nodes[*level-1])
1749 free_extent_buffer(path->nodes[*level-1]);
1750 path->nodes[*level-1] = next;
1751 *level = btrfs_header_level(next);
1752 path->slots[*level] = 0;
1753 cond_resched();
1754 }
1755 WARN_ON(*level < 0);
1756 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1757
1758 if (path->nodes[*level] == root->node) {
1759 parent = path->nodes[*level];
1760 } else {
1761 parent = path->nodes[*level + 1];
1762 }
1763 bytenr = path->nodes[*level]->start;
1764
1765 blocksize = btrfs_level_size(root, *level);
1766 root_owner = btrfs_header_owner(parent);
1767 root_gen = btrfs_header_generation(parent);
1768
1769 wc->process_func(root, path->nodes[*level], wc,
1770 btrfs_header_generation(path->nodes[*level]));
1771
1772 if (wc->free) {
1773 next = path->nodes[*level];
1774 btrfs_tree_lock(next);
1775 clean_tree_block(trans, root, next);
1776 btrfs_wait_tree_block_writeback(next);
1777 btrfs_tree_unlock(next);
1778
1779 if (*level == 0) {
1780 ret = btrfs_drop_leaf_ref(trans, root, next);
1781 BUG_ON(ret);
1782 }
1783 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1784 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1785 BUG_ON(ret);
1786 }
1787 free_extent_buffer(path->nodes[*level]);
1788 path->nodes[*level] = NULL;
1789 *level += 1;
1790
1791 cond_resched();
1792 return 0;
1793}
1794
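/*
 * walk back up the log tree looking for the lowest node that still has
 * unvisited slots.  Every node that turns out to be fully visited is
 * handed to wc->process_func (and freed when wc->free is set) on the
 * way up.  Returns 0 when there is another slot to descend into, and 1
 * once the entire tree has been covered.
 */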
1795static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
1796 struct btrfs_root *root,
1797 struct btrfs_path *path, int *level,
1798 struct walk_control *wc)
1799{
1800 u64 root_owner;
1801 u64 root_gen;
1802 int i;
1803 int slot;
1804 int ret;
1805
1806 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1807 slot = path->slots[i];
1808 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1809 struct extent_buffer *node;
1810 node = path->nodes[i];
1811 path->slots[i]++;
1812 *level = i;
1813 WARN_ON(*level == 0);
1814 return 0;
1815 } else {
1816 struct extent_buffer *parent;
1817 if (path->nodes[*level] == root->node)
1818 parent = path->nodes[*level];
1819 else
1820 parent = path->nodes[*level + 1];
1821
1822 root_owner = btrfs_header_owner(parent);
1823 root_gen = btrfs_header_generation(parent);
1824 wc->process_func(root, path->nodes[*level], wc,
1825 btrfs_header_generation(path->nodes[*level]));
1826 if (wc->free) {
1827 struct extent_buffer *next;
1828
1829 next = path->nodes[*level];
1830
1831 btrfs_tree_lock(next);
1832 clean_tree_block(trans, root, next);
1833 btrfs_wait_tree_block_writeback(next);
1834 btrfs_tree_unlock(next);
1835
1836 if (*level == 0) {
1837 ret = btrfs_drop_leaf_ref(trans, root,
1838 next);
1839 BUG_ON(ret);
1840 }
1841
1842 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1843 ret = btrfs_free_reserved_extent(root,
1844 path->nodes[*level]->start,
1845 path->nodes[*level]->len);
1846 BUG_ON(ret);
1847 }
1848 free_extent_buffer(path->nodes[*level]);
1849 path->nodes[*level] = NULL;
1850 *level = i + 1;
1851 }
1852 }
1853 return 1;
1854}
1855
1856/*
1857 * walk the whole log tree rooted at 'log', calling wc->process_func on
1858 * every block.  When wc->free is set, each block is also cleaned and
1859 * its reserved extent returned to the allocator after processing.
1860 */
1861static int walk_log_tree(struct btrfs_trans_handle *trans,
1862 struct btrfs_root *log, struct walk_control *wc)
1863{
1864 int ret = 0;
1865 int wret;
1866 int level;
1867 struct btrfs_path *path;
1868 int i;
1869 int orig_level;
1870
1871 path = btrfs_alloc_path();
1872 BUG_ON(!path);
1873
1874 level = btrfs_header_level(log->node);
1875 orig_level = level;
1876 path->nodes[level] = log->node;
1877 extent_buffer_get(log->node);
1878 path->slots[level] = 0;
1879
1880 while(1) {
1881 wret = walk_down_log_tree(trans, log, path, &level, wc);
1882 if (wret > 0)
1883 break;
1884 if (wret < 0)
1885 ret = wret;
1886
1887 wret = walk_up_log_tree(trans, log, path, &level, wc);
1888 if (wret > 0)
1889 break;
1890 if (wret < 0)
1891 ret = wret;
1892 }
1893
1894 /* was the root node processed? if not, catch it here */
1895 if (path->nodes[orig_level]) {
1896 wc->process_func(log, path->nodes[orig_level], wc,
1897 btrfs_header_generation(path->nodes[orig_level]));
1898 if (wc->free) {
1899 struct extent_buffer *next;
1900
1901 next = path->nodes[orig_level];
1902
1903 btrfs_tree_lock(next);
1904 clean_tree_block(trans, log, next);
1905 btrfs_wait_tree_block_writeback(next);
1906 btrfs_tree_unlock(next);
1907
1908 if (orig_level == 0) {
1909 ret = btrfs_drop_leaf_ref(trans, log,
1910 next);
1911 BUG_ON(ret);
1912 }
1913 WARN_ON(log->root_key.objectid !=
1914 BTRFS_TREE_LOG_OBJECTID);
1915 ret = btrfs_free_reserved_extent(log, next->start,
1916 next->len);
1917 BUG_ON(ret);
1918 }
1919 }
1920
1921 for (i = 0; i <= orig_level; i++) {
1922 if (path->nodes[i]) {
1923 free_extent_buffer(path->nodes[i]);
1924 path->nodes[i] = NULL;
1925 }
1926 }
1927 btrfs_free_path(path);
1928 if (wc->free)
1929 free_extent_buffer(log->node);
1930 return ret;
1931}
1932
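/*
 * wait for an in-flight commit of the current log transid to finish.
 * The caller must hold tree_log_mutex; it is dropped while we sleep and
 * retaken before returning.
 */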
1933int wait_log_commit(struct btrfs_root *log)
1934{
1935 DEFINE_WAIT(wait);
1936 u64 transid = log->fs_info->tree_log_transid;
1937
1938 do {
1939 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1940 TASK_UNINTERRUPTIBLE);
1941 mutex_unlock(&log->fs_info->tree_log_mutex);
1942 if (atomic_read(&log->fs_info->tree_log_commit))
1943 schedule();
1944 finish_wait(&log->fs_info->tree_log_wait, &wait);
1945 mutex_lock(&log->fs_info->tree_log_mutex);
1946 } while(transid == log->fs_info->tree_log_transid &&
1947 atomic_read(&log->fs_info->tree_log_commit));
1948 return 0;
1949}
1950
1951/*
1952 * btrfs_sync_log sends a given tree log down to the disk and
1953 * updates the super blocks to record it. When this call is done,
1954 * you know that any inodes previously logged are safely on disk
1955 */
1956int btrfs_sync_log(struct btrfs_trans_handle *trans,
1957 struct btrfs_root *root)
1958{
1959 int ret;
1960 unsigned long batch;
1961 struct btrfs_root *log = root->log_root;
1962
1963 mutex_lock(&log->fs_info->tree_log_mutex);
1964 if (atomic_read(&log->fs_info->tree_log_commit)) {
1965 wait_log_commit(log);
1966 goto out;
1967 }
1968 atomic_set(&log->fs_info->tree_log_commit, 1);
1969
1970 while(1) {
1971 batch = log->fs_info->tree_log_batch;
1972 mutex_unlock(&log->fs_info->tree_log_mutex);
1973 schedule_timeout_uninterruptible(1);
1974 mutex_lock(&log->fs_info->tree_log_mutex);
1975
1976 while(atomic_read(&log->fs_info->tree_log_writers)) {
1977 DEFINE_WAIT(wait);
1978 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1979 TASK_UNINTERRUPTIBLE);
1980 mutex_unlock(&log->fs_info->tree_log_mutex);
1981 if (atomic_read(&log->fs_info->tree_log_writers))
1982 schedule();
1983 mutex_lock(&log->fs_info->tree_log_mutex);
1984 finish_wait(&log->fs_info->tree_log_wait, &wait);
1985 }
1986 if (batch == log->fs_info->tree_log_batch)
1987 break;
1988 }
1989
1990 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1991 BUG_ON(ret);
1992 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1993 &root->fs_info->log_root_tree->dirty_log_pages);
1994 BUG_ON(ret);
1995
1996 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1997 log->fs_info->log_root_tree->node->start);
1998 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1999 btrfs_header_level(log->fs_info->log_root_tree->node));
2000
2001 write_ctree_super(trans, log->fs_info->tree_root);
2002 log->fs_info->tree_log_transid++;
2003 log->fs_info->tree_log_batch = 0;
2004 atomic_set(&log->fs_info->tree_log_commit, 0);
2005 smp_mb();
2006 if (waitqueue_active(&log->fs_info->tree_log_wait))
2007 wake_up(&log->fs_info->tree_log_wait);
2008out:
2009 mutex_unlock(&log->fs_info->tree_log_mutex);
2010 return 0;
2012}
2013
2014/* free all the extents used by the tree log. This should be called
2015 * at commit time of the full transaction
2016 */
2017int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2018{
2019 int ret;
2020 struct btrfs_root *log;
2022 u64 start;
2023 u64 end;
2024 struct walk_control wc = {
2025 .free = 1,
2026 .process_func = process_one_buffer
2027 };
2028
2029 if (!root->log_root)
2030 return 0;
2031
2032 log = root->log_root;
2033 ret = walk_log_tree(trans, log, &wc);
2034 BUG_ON(ret);
2035
2036 while(1) {
2037 ret = find_first_extent_bit(&log->dirty_log_pages,
2038 0, &start, &end, EXTENT_DIRTY);
2039 if (ret)
2040 break;
2041
2042 clear_extent_dirty(&log->dirty_log_pages,
2043 start, end, GFP_NOFS);
2044 }
2045
2047 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2048 &log->root_key);
2049 BUG_ON(ret);
2050	root->log_root = NULL;
2051	kfree(log);
2052 return 0;
2053}
2054
2055/*
2056 * helper function to update the item for a given subvolumes log root
2057 * in the tree of log roots
2058 */
2059static int update_log_root(struct btrfs_trans_handle *trans,
2060 struct btrfs_root *log)
2061{
2062 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2063 int ret;
2064
2065 if (log->node->start == bytenr)
2066 return 0;
2067
2068 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2069 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2070 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2071 &log->root_key, &log->root_item);
2072 BUG_ON(ret);
2073 return ret;
2074}
2075
2076/*
2077 * If both a file and directory are logged, and unlinks or renames are
2078 * mixed in, we have a few interesting corners:
2079 *
2080 * create file X in dir Y
2081 * link file X to X.link in dir Y
2082 * fsync file X
2083 * unlink file X but leave X.link
2084 * fsync dir Y
2085 *
2086 * After a crash we would expect only X.link to exist. But file X
2087 * didn't get fsync'd again so the log has back refs for X and X.link.
2088 *
2089 * We solve this by removing directory entries and inode backrefs from the
2090 * log when a file that was logged in the current transaction is
2091 * unlinked. Any later fsync will include the updated log entries, and
2092 * we'll be able to reconstruct the proper directory items from backrefs.
2093 *
2094 * This optimization allows us to avoid relogging the entire inode
2095 * or the entire directory.
2096 */
2097int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2098 struct btrfs_root *root,
2099 const char *name, int name_len,
2100 struct inode *dir, u64 index)
2101{
2102 struct btrfs_root *log;
2103 struct btrfs_dir_item *di;
2104 struct btrfs_path *path;
2105 int ret;
2106 int bytes_del = 0;
2107
2108 if (BTRFS_I(dir)->logged_trans < trans->transid)
2109 return 0;
2110
2111 ret = join_running_log_trans(root);
2112 if (ret)
2113 return 0;
2114
2115 mutex_lock(&BTRFS_I(dir)->log_mutex);
2116
2117 log = root->log_root;
2118 path = btrfs_alloc_path();
2119 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2120 name, name_len, -1);
2121 if (di && !IS_ERR(di)) {
2122 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2123 bytes_del += name_len;
2124 BUG_ON(ret);
2125 }
2126 btrfs_release_path(log, path);
2127 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2128 index, name, name_len, -1);
2129 if (di && !IS_ERR(di)) {
2130 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2131 bytes_del += name_len;
2132 BUG_ON(ret);
2133 }
2134
2135 /* update the directory size in the log to reflect the names
2136 * we have removed
2137 */
2138 if (bytes_del) {
2139 struct btrfs_key key;
2140
2141 key.objectid = dir->i_ino;
2142 key.offset = 0;
2143 key.type = BTRFS_INODE_ITEM_KEY;
2144 btrfs_release_path(log, path);
2145
2146 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2147 if (ret == 0) {
2148 struct btrfs_inode_item *item;
2149 u64 i_size;
2150
2151 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2152 struct btrfs_inode_item);
2153 i_size = btrfs_inode_size(path->nodes[0], item);
2154 if (i_size > bytes_del)
2155 i_size -= bytes_del;
2156 else
2157 i_size = 0;
2158 btrfs_set_inode_size(path->nodes[0], item, i_size);
2159 btrfs_mark_buffer_dirty(path->nodes[0]);
2160 } else
2161 ret = 0;
2162 btrfs_release_path(log, path);
2163 }
2164
2165 btrfs_free_path(path);
2166 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2167 end_log_trans(root);
2168
2169 return 0;
2170}
2171
2172/* see comments for btrfs_del_dir_entries_in_log */
2173int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2174 struct btrfs_root *root,
2175 const char *name, int name_len,
2176 struct inode *inode, u64 dirid)
2177{
2178 struct btrfs_root *log;
2179 u64 index;
2180 int ret;
2181
2182 if (BTRFS_I(inode)->logged_trans < trans->transid)
2183 return 0;
2184
2185 ret = join_running_log_trans(root);
2186 if (ret)
2187 return 0;
2188 log = root->log_root;
2189 mutex_lock(&BTRFS_I(inode)->log_mutex);
2190
2191 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2192 dirid, &index);
2193 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2194 end_log_trans(root);
2195
2196 return ret;
2197}
2198
2199/*
2200 * creates a range item in the log for 'dirid'. first_offset and
2201 * last_offset tell us which parts of the key space the log should
2202 * be considered authoritative for.
2203 */
2204static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2205 struct btrfs_root *log,
2206 struct btrfs_path *path,
2207 int key_type, u64 dirid,
2208 u64 first_offset, u64 last_offset)
2209{
2210 int ret;
2211 struct btrfs_key key;
2212 struct btrfs_dir_log_item *item;
2213
2214 key.objectid = dirid;
2215 key.offset = first_offset;
2216 if (key_type == BTRFS_DIR_ITEM_KEY)
2217 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2218 else
2219 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2220 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2221 BUG_ON(ret);
2222
2223 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2224 struct btrfs_dir_log_item);
2225 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2226 btrfs_mark_buffer_dirty(path->nodes[0]);
2227 btrfs_release_path(log, path);
2228 return 0;
2229}
2230
2231/*
2232 * log all the items included in the current transaction for a given
2233 * directory. This also creates the range items in the log tree required
2234 * to replay anything deleted before the fsync
2235 */
2236static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2237 struct btrfs_root *root, struct inode *inode,
2238 struct btrfs_path *path,
2239 struct btrfs_path *dst_path, int key_type,
2240 u64 min_offset, u64 *last_offset_ret)
2241{
2242 struct btrfs_key min_key;
2243 struct btrfs_key max_key;
2244 struct btrfs_root *log = root->log_root;
2245 struct extent_buffer *src;
2246 int ret;
2247 int i;
2248 int nritems;
2249 u64 first_offset = min_offset;
2250 u64 last_offset = (u64)-1;
2251
2252 log = root->log_root;
2253 max_key.objectid = inode->i_ino;
2254 max_key.offset = (u64)-1;
2255 max_key.type = key_type;
2256
2257 min_key.objectid = inode->i_ino;
2258 min_key.type = key_type;
2259 min_key.offset = min_offset;
2260
2261 path->keep_locks = 1;
2262
2263 ret = btrfs_search_forward(root, &min_key, &max_key,
2264 path, 0, trans->transid);
2265
2266 /*
2267 * we didn't find anything from this transaction, see if there
2268 * is anything at all
2269 */
2270 if (ret != 0 || min_key.objectid != inode->i_ino ||
2271 min_key.type != key_type) {
2272 min_key.objectid = inode->i_ino;
2273 min_key.type = key_type;
2274 min_key.offset = (u64)-1;
2275 btrfs_release_path(root, path);
2276 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2277 if (ret < 0) {
2278 btrfs_release_path(root, path);
2279 return ret;
2280 }
2281 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2282
2283 /* if ret == 0 there are items for this type,
2284 * create a range to tell us the last key of this type.
2285 * otherwise, there are no items in this directory after
2286 * *min_offset, and we create a range to indicate that.
2287 */
2288 if (ret == 0) {
2289 struct btrfs_key tmp;
2290 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2291 path->slots[0]);
2292 if (key_type == tmp.type) {
2293 first_offset = max(min_offset, tmp.offset) + 1;
2294 }
2295 }
2296 goto done;
2297 }
2298
2299 /* go backward to find any previous key */
2300 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2301 if (ret == 0) {
2302 struct btrfs_key tmp;
2303 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2304 if (key_type == tmp.type) {
2305 first_offset = tmp.offset;
2306 ret = overwrite_item(trans, log, dst_path,
2307 path->nodes[0], path->slots[0],
2308 &tmp);
2309 }
2310 }
2311 btrfs_release_path(root, path);
2312
2313 /* find the first key from this transaction again */
2314 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2315 if (ret != 0) {
2316 WARN_ON(1);
2317 goto done;
2318 }
2319
2320 /*
2321 * we have a block from this transaction, log every item in it
2322 * from our directory
2323 */
2324 while(1) {
2325 struct btrfs_key tmp;
2326 src = path->nodes[0];
2327 nritems = btrfs_header_nritems(src);
2328 for (i = path->slots[0]; i < nritems; i++) {
2329 btrfs_item_key_to_cpu(src, &min_key, i);
2330
2331 if (min_key.objectid != inode->i_ino ||
2332 min_key.type != key_type)
2333 goto done;
2334 ret = overwrite_item(trans, log, dst_path, src, i,
2335 &min_key);
2336 BUG_ON(ret);
2337 }
2338 path->slots[0] = nritems;
2339
2340 /*
2341 * look ahead to the next item and see if it is also
2342 * from this directory and from this transaction
2343 */
2344 ret = btrfs_next_leaf(root, path);
2345 if (ret == 1) {
2346 last_offset = (u64)-1;
2347 goto done;
2348 }
2349 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2350 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2351 last_offset = (u64)-1;
2352 goto done;
2353 }
2354 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2355 ret = overwrite_item(trans, log, dst_path,
2356 path->nodes[0], path->slots[0],
2357 &tmp);
2358
2359 BUG_ON(ret);
2360 last_offset = tmp.offset;
2361 goto done;
2362 }
2363 }
2364done:
2365 *last_offset_ret = last_offset;
2366 btrfs_release_path(root, path);
2367 btrfs_release_path(log, dst_path);
2368
2369 /* insert the log range keys to indicate where the log is valid */
2370 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2371 first_offset, last_offset);
2372 BUG_ON(ret);
2373 return 0;
2374}
2375
2376/*
2377 * logging directories is very similar to logging inodes. We find all the items
2378 * from the current transaction and write them to the log.
2379 *
2380 * The recovery code scans the directory in the subvolume, and if it finds a
2381 * key in the range logged that is not present in the log tree, then it means
2382 * that dir entry was unlinked during the transaction.
2383 *
2384 * In order for that scan to work, we must include one key smaller than
2385 * the smallest key logged by this transaction and one key larger than the largest
2386 * key logged by this transaction.
2387 */
2388static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2389 struct btrfs_root *root, struct inode *inode,
2390 struct btrfs_path *path,
2391 struct btrfs_path *dst_path)
2392{
2393 u64 min_key;
2394 u64 max_key;
2395 int ret;
2396 int key_type = BTRFS_DIR_ITEM_KEY;
2397
2398again:
2399 min_key = 0;
2400 max_key = 0;
2401 while(1) {
2402 ret = log_dir_items(trans, root, inode, path,
2403 dst_path, key_type, min_key,
2404 &max_key);
2405 BUG_ON(ret);
2406 if (max_key == (u64)-1)
2407 break;
2408 min_key = max_key + 1;
2409 }
2410
2411 if (key_type == BTRFS_DIR_ITEM_KEY) {
2412 key_type = BTRFS_DIR_INDEX_KEY;
2413 goto again;
2414 }
2415 return 0;
2416}
2417
2418/*
2419 * a helper function to drop items from the log before we relog an
2420 * inode. max_key_type indicates the highest item type to remove.
2421 * This cannot be run for file data extents because it does not
2422 * free the extents they point to.
2423 */
2424static int drop_objectid_items(struct btrfs_trans_handle *trans,
2425 struct btrfs_root *log,
2426 struct btrfs_path *path,
2427 u64 objectid, int max_key_type)
2428{
2429 int ret;
2430 struct btrfs_key key;
2431 struct btrfs_key found_key;
2432
2433 key.objectid = objectid;
2434 key.type = max_key_type;
2435 key.offset = (u64)-1;
2436
2437 while(1) {
2438 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2439
2440 if (ret != 1)
2441 break;
2442
2443 if (path->slots[0] == 0)
2444 break;
2445
2446 path->slots[0]--;
2447 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2448 path->slots[0]);
2449
2450 if (found_key.objectid != objectid)
2451 break;
2452
2453 ret = btrfs_del_item(trans, log, path);
2454 BUG_ON(ret);
2455 btrfs_release_path(log, path);
2456 }
2457 btrfs_release_path(log, path);
2458 return 0;
2459}
2460
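/*
 * copy 'nr' items starting at 'start_slot' from the source leaf into
 * the log tree.  Inode items logged with LOG_INODE_EXISTS get their
 * size and generation zeroed, and every file data extent gets an extra
 * reference so the log copy stays valid even if the file is truncated
 * or deleted before the log goes away.
 */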
2461static noinline int copy_items(struct btrfs_trans_handle *trans,
2462 struct btrfs_root *log,
2463 struct btrfs_path *dst_path,
2464 struct extent_buffer *src,
2465 int start_slot, int nr, int inode_only)
2466{
2467 unsigned long src_offset;
2468 unsigned long dst_offset;
2469 struct btrfs_file_extent_item *extent;
2470 struct btrfs_inode_item *inode_item;
2471 int ret;
2472 struct btrfs_key *ins_keys;
2473 u32 *ins_sizes;
2474 char *ins_data;
2475 int i;
2476
2477	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2478			   nr * sizeof(u32), GFP_NOFS);
	if (!ins_data)
		return -ENOMEM;
2479	ins_sizes = (u32 *)ins_data;
2480 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2481
2482 for (i = 0; i < nr; i++) {
2483 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2484 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2485 }
2486 ret = btrfs_insert_empty_items(trans, log, dst_path,
2487 ins_keys, ins_sizes, nr);
2488 BUG_ON(ret);
2489
2490 for (i = 0; i < nr; i++) {
2491 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2492 dst_path->slots[0]);
2493
2494 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2495
2496 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2497 src_offset, ins_sizes[i]);
2498
2499 if (inode_only == LOG_INODE_EXISTS &&
2500 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2501 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2502 dst_path->slots[0],
2503 struct btrfs_inode_item);
2504 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2505
2506		/* set the generation to zero so the recovery code
2507		 * can tell the difference between logging
2508		 * just to say 'this inode exists' and logging
2509		 * to say 'update this inode with these values'
2510 */
2511 btrfs_set_inode_generation(dst_path->nodes[0],
2512 inode_item, 0);
2513 }
2514 /* take a reference on file data extents so that truncates
2515 * or deletes of this inode don't have to relog the inode
2516 * again
2517 */
2518 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2519 int found_type;
2520 extent = btrfs_item_ptr(src, start_slot + i,
2521 struct btrfs_file_extent_item);
2522
2523 found_type = btrfs_file_extent_type(src, extent);
2524 if (found_type == BTRFS_FILE_EXTENT_REG) {
2525 u64 ds = btrfs_file_extent_disk_bytenr(src,
2526 extent);
2527 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2528 extent);
2529 /* ds == 0 is a hole */
2530 if (ds != 0) {
2531 ret = btrfs_inc_extent_ref(trans, log,
2532 ds, dl,
2533 dst_path->nodes[0]->start,
2534 BTRFS_TREE_LOG_OBJECTID,
2535 trans->transid,
2536 ins_keys[i].objectid);
2537 BUG_ON(ret);
2538 }
2539 }
2540 }
2541 dst_path->slots[0]++;
2542 }
2543
2544 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2545 btrfs_release_path(log, dst_path);
2546 kfree(ins_data);
2547 return 0;
2548}
2549
2550/* log a single inode in the tree log.
2551 * At least one parent directory for this inode must exist in the tree
2552 * or be logged already.
2553 *
2554 * Any items from this inode changed by the current transaction are copied
2555 * to the log tree. An extra reference is taken on any extents in this
2556 * file, allowing us to avoid a whole pile of corner cases around logging
2557 * blocks that have been removed from the tree.
2558 *
2559 * See LOG_INODE_ALL and related defines for a description of what inode_only
2560 * does.
2561 *
2562 * This handles both files and directories.
2563 */
2564static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2565 struct btrfs_root *root, struct inode *inode,
2566 int inode_only)
2567{
2568 struct btrfs_path *path;
2569 struct btrfs_path *dst_path;
2570 struct btrfs_key min_key;
2571 struct btrfs_key max_key;
2572 struct btrfs_root *log = root->log_root;
2573 struct extent_buffer *src = NULL;
2574 u32 size;
2575 int ret;
2576 int nritems;
2577 int ins_start_slot = 0;
2578 int ins_nr;
2579
2580 log = root->log_root;
2581
2582 path = btrfs_alloc_path();
2583 dst_path = btrfs_alloc_path();
2584
2585 min_key.objectid = inode->i_ino;
2586 min_key.type = BTRFS_INODE_ITEM_KEY;
2587 min_key.offset = 0;
2588
2589 max_key.objectid = inode->i_ino;
2590 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2591 max_key.type = BTRFS_XATTR_ITEM_KEY;
2592 else
2593 max_key.type = (u8)-1;
2594 max_key.offset = (u64)-1;
2595
2596 /*
2597 * if this inode has already been logged and we're in inode_only
2598 * mode, we don't want to delete the things that have already
2599 * been written to the log.
2600 *
2601 * But, if the inode has been through an inode_only log,
2602 * the logged_trans field is not set. This allows us to catch
2603 * any new names for this inode in the backrefs by logging it
2604 * again
2605 */
2606 if (inode_only == LOG_INODE_EXISTS &&
2607 BTRFS_I(inode)->logged_trans == trans->transid) {
2608 btrfs_free_path(path);
2609 btrfs_free_path(dst_path);
2610 goto out;
2611 }
2612 mutex_lock(&BTRFS_I(inode)->log_mutex);
2613
2614 /*
2615 * a brute force approach to making sure we get the most uptodate
2616 * copies of everything.
2617 */
2618 if (S_ISDIR(inode->i_mode)) {
2619 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2620
2621 if (inode_only == LOG_INODE_EXISTS)
2622 max_key_type = BTRFS_XATTR_ITEM_KEY;
2623 ret = drop_objectid_items(trans, log, path,
2624 inode->i_ino, max_key_type);
2625 } else {
2626 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2627 }
2628 BUG_ON(ret);
2629 path->keep_locks = 1;
2630
2631 while(1) {
2632 ins_nr = 0;
2633 ret = btrfs_search_forward(root, &min_key, &max_key,
2634 path, 0, trans->transid);
2635 if (ret != 0)
2636 break;
2637again:
2638 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2639 if (min_key.objectid != inode->i_ino)
2640 break;
2641 if (min_key.type > max_key.type)
2642 break;
2643
2644 src = path->nodes[0];
2645 size = btrfs_item_size_nr(src, path->slots[0]);
2646 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2647 ins_nr++;
2648 goto next_slot;
2649 } else if (!ins_nr) {
2650 ins_start_slot = path->slots[0];
2651 ins_nr = 1;
2652 goto next_slot;
2653 }
2654
2655 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2656 ins_nr, inode_only);
2657 BUG_ON(ret);
2658 ins_nr = 1;
2659 ins_start_slot = path->slots[0];
2660next_slot:
2661
2662 nritems = btrfs_header_nritems(path->nodes[0]);
2663 path->slots[0]++;
2664 if (path->slots[0] < nritems) {
2665 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2666 path->slots[0]);
2667 goto again;
2668 }
2669 if (ins_nr) {
2670 ret = copy_items(trans, log, dst_path, src,
2671 ins_start_slot,
2672 ins_nr, inode_only);
2673 BUG_ON(ret);
2674 ins_nr = 0;
2675 }
2676 btrfs_release_path(root, path);
2677
2678 if (min_key.offset < (u64)-1)
2679 min_key.offset++;
2680 else if (min_key.type < (u8)-1)
2681 min_key.type++;
2682 else if (min_key.objectid < (u64)-1)
2683 min_key.objectid++;
2684 else
2685 break;
2686 }
2687 if (ins_nr) {
2688 ret = copy_items(trans, log, dst_path, src,
2689 ins_start_slot,
2690 ins_nr, inode_only);
2691 BUG_ON(ret);
2692 ins_nr = 0;
2693 }
2694 WARN_ON(ins_nr);
2695 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2696 btrfs_release_path(root, path);
2697 btrfs_release_path(log, dst_path);
2698 BTRFS_I(inode)->log_dirty_trans = 0;
2699 ret = log_directory_changes(trans, root, inode, path, dst_path);
2700 BUG_ON(ret);
2701 }
2702 BTRFS_I(inode)->logged_trans = trans->transid;
2703 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2704
2705 btrfs_free_path(path);
2706 btrfs_free_path(dst_path);
2707
2708 mutex_lock(&root->fs_info->tree_log_mutex);
2709 ret = update_log_root(trans, log);
2710 BUG_ON(ret);
2711 mutex_unlock(&root->fs_info->tree_log_mutex);
2712out:
2713 return 0;
2714}
2715
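/*
 * convenience wrapper that runs __btrfs_log_inode inside a log
 * transaction
 */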
2716int btrfs_log_inode(struct btrfs_trans_handle *trans,
2717 struct btrfs_root *root, struct inode *inode,
2718 int inode_only)
2719{
2720 int ret;
2721
2722 start_log_trans(trans, root);
2723 ret = __btrfs_log_inode(trans, root, inode, inode_only);
2724 end_log_trans(root);
2725 return ret;
2726}
2727
2728/*
2729 * helper function around btrfs_log_inode to make sure newly created
2730 * parent directories also end up in the log. A minimal inode-and-backref
2731 * only log is written for any parent directory that is newer than
2732 * the last committed transaction
2733 */
2734int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2735 struct btrfs_root *root, struct dentry *dentry)
2736{
2737 int inode_only = LOG_INODE_ALL;
2738 struct super_block *sb;
2739 int ret;
2740
2741 start_log_trans(trans, root);
2742 sb = dentry->d_inode->i_sb;
2743 while(1) {
2744 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2745 inode_only);
2746 BUG_ON(ret);
2747 inode_only = LOG_INODE_EXISTS;
2748
2749 dentry = dentry->d_parent;
2750 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2751 break;
2752
2753 if (BTRFS_I(dentry->d_inode)->generation <=
2754 root->fs_info->last_trans_committed)
2755 break;
2756 }
2757 end_log_trans(root);
2758 return 0;
2759}
2760
2761/*
2762 * it is not safe to log a dentry if the chunk root has added new
2763 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2764 * If this returns 1, you must commit the transaction to safely get your
2765 * data on disk.
2766 */
2767int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2768 struct btrfs_root *root, struct dentry *dentry)
2769{
2770 u64 gen;
2771 gen = root->fs_info->last_trans_new_blockgroup;
2772 if (gen > root->fs_info->last_trans_committed)
2773 return 1;
2774 else
2775 return btrfs_log_dentry(trans, root, dentry);
2776}
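
/* a minimal sketch (editorial, not part of this patch) of how an fsync
 * path is expected to use the helper above.  A return of 1 means new
 * chunks were added, so only a full transaction commit makes the data
 * safe; the transaction end/cleanup calls are elided here.
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret > 0)
 *		btrfs_commit_transaction(trans, root);
 *	else
 *		btrfs_sync_log(trans, root);
 */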
2777
2778/*
2779 * should be called during mount to replay any log trees
2780 * from the FS
2781 */
2782int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2783{
2784 int ret;
2785 struct btrfs_path *path;
2786 struct btrfs_trans_handle *trans;
2787 struct btrfs_key key;
2788 struct btrfs_key found_key;
2789 struct btrfs_key tmp_key;
2790 struct btrfs_root *log;
2791 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2792 u64 highest_inode;
2793 struct walk_control wc = {
2794 .process_func = process_one_buffer,
2795 .stage = 0,
2796 };
2797
2798 fs_info->log_root_recovering = 1;
2799 path = btrfs_alloc_path();
2800 BUG_ON(!path);
2801
2802 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2803
2804 wc.trans = trans;
2805 wc.pin = 1;
2806
2807 walk_log_tree(trans, log_root_tree, &wc);
2808
2809again:
2810 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2811 key.offset = (u64)-1;
2812 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2813
2814 while(1) {
2815 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2816 if (ret < 0)
2817 break;
2818 if (ret > 0) {
2819 if (path->slots[0] == 0)
2820 break;
2821 path->slots[0]--;
2822 }
2823 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2824 path->slots[0]);
2825 btrfs_release_path(log_root_tree, path);
2826 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2827 break;
2828
2829 log = btrfs_read_fs_root_no_radix(log_root_tree,
2830 &found_key);
2831 BUG_ON(!log);
2832
2834 tmp_key.objectid = found_key.offset;
2835 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2836 tmp_key.offset = (u64)-1;
2837
2838 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2839
2840 BUG_ON(!wc.replay_dest);
2841
2842 btrfs_record_root_in_trans(wc.replay_dest);
2843 ret = walk_log_tree(trans, log, &wc);
2844 BUG_ON(ret);
2845
2846 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2847 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2848 path);
2849 BUG_ON(ret);
2850 }
2851 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2852 if (ret == 0) {
2853 wc.replay_dest->highest_inode = highest_inode;
2854 wc.replay_dest->last_inode_alloc = highest_inode;
2855 }
2856
2857 key.offset = found_key.offset - 1;
2858 free_extent_buffer(log->node);
2859 kfree(log);
2860
2861 if (found_key.offset == 0)
2862 break;
2863 }
2864 btrfs_release_path(log_root_tree, path);
2865
2866 /* step one is to pin it all, step two is to replay just inodes */
2867 if (wc.pin) {
2868 wc.pin = 0;
2869 wc.process_func = replay_one_buffer;
2870 wc.stage = LOG_WALK_REPLAY_INODES;
2871 goto again;
2872 }
2873 /* step three is to replay everything */
2874 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2875 wc.stage++;
2876 goto again;
2877 }
2878
2879 btrfs_free_path(path);
2880
2881 free_extent_buffer(log_root_tree->node);
2882 log_root_tree->log_root = NULL;
2883 fs_info->log_root_recovering = 0;
2884
2885	/* step four: commit the transaction, which also unpins the blocks */
2886 btrfs_commit_transaction(trans, fs_info->tree_root);
2887
2888 kfree(log_root_tree);
2889 return 0;
2890}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..0f57f24404d9
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11# only consult hg when both the repo and the tool are present
12if [ -d .hg ] && which hg > /dev/null; then
13 last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
14
15 # now check if the repo has commits since then...
16 if [[ $(hg id -t) == $last || \
17 $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
18 then
19 # check if it's dirty
20 if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
21 v=$last+
22 else
23 v=$last
24 fi
25 else
26 # includes dirty flag
27 v=$last+$(hg id -i)
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..2eed7f91f51a
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,2549 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <asm/div64.h>
24#include "ctree.h"
25#include "extent_map.h"
26#include "disk-io.h"
27#include "transaction.h"
28#include "print-tree.h"
29#include "volumes.h"
30#include "async-thread.h"
31
32struct map_lookup {
33 u64 type;
34 int io_align;
35 int io_width;
36 int stripe_len;
37 int sector_size;
38 int num_stripes;
39 int sub_stripes;
40 struct btrfs_bio_stripe stripes[];
41};
42
43#define map_lookup_size(n) (sizeof(struct map_lookup) + \
44 (sizeof(struct btrfs_bio_stripe) * (n)))
45
46static DEFINE_MUTEX(uuid_mutex);
47static LIST_HEAD(fs_uuids);
48
49void btrfs_lock_volumes(void)
50{
51 mutex_lock(&uuid_mutex);
52}
53
54void btrfs_unlock_volumes(void)
55{
56 mutex_unlock(&uuid_mutex);
57}
58
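/* helpers for the chunk mutexes: alloc_mutex is always taken before
 * chunk_mutex, and the two are dropped in the reverse order
 */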
59static void lock_chunks(struct btrfs_root *root)
60{
61 mutex_lock(&root->fs_info->alloc_mutex);
62 mutex_lock(&root->fs_info->chunk_mutex);
63}
64
65static void unlock_chunks(struct btrfs_root *root)
66{
67 mutex_unlock(&root->fs_info->chunk_mutex);
68 mutex_unlock(&root->fs_info->alloc_mutex);
69}
70
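/*
 * walk the global list of scanned filesystems and free every device we
 * know about, closing any block devices that are still open
 */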
71int btrfs_cleanup_fs_uuids(void)
72{
73 struct btrfs_fs_devices *fs_devices;
74 struct list_head *uuid_cur;
75 struct list_head *devices_cur;
76 struct btrfs_device *dev;
77
78 list_for_each(uuid_cur, &fs_uuids) {
79 fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
80 list);
81 while(!list_empty(&fs_devices->devices)) {
82 devices_cur = fs_devices->devices.next;
83 dev = list_entry(devices_cur, struct btrfs_device,
84 dev_list);
85 if (dev->bdev) {
86 close_bdev_excl(dev->bdev);
87 fs_devices->open_devices--;
88 }
89 list_del(&dev->dev_list);
90 kfree(dev->name);
91 kfree(dev);
92 }
93 }
94 return 0;
95}
96
97static noinline struct btrfs_device *__find_device(struct list_head *head,
98 u64 devid, u8 *uuid)
99{
100 struct btrfs_device *dev;
101 struct list_head *cur;
102
103 list_for_each(cur, head) {
104 dev = list_entry(cur, struct btrfs_device, dev_list);
105 if (dev->devid == devid &&
106 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
107 return dev;
108 }
109 }
110 return NULL;
111}
112
113static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
114{
115 struct list_head *cur;
116 struct btrfs_fs_devices *fs_devices;
117
118 list_for_each(cur, &fs_uuids) {
119 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
120 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
121 return fs_devices;
122 }
123 return NULL;
124}
125
126/*
127 * we try to collect pending bios for a device so we don't get a large
128 * number of procs sending bios down to the same device. This greatly
129 * improves the scheduler's ability to collect and merge the bios.
130 *
131 * But, it also turns into a long list of bios to process and that is sure
132 * to eventually make the worker thread block. The solution here is to
133 * make some progress and then put this work struct back at the end of
134 * the list if the block device is congested. This way, multiple devices
135 * can make progress from a single worker thread.
136 */
137static int noinline run_scheduled_bios(struct btrfs_device *device)
138{
139 struct bio *pending;
140 struct backing_dev_info *bdi;
141 struct btrfs_fs_info *fs_info;
142 struct bio *tail;
143 struct bio *cur;
144 int again = 0;
145 unsigned long num_run = 0;
146 unsigned long limit;
147
148 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
149 fs_info = device->dev_root->fs_info;
150 limit = btrfs_async_submit_limit(fs_info);
151 limit = limit * 2 / 3;
152
153loop:
154 spin_lock(&device->io_lock);
155
156 /* take all the bios off the list at once and process them
157 * later on (without the lock held). But, remember the
158 * tail and other pointers so the bios can be properly reinserted
159 * into the list if we hit congestion
160 */
161 pending = device->pending_bios;
162 tail = device->pending_bio_tail;
163 WARN_ON(pending && !tail);
164 device->pending_bios = NULL;
165 device->pending_bio_tail = NULL;
166
167 /*
168 * if pending was null this time around, no bios need processing
169 * at all and we can stop. Otherwise it'll loop back up again
170 * and do an additional check so no bios are missed.
171 *
172 * device->running_pending is used to synchronize with the
173 * schedule_bio code.
174 */
175 if (pending) {
176 again = 1;
177 device->running_pending = 1;
178 } else {
179 again = 0;
180 device->running_pending = 0;
181 }
182 spin_unlock(&device->io_lock);
183
184 while(pending) {
185 cur = pending;
186 pending = pending->bi_next;
187 cur->bi_next = NULL;
188 atomic_dec(&fs_info->nr_async_bios);
189
190 if (atomic_read(&fs_info->nr_async_bios) < limit &&
191 waitqueue_active(&fs_info->async_submit_wait))
192 wake_up(&fs_info->async_submit_wait);
193
194 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
195 bio_get(cur);
196 submit_bio(cur->bi_rw, cur);
197 bio_put(cur);
198 num_run++;
199
200 /*
201 * we made progress, there is more work to do and the bdi
202 * is now congested. Back off and let other work structs
203 * run instead
204 */
205 if (pending && bdi_write_congested(bdi)) {
206 struct bio *old_head;
207
208 spin_lock(&device->io_lock);
209
210 old_head = device->pending_bios;
211 device->pending_bios = pending;
212 if (device->pending_bio_tail)
213 tail->bi_next = old_head;
214 else
215 device->pending_bio_tail = tail;
216
217 spin_unlock(&device->io_lock);
218 btrfs_requeue_work(&device->work);
219 goto done;
220 }
221 }
222 if (again)
223 goto loop;
224done:
225 return 0;
226}
227
228void pending_bios_fn(struct btrfs_work *work)
229{
230 struct btrfs_device *device;
231
232 device = container_of(work, struct btrfs_device, work);
233 run_scheduled_bios(device);
234}
235
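/*
 * record a freshly scanned device in the in-memory list for its
 * filesystem, allocating a new btrfs_fs_devices entry (keyed by fsid)
 * when this is the first device seen for that FS.  The device with the
 * highest generation supplies latest_devid and latest_trans.
 */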
236static noinline int device_list_add(const char *path,
237 struct btrfs_super_block *disk_super,
238 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
239{
240 struct btrfs_device *device;
241 struct btrfs_fs_devices *fs_devices;
242 u64 found_transid = btrfs_super_generation(disk_super);
243
244 fs_devices = find_fsid(disk_super->fsid);
245 if (!fs_devices) {
246 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
247 if (!fs_devices)
248 return -ENOMEM;
249 INIT_LIST_HEAD(&fs_devices->devices);
250 INIT_LIST_HEAD(&fs_devices->alloc_list);
251 list_add(&fs_devices->list, &fs_uuids);
252 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
253 fs_devices->latest_devid = devid;
254 fs_devices->latest_trans = found_transid;
255 device = NULL;
256 } else {
257 device = __find_device(&fs_devices->devices, devid,
258 disk_super->dev_item.uuid);
259 }
260 if (!device) {
261 device = kzalloc(sizeof(*device), GFP_NOFS);
262 if (!device) {
263 /* we can safely leave the fs_devices entry around */
264 return -ENOMEM;
265 }
266 device->devid = devid;
267 device->work.func = pending_bios_fn;
268 memcpy(device->uuid, disk_super->dev_item.uuid,
269 BTRFS_UUID_SIZE);
270 device->barriers = 1;
271 spin_lock_init(&device->io_lock);
272 device->name = kstrdup(path, GFP_NOFS);
273 if (!device->name) {
274 kfree(device);
275 return -ENOMEM;
276 }
277 list_add(&device->dev_list, &fs_devices->devices);
278 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
279 fs_devices->num_devices++;
280 }
281
282 if (found_transid > fs_devices->latest_trans) {
283 fs_devices->latest_devid = devid;
284 fs_devices->latest_trans = found_transid;
285 }
286 *fs_devices_ret = fs_devices;
287 return 0;
288}
289
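/*
 * drop any scanned devices that the FS metadata never referenced:
 * close their block devices and free the structures
 */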
290int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
291{
292 struct list_head *head = &fs_devices->devices;
293 struct list_head *cur;
294 struct btrfs_device *device;
295
296 mutex_lock(&uuid_mutex);
297again:
298 list_for_each(cur, head) {
299 device = list_entry(cur, struct btrfs_device, dev_list);
300 if (!device->in_fs_metadata) {
301 struct block_device *bdev;
302 list_del(&device->dev_list);
303 list_del(&device->dev_alloc_list);
304 fs_devices->num_devices--;
305 if (device->bdev) {
306 bdev = device->bdev;
307 fs_devices->open_devices--;
308 mutex_unlock(&uuid_mutex);
309 close_bdev_excl(bdev);
310 mutex_lock(&uuid_mutex);
311 }
312 kfree(device->name);
313 kfree(device);
314 goto again;
315 }
316 }
317 mutex_unlock(&uuid_mutex);
318 return 0;
319}
320
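/*
 * close the block device behind every device in this filesystem and
 * mark the whole set as unmounted
 */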
321int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
322{
323 struct list_head *head = &fs_devices->devices;
324 struct list_head *cur;
325 struct btrfs_device *device;
326
327 mutex_lock(&uuid_mutex);
328 list_for_each(cur, head) {
329 device = list_entry(cur, struct btrfs_device, dev_list);
330 if (device->bdev) {
331 close_bdev_excl(device->bdev);
332 fs_devices->open_devices--;
333 }
334 device->bdev = NULL;
335 device->in_fs_metadata = 0;
336 }
337 fs_devices->mounted = 0;
338 mutex_unlock(&uuid_mutex);
339 return 0;
340}
341
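/*
 * open every scanned device in the set, verify that the super block on
 * disk still matches the expected devid, and remember the device with
 * the highest generation as latest_bdev.  -EIO comes back only when no
 * device could be opened at all.
 */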
342int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
343 int flags, void *holder)
344{
345 struct block_device *bdev;
346 struct list_head *head = &fs_devices->devices;
347 struct list_head *cur;
348 struct btrfs_device *device;
349 struct block_device *latest_bdev = NULL;
350 struct buffer_head *bh;
351 struct btrfs_super_block *disk_super;
352 u64 latest_devid = 0;
353 u64 latest_transid = 0;
354 u64 transid;
355 u64 devid;
356 int ret = 0;
357
358 mutex_lock(&uuid_mutex);
359 if (fs_devices->mounted)
360 goto out;
361
362 list_for_each(cur, head) {
363 device = list_entry(cur, struct btrfs_device, dev_list);
364 if (device->bdev)
365 continue;
366
367 if (!device->name)
368 continue;
369
370 bdev = open_bdev_excl(device->name, flags, holder);
371
372 if (IS_ERR(bdev)) {
373 printk("open %s failed\n", device->name);
374 goto error;
375 }
376 set_blocksize(bdev, 4096);
377
378 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
379 if (!bh)
380 goto error_close;
381
382 disk_super = (struct btrfs_super_block *)bh->b_data;
383 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
384 sizeof(disk_super->magic)))
385 goto error_brelse;
386
387 devid = le64_to_cpu(disk_super->dev_item.devid);
388 if (devid != device->devid)
389 goto error_brelse;
390
391 transid = btrfs_super_generation(disk_super);
392 if (!latest_transid || transid > latest_transid) {
393 latest_devid = devid;
394 latest_transid = transid;
395 latest_bdev = bdev;
396 }
397
398 device->bdev = bdev;
399 device->in_fs_metadata = 0;
400 fs_devices->open_devices++;
401 continue;
402
403error_brelse:
404 brelse(bh);
405error_close:
406 close_bdev_excl(bdev);
407error:
408 continue;
409 }
410 if (fs_devices->open_devices == 0) {
411 ret = -EIO;
412 goto out;
413 }
414 fs_devices->mounted = 1;
415 fs_devices->latest_bdev = latest_bdev;
416 fs_devices->latest_devid = latest_devid;
417 fs_devices->latest_trans = latest_transid;
418out:
419 mutex_unlock(&uuid_mutex);
420 return ret;
421}
422
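/*
 * read the super block from a single device path and, when the btrfs
 * magic matches, record the device via device_list_add
 */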
423int btrfs_scan_one_device(const char *path, int flags, void *holder,
424 struct btrfs_fs_devices **fs_devices_ret)
425{
426 struct btrfs_super_block *disk_super;
427 struct block_device *bdev;
428 struct buffer_head *bh;
429 int ret;
430 u64 devid;
431 u64 transid;
432
433 mutex_lock(&uuid_mutex);
434
435 bdev = open_bdev_excl(path, flags, holder);
436
437 if (IS_ERR(bdev)) {
438 ret = PTR_ERR(bdev);
439 goto error;
440 }
441
442 ret = set_blocksize(bdev, 4096);
443 if (ret)
444 goto error_close;
445 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
446 if (!bh) {
447 ret = -EIO;
448 goto error_close;
449 }
450 disk_super = (struct btrfs_super_block *)bh->b_data;
451 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
452 sizeof(disk_super->magic))) {
453 ret = -EINVAL;
454 goto error_brelse;
455 }
456 devid = le64_to_cpu(disk_super->dev_item.devid);
457 transid = btrfs_super_generation(disk_super);
458 if (disk_super->label[0])
459 printk("device label %s ", disk_super->label);
460 else {
461		/* FIXME, make a real uuid parser */
462 printk("device fsid %llx-%llx ",
463 *(unsigned long long *)disk_super->fsid,
464 *(unsigned long long *)(disk_super->fsid + 8));
465 }
466 printk("devid %Lu transid %Lu %s\n", devid, transid, path);
467 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
468
469error_brelse:
470 brelse(bh);
471error_close:
472 close_bdev_excl(bdev);
473error:
474 mutex_unlock(&uuid_mutex);
475 return ret;
476}
477
478/*
479 * this uses a pretty simple search; the expectation is that it is
480 * called very infrequently and that a given device has a small number
481 * of extents
482 */
483static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
484 struct btrfs_device *device,
485 struct btrfs_path *path,
486 u64 num_bytes, u64 *start)
487{
488 struct btrfs_key key;
489 struct btrfs_root *root = device->dev_root;
490 struct btrfs_dev_extent *dev_extent = NULL;
491 u64 hole_size = 0;
492 u64 last_byte = 0;
493 u64 search_start = 0;
494 u64 search_end = device->total_bytes;
495 int ret;
496 int slot = 0;
497 int start_found;
498 struct extent_buffer *l;
499
500 start_found = 0;
501 path->reada = 2;
502
503 /* FIXME use last free of some kind */
504
505 /* we don't want to overwrite the superblock on the drive,
506 * so we make sure to start at an offset of at least 1MB
507 */
508 search_start = max((u64)1024 * 1024, search_start);
509
510 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
511 search_start = max(root->fs_info->alloc_start, search_start);
512
513 key.objectid = device->devid;
514 key.offset = search_start;
515 key.type = BTRFS_DEV_EXTENT_KEY;
516 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
517 if (ret < 0)
518 goto error;
519 ret = btrfs_previous_item(root, path, 0, key.type);
520 if (ret < 0)
521 goto error;
522 l = path->nodes[0];
523 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
524 while (1) {
525 l = path->nodes[0];
526 slot = path->slots[0];
527 if (slot >= btrfs_header_nritems(l)) {
528 ret = btrfs_next_leaf(root, path);
529 if (ret == 0)
530 continue;
531 if (ret < 0)
532 goto error;
533no_more_items:
534 if (!start_found) {
535 if (search_start >= search_end) {
536 ret = -ENOSPC;
537 goto error;
538 }
539 *start = search_start;
540 start_found = 1;
541 goto check_pending;
542 }
543 *start = last_byte > search_start ?
544 last_byte : search_start;
545 if (search_end <= *start) {
546 ret = -ENOSPC;
547 goto error;
548 }
549 goto check_pending;
550 }
551 btrfs_item_key_to_cpu(l, &key, slot);
552
553 if (key.objectid < device->devid)
554 goto next;
555
556 if (key.objectid > device->devid)
557 goto no_more_items;
558
559 if (key.offset >= search_start && key.offset > last_byte &&
560 start_found) {
561 if (last_byte < search_start)
562 last_byte = search_start;
563 hole_size = key.offset - last_byte;
564 if (key.offset > last_byte &&
565 hole_size >= num_bytes) {
566 *start = last_byte;
567 goto check_pending;
568 }
569 }
570 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
571 goto next;
572 }
573
574 start_found = 1;
575 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
576 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
577next:
578 path->slots[0]++;
579 cond_resched();
580 }
581check_pending:
582 /* we have to make sure we didn't find an extent that has already
583 * been allocated by the map tree or the original allocation
584 */
585 btrfs_release_path(root, path);
586 BUG_ON(*start < search_start);
587
588 if (*start + num_bytes > search_end) {
589 ret = -ENOSPC;
590 goto error;
591 }
592 /* check for pending inserts here */
593 return 0;
594
595error:
596 btrfs_release_path(root, path);
597 return ret;
598}
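/*
 * Editor's note (illustrative; not part of the original source): the scan
 * above walks DEV_EXTENT items in key order, tracking the end of the
 * previous extent in last_byte and measuring each gap as
 * hole_size = key.offset - last_byte.  For example, with existing extents
 * [1MB, 5MB) and [9MB, 12MB) and num_bytes = 3MB, the scan sees a 4MB
 * hole and returns *start = 5MB.
 */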
599
600int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
601 struct btrfs_device *device,
602 u64 start)
603{
604 int ret;
605 struct btrfs_path *path;
606 struct btrfs_root *root = device->dev_root;
607 struct btrfs_key key;
608 struct btrfs_key found_key;
609 struct extent_buffer *leaf = NULL;
610 struct btrfs_dev_extent *extent = NULL;
611
612 path = btrfs_alloc_path();
613 if (!path)
614 return -ENOMEM;
615
616 key.objectid = device->devid;
617 key.offset = start;
618 key.type = BTRFS_DEV_EXTENT_KEY;
619
620 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
621 if (ret > 0) {
622 ret = btrfs_previous_item(root, path, key.objectid,
623 BTRFS_DEV_EXTENT_KEY);
624 BUG_ON(ret);
625 leaf = path->nodes[0];
626 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
627 extent = btrfs_item_ptr(leaf, path->slots[0],
628 struct btrfs_dev_extent);
629 BUG_ON(found_key.offset > start || found_key.offset +
630 btrfs_dev_extent_length(leaf, extent) < start);
631 ret = 0;
632 } else if (ret == 0) {
633 leaf = path->nodes[0];
634 extent = btrfs_item_ptr(leaf, path->slots[0],
635 struct btrfs_dev_extent);
636 }
637 BUG_ON(ret);
638
639 if (device->bytes_used > 0)
640 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
641 ret = btrfs_del_item(trans, root, path);
642 BUG_ON(ret);
643
644 btrfs_free_path(path);
645 return ret;
646}
647
648int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
649 struct btrfs_device *device,
650 u64 chunk_tree, u64 chunk_objectid,
651 u64 chunk_offset,
652 u64 num_bytes, u64 *start)
653{
654 int ret;
655 struct btrfs_path *path;
656 struct btrfs_root *root = device->dev_root;
657 struct btrfs_dev_extent *extent;
658 struct extent_buffer *leaf;
659 struct btrfs_key key;
660
661 WARN_ON(!device->in_fs_metadata);
662 path = btrfs_alloc_path();
663 if (!path)
664 return -ENOMEM;
665
666 ret = find_free_dev_extent(trans, device, path, num_bytes, start);
667 if (ret) {
668 goto err;
669 }
670
671 key.objectid = device->devid;
672 key.offset = *start;
673 key.type = BTRFS_DEV_EXTENT_KEY;
674 ret = btrfs_insert_empty_item(trans, root, path, &key,
675 sizeof(*extent));
676 BUG_ON(ret);
677
678 leaf = path->nodes[0];
679 extent = btrfs_item_ptr(leaf, path->slots[0],
680 struct btrfs_dev_extent);
681 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
682 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
683 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
684
685 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
686 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
687 BTRFS_UUID_SIZE);
688
689 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
690 btrfs_mark_buffer_dirty(leaf);
691err:
692 btrfs_free_path(path);
693 return ret;
694}
695
696static noinline int find_next_chunk(struct btrfs_root *root,
697 u64 objectid, u64 *offset)
698{
699 struct btrfs_path *path;
700 int ret;
701 struct btrfs_key key;
702 struct btrfs_chunk *chunk;
703 struct btrfs_key found_key;
704
705 path = btrfs_alloc_path();
706 BUG_ON(!path);
707
708 key.objectid = objectid;
709 key.offset = (u64)-1;
710 key.type = BTRFS_CHUNK_ITEM_KEY;
711
712 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
713 if (ret < 0)
714 goto error;
715
716 BUG_ON(ret == 0);
717
718 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
719 if (ret) {
720 *offset = 0;
721 } else {
722 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
723 path->slots[0]);
724 if (found_key.objectid != objectid)
725 *offset = 0;
726 else {
727 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
728 struct btrfs_chunk);
729 *offset = found_key.offset +
730 btrfs_chunk_length(path->nodes[0], chunk);
731 }
732 }
733 ret = 0;
734error:
735 btrfs_free_path(path);
736 return ret;
737}
738
739static noinline int find_next_devid(struct btrfs_root *root,
740 struct btrfs_path *path, u64 *objectid)
741{
742 int ret;
743 struct btrfs_key key;
744 struct btrfs_key found_key;
745
746 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
747 key.type = BTRFS_DEV_ITEM_KEY;
748 key.offset = (u64)-1;
749
750 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
751 if (ret < 0)
752 goto error;
753
754 BUG_ON(ret == 0);
755
756 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
757 BTRFS_DEV_ITEM_KEY);
758 if (ret) {
759 *objectid = 1;
760 } else {
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
762 path->slots[0]);
763 *objectid = found_key.offset + 1;
764 }
765 ret = 0;
766error:
767 btrfs_release_path(root, path);
768 return ret;
769}
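/*
 * Editor's note (illustrative; not part of the original source): devids
 * are allocated by finding the highest existing BTRFS_DEV_ITEM_KEY offset
 * and adding one; devids {1, 2, 4} yield 5, and an empty tree yields 1.
 */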
770
771/*
772 * the device information is stored in the chunk root
773 * the btrfs_device struct should be fully filled in
774 */
775int btrfs_add_device(struct btrfs_trans_handle *trans,
776 struct btrfs_root *root,
777 struct btrfs_device *device)
778{
779 int ret;
780 struct btrfs_path *path;
781 struct btrfs_dev_item *dev_item;
782 struct extent_buffer *leaf;
783 struct btrfs_key key;
784 unsigned long ptr;
785 u64 free_devid = 0;
786
787 root = root->fs_info->chunk_root;
788
789 path = btrfs_alloc_path();
790 if (!path)
791 return -ENOMEM;
792
793 ret = find_next_devid(root, path, &free_devid);
794 if (ret)
795 goto out;
796
797 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
798 key.type = BTRFS_DEV_ITEM_KEY;
799 key.offset = free_devid;
800
801 ret = btrfs_insert_empty_item(trans, root, path, &key,
802 sizeof(*dev_item));
803 if (ret)
804 goto out;
805
806 leaf = path->nodes[0];
807 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
808
809 device->devid = free_devid;
810 btrfs_set_device_id(leaf, dev_item, device->devid);
811 btrfs_set_device_type(leaf, dev_item, device->type);
812 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
813 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
814 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
815 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
816 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
817 btrfs_set_device_group(leaf, dev_item, 0);
818 btrfs_set_device_seek_speed(leaf, dev_item, 0);
819 btrfs_set_device_bandwidth(leaf, dev_item, 0);
820
821 ptr = (unsigned long)btrfs_device_uuid(dev_item);
822 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
823 btrfs_mark_buffer_dirty(leaf);
824 ret = 0;
825
826out:
827 btrfs_free_path(path);
828 return ret;
829}
830
831static int btrfs_rm_dev_item(struct btrfs_root *root,
832 struct btrfs_device *device)
833{
834 int ret;
835 struct btrfs_path *path;
836 struct block_device *bdev = device->bdev;
837 struct btrfs_device *next_dev;
838 struct btrfs_key key;
839 u64 total_bytes;
840 struct btrfs_fs_devices *fs_devices;
841 struct btrfs_trans_handle *trans;
842
843 root = root->fs_info->chunk_root;
844
845 path = btrfs_alloc_path();
846 if (!path)
847 return -ENOMEM;
848
849 trans = btrfs_start_transaction(root, 1);
850 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
851 key.type = BTRFS_DEV_ITEM_KEY;
852 key.offset = device->devid;
853 lock_chunks(root);
854
855 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
856 if (ret < 0)
857 goto out;
858
859 if (ret > 0) {
860 ret = -ENOENT;
861 goto out;
862 }
863
864 ret = btrfs_del_item(trans, root, path);
865 if (ret)
866 goto out;
867
868 /*
869 * at this point, the device is zero sized. We want to
870 * remove it from the devices list and zero out the old super
871 */
872 list_del_init(&device->dev_list);
873 list_del_init(&device->dev_alloc_list);
874 fs_devices = root->fs_info->fs_devices;
875
876 next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
877 dev_list);
878 if (bdev == root->fs_info->sb->s_bdev)
879 root->fs_info->sb->s_bdev = next_dev->bdev;
880 if (bdev == fs_devices->latest_bdev)
881 fs_devices->latest_bdev = next_dev->bdev;
882
883 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
884 btrfs_set_super_num_devices(&root->fs_info->super_copy,
885 total_bytes - 1);
886out:
887 btrfs_free_path(path);
888 unlock_chunks(root);
889 btrfs_commit_transaction(trans, root);
890 return ret;
891}
892
893int btrfs_rm_device(struct btrfs_root *root, char *device_path)
894{
895 struct btrfs_device *device;
896 struct block_device *bdev;
897 struct buffer_head *bh = NULL;
898 struct btrfs_super_block *disk_super;
899 u64 all_avail;
900 u64 devid;
901 int ret = 0;
902
903 mutex_lock(&uuid_mutex);
904 mutex_lock(&root->fs_info->volume_mutex);
905
906 all_avail = root->fs_info->avail_data_alloc_bits |
907 root->fs_info->avail_system_alloc_bits |
908 root->fs_info->avail_metadata_alloc_bits;
909
910 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
911 btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
912 printk("btrfs: unable to go below four devices on raid10\n");
913 ret = -EINVAL;
914 goto out;
915 }
916
917 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
918 btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
919 printk("btrfs: unable to go below two devices on raid1\n");
920 ret = -EINVAL;
921 goto out;
922 }
923
924 if (strcmp(device_path, "missing") == 0) {
925 struct list_head *cur;
926 struct list_head *devices;
927 struct btrfs_device *tmp;
928
929 device = NULL;
930 devices = &root->fs_info->fs_devices->devices;
931 list_for_each(cur, devices) {
932 tmp = list_entry(cur, struct btrfs_device, dev_list);
933 if (tmp->in_fs_metadata && !tmp->bdev) {
934 device = tmp;
935 break;
936 }
937 }
938 bdev = NULL;
939 bh = NULL;
940 disk_super = NULL;
941 if (!device) {
942 printk("btrfs: no missing devices found to remove\n");
943 goto out;
944 }
945
946 } else {
947 bdev = open_bdev_excl(device_path, 0,
948 root->fs_info->bdev_holder);
949 if (IS_ERR(bdev)) {
950 ret = PTR_ERR(bdev);
951 goto out;
952 }
953
954 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
955 if (!bh) {
956 ret = -EIO;
957 goto error_close;
958 }
959 disk_super = (struct btrfs_super_block *)bh->b_data;
960 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
961 sizeof(disk_super->magic))) {
962 ret = -ENOENT;
963 goto error_brelse;
964 }
965 if (memcmp(disk_super->fsid, root->fs_info->fsid,
966 BTRFS_FSID_SIZE)) {
967 ret = -ENOENT;
968 goto error_brelse;
969 }
970 devid = le64_to_cpu(disk_super->dev_item.devid);
971 device = btrfs_find_device(root, devid, NULL);
972 if (!device) {
973 ret = -ENOENT;
974 goto error_brelse;
975 }
976
977 }
978 root->fs_info->fs_devices->num_devices--;
979 root->fs_info->fs_devices->open_devices--;
980
981 ret = btrfs_shrink_device(device, 0);
982 if (ret)
983 goto error_brelse;
984
985
986 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
987 if (ret)
988 goto error_brelse;
989
990 if (bh) {
991 /* make sure this device isn't detected as part of
992 * the FS anymore
993 */
994 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
995 set_buffer_dirty(bh);
996 sync_dirty_buffer(bh);
997
998 brelse(bh);
999 }
1000
1001 if (device->bdev) {
1002 /* one close for the device struct or super_block */
1003 close_bdev_excl(device->bdev);
1004 }
1005 if (bdev) {
1006 /* one close for us */
1007 close_bdev_excl(bdev);
1008 }
1009 kfree(device->name);
1010 kfree(device);
1011 ret = 0;
1012 goto out;
1013
1014error_brelse:
1015 brelse(bh);
1016error_close:
1017 if (bdev)
1018 close_bdev_excl(bdev);
1019out:
1020 mutex_unlock(&root->fs_info->volume_mutex);
1021 mutex_unlock(&uuid_mutex);
1022 return ret;
1023}
1024
1025int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1026{
1027 struct btrfs_trans_handle *trans;
1028 struct btrfs_device *device;
1029 struct block_device *bdev;
1030 struct list_head *cur;
1031 struct list_head *devices;
1032 u64 total_bytes;
1033 int ret = 0;
1034
1035
1036 bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
1037 if (IS_ERR(bdev)) {
1038 return PTR_ERR(bdev);
1039 }
1040
1041 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1042 mutex_lock(&root->fs_info->volume_mutex);
1043
1044 trans = btrfs_start_transaction(root, 1);
1045 lock_chunks(root);
1046 devices = &root->fs_info->fs_devices->devices;
1047 list_for_each(cur, devices) {
1048 device = list_entry(cur, struct btrfs_device, dev_list);
1049 if (device->bdev == bdev) {
1050 ret = -EEXIST;
1051 goto out;
1052 }
1053 }
1054
1055 device = kzalloc(sizeof(*device), GFP_NOFS);
1056 if (!device) {
1057 /* we can safely leave the fs_devices entry around */
1058 ret = -ENOMEM;
1059 goto out_close_bdev;
1060 }
1061
1062 device->barriers = 1;
1063 device->work.func = pending_bios_fn;
1064 generate_random_uuid(device->uuid);
1065 spin_lock_init(&device->io_lock);
1066 device->name = kstrdup(device_path, GFP_NOFS);
1067 if (!device->name) {
1068 kfree(device);
1069 ret = -ENOMEM;
goto out_close_bdev;
1070 }
1071 device->io_width = root->sectorsize;
1072 device->io_align = root->sectorsize;
1073 device->sector_size = root->sectorsize;
1074 device->total_bytes = i_size_read(bdev->bd_inode);
1075 device->dev_root = root->fs_info->dev_root;
1076 device->bdev = bdev;
1077 device->in_fs_metadata = 1;
1078
1079 ret = btrfs_add_device(trans, root, device);
1080 if (ret)
1081 goto out_close_bdev;
1082
1083 set_blocksize(device->bdev, 4096);
1084
1085 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1086 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1087 total_bytes + device->total_bytes);
1088
1089 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1090 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1091 total_bytes + 1);
1092
1093 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1094 list_add(&device->dev_alloc_list,
1095 &root->fs_info->fs_devices->alloc_list);
1096 root->fs_info->fs_devices->num_devices++;
1097 root->fs_info->fs_devices->open_devices++;
1098out:
1099 unlock_chunks(root);
1100 btrfs_end_transaction(trans, root);
1101 mutex_unlock(&root->fs_info->volume_mutex);
1102
1103 return ret;
1104
1105out_close_bdev:
1106 close_bdev_excl(bdev);
1107 goto out;
1108}
1109
1110int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
1111 struct btrfs_device *device)
1112{
1113 int ret;
1114 struct btrfs_path *path;
1115 struct btrfs_root *root;
1116 struct btrfs_dev_item *dev_item;
1117 struct extent_buffer *leaf;
1118 struct btrfs_key key;
1119
1120 root = device->dev_root->fs_info->chunk_root;
1121
1122 path = btrfs_alloc_path();
1123 if (!path)
1124 return -ENOMEM;
1125
1126 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1127 key.type = BTRFS_DEV_ITEM_KEY;
1128 key.offset = device->devid;
1129
1130 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1131 if (ret < 0)
1132 goto out;
1133
1134 if (ret > 0) {
1135 ret = -ENOENT;
1136 goto out;
1137 }
1138
1139 leaf = path->nodes[0];
1140 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1141
1142 btrfs_set_device_id(leaf, dev_item, device->devid);
1143 btrfs_set_device_type(leaf, dev_item, device->type);
1144 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1145 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1146 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1147 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1148 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1149 btrfs_mark_buffer_dirty(leaf);
1150
1151out:
1152 btrfs_free_path(path);
1153 return ret;
1154}
1155
1156static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1157 struct btrfs_device *device, u64 new_size)
1158{
1159 struct btrfs_super_block *super_copy =
1160 &device->dev_root->fs_info->super_copy;
1161 u64 old_total = btrfs_super_total_bytes(super_copy);
1162 u64 diff = new_size - device->total_bytes;
1163
1164 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1165 return btrfs_update_device(trans, device);
1166}
1167
1168int btrfs_grow_device(struct btrfs_trans_handle *trans,
1169 struct btrfs_device *device, u64 new_size)
1170{
1171 int ret;
1172 lock_chunks(device->dev_root);
1173 ret = __btrfs_grow_device(trans, device, new_size);
1174 unlock_chunks(device->dev_root);
1175 return ret;
1176}
1177
1178static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1179 struct btrfs_root *root,
1180 u64 chunk_tree, u64 chunk_objectid,
1181 u64 chunk_offset)
1182{
1183 int ret;
1184 struct btrfs_path *path;
1185 struct btrfs_key key;
1186
1187 root = root->fs_info->chunk_root;
1188 path = btrfs_alloc_path();
1189 if (!path)
1190 return -ENOMEM;
1191
1192 key.objectid = chunk_objectid;
1193 key.offset = chunk_offset;
1194 key.type = BTRFS_CHUNK_ITEM_KEY;
1195
1196 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1197 BUG_ON(ret);
1198
1199 ret = btrfs_del_item(trans, root, path);
1200 BUG_ON(ret);
1201
1202 btrfs_free_path(path);
1203 return 0;
1204}
1205
1206 int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1207 u64 chunk_offset)
1208{
1209 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1210 struct btrfs_disk_key *disk_key;
1211 struct btrfs_chunk *chunk;
1212 u8 *ptr;
1213 int ret = 0;
1214 u32 num_stripes;
1215 u32 array_size;
1216 u32 len = 0;
1217 u32 cur;
1218 struct btrfs_key key;
1219
1220 array_size = btrfs_super_sys_array_size(super_copy);
1221
1222 ptr = super_copy->sys_chunk_array;
1223 cur = 0;
1224
1225 while (cur < array_size) {
1226 disk_key = (struct btrfs_disk_key *)ptr;
1227 btrfs_disk_key_to_cpu(&key, disk_key);
1228
1229 len = sizeof(*disk_key);
1230
1231 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1232 chunk = (struct btrfs_chunk *)(ptr + len);
1233 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1234 len += btrfs_chunk_item_size(num_stripes);
1235 } else {
1236 ret = -EIO;
1237 break;
1238 }
1239 if (key.objectid == chunk_objectid &&
1240 key.offset == chunk_offset) {
1241 memmove(ptr, ptr + len, array_size - (cur + len));
1242 array_size -= len;
1243 btrfs_set_super_sys_array_size(super_copy, array_size);
1244 } else {
1245 ptr += len;
1246 cur += len;
1247 }
1248 }
1249 return ret;
1250}
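/*
 * Editor's note (illustrative; not part of the original source): the
 * sys_chunk_array is a packed byte array of
 *   [btrfs_disk_key][btrfs_chunk + stripes][btrfs_disk_key][...]
 * entries, so removing one is a memmove() of everything after the match
 * down over it, shrinking array_size by sizeof(*disk_key) +
 * btrfs_chunk_item_size(num_stripes).
 */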
1251
1252
1253int btrfs_relocate_chunk(struct btrfs_root *root,
1254 u64 chunk_tree, u64 chunk_objectid,
1255 u64 chunk_offset)
1256{
1257 struct extent_map_tree *em_tree;
1258 struct btrfs_root *extent_root;
1259 struct btrfs_trans_handle *trans;
1260 struct extent_map *em;
1261 struct map_lookup *map;
1262 int ret;
1263 int i;
1264
1265 printk("btrfs relocating chunk %llu\n",
1266 (unsigned long long)chunk_offset);
1267 root = root->fs_info->chunk_root;
1268 extent_root = root->fs_info->extent_root;
1269 em_tree = &root->fs_info->mapping_tree.map_tree;
1270
1271 /* step one, relocate all the extents inside this chunk */
1272 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1273 BUG_ON(ret);
1274
1275 trans = btrfs_start_transaction(root, 1);
1276 BUG_ON(!trans);
1277
1278 lock_chunks(root);
1279
1280 /*
1281 * step two, delete the device extents and the
1282 * chunk tree entries
1283 */
1284 spin_lock(&em_tree->lock);
1285 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1286 spin_unlock(&em_tree->lock);
1287
1288 BUG_ON(em->start > chunk_offset ||
1289 em->start + em->len < chunk_offset);
1290 map = (struct map_lookup *)em->bdev;
1291
1292 for (i = 0; i < map->num_stripes; i++) {
1293 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1294 map->stripes[i].physical);
1295 BUG_ON(ret);
1296
1297 if (map->stripes[i].dev) {
1298 ret = btrfs_update_device(trans, map->stripes[i].dev);
1299 BUG_ON(ret);
1300 }
1301 }
1302 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1303 chunk_offset);
1304
1305 BUG_ON(ret);
1306
1307 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1308 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1309 BUG_ON(ret);
1310 }
1311
1312 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1313 BUG_ON(ret);
1314
1315 spin_lock(&em_tree->lock);
1316 remove_extent_mapping(em_tree, em);
1317 spin_unlock(&em_tree->lock);
1318
1319 kfree(map);
1320 em->bdev = NULL;
1321
1322 /* once for the tree */
1323 free_extent_map(em);
1324 /* once for us */
1325 free_extent_map(em);
1326
1327 unlock_chunks(root);
1328 btrfs_end_transaction(trans, root);
1329 return 0;
1330}
1331
1332static u64 div_factor(u64 num, int factor)
1333{
1334 if (factor == 10)
1335 return num;
1336 num *= factor;
1337 do_div(num, 10);
1338 return num;
1339}
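/*
 * Editor's note (illustrative; not part of the original source):
 * div_factor(num, factor) computes num * factor / 10, i.e. factor tenths
 * of num; div_factor(x, 1) is 10% of x and div_factor(x, 10) returns x
 * unchanged.
 */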
1340
1341
1342int btrfs_balance(struct btrfs_root *dev_root)
1343{
1344 int ret;
1345 struct list_head *cur;
1346 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1347 struct btrfs_device *device;
1348 u64 old_size;
1349 u64 size_to_free;
1350 struct btrfs_path *path;
1351 struct btrfs_key key;
1352 struct btrfs_chunk *chunk;
1353 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_key found_key;
1356
1357
1358 mutex_lock(&dev_root->fs_info->volume_mutex);
1359 dev_root = dev_root->fs_info->dev_root;
1360
1361 /* step one make some room on all the devices */
1362 list_for_each(cur, devices) {
1363 device = list_entry(cur, struct btrfs_device, dev_list);
1364 old_size = device->total_bytes;
1365 size_to_free = div_factor(old_size, 1);
1366 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1367 if (device->total_bytes - device->bytes_used > size_to_free)
1368 continue;
1369
1370 ret = btrfs_shrink_device(device, old_size - size_to_free);
1371 BUG_ON(ret);
1372
1373 trans = btrfs_start_transaction(dev_root, 1);
1374 BUG_ON(!trans);
1375
1376 ret = btrfs_grow_device(trans, device, old_size);
1377 BUG_ON(ret);
1378
1379 btrfs_end_transaction(trans, dev_root);
1380 }
1381
1382 /* step two, relocate all the chunks */
1383 path = btrfs_alloc_path();
1384 BUG_ON(!path);
1385
1386 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1387 key.offset = (u64)-1;
1388 key.type = BTRFS_CHUNK_ITEM_KEY;
1389
1390 while (1) {
1391 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1392 if (ret < 0)
1393 goto error;
1394
1395 /*
1396 * this shouldn't happen, it means the last relocate
1397 * failed
1398 */
1399 if (ret == 0)
1400 break;
1401
1402 ret = btrfs_previous_item(chunk_root, path, 0,
1403 BTRFS_CHUNK_ITEM_KEY);
1404 if (ret)
1405 break;
1406
1407 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1408 path->slots[0]);
1409 if (found_key.objectid != key.objectid)
1410 break;
1411
1412 chunk = btrfs_item_ptr(path->nodes[0],
1413 path->slots[0],
1414 struct btrfs_chunk);
1415 key.offset = found_key.offset;
1416 /* chunk zero is special */
1417 if (key.offset == 0)
1418 break;
1419
1420 btrfs_release_path(chunk_root, path);
1421 ret = btrfs_relocate_chunk(chunk_root,
1422 chunk_root->root_key.objectid,
1423 found_key.objectid,
1424 found_key.offset);
1425 BUG_ON(ret);
1426 }
1427 ret = 0;
1428error:
1429 btrfs_free_path(path);
1430 mutex_unlock(&dev_root->fs_info->volume_mutex);
1431 return ret;
1432}
1433
1434/*
1435 * shrinking a device means finding all of the device extents past
1436 * the new size, and then following the back refs to the chunks.
1437 * The chunk relocation code actually frees the device extent
1438 */
1439int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1440{
1441 struct btrfs_trans_handle *trans;
1442 struct btrfs_root *root = device->dev_root;
1443 struct btrfs_dev_extent *dev_extent = NULL;
1444 struct btrfs_path *path;
1445 u64 length;
1446 u64 chunk_tree;
1447 u64 chunk_objectid;
1448 u64 chunk_offset;
1449 int ret;
1450 int slot;
1451 struct extent_buffer *l;
1452 struct btrfs_key key;
1453 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1454 u64 old_total = btrfs_super_total_bytes(super_copy);
1455 u64 diff = device->total_bytes - new_size;
1456
1457
1458 path = btrfs_alloc_path();
1459 if (!path)
1460 return -ENOMEM;
1461
1462 trans = btrfs_start_transaction(root, 1);
1463 if (!trans) {
1464 ret = -ENOMEM;
1465 goto done;
1466 }
1467
1468 path->reada = 2;
1469
1470 lock_chunks(root);
1471
1472 device->total_bytes = new_size;
1473 ret = btrfs_update_device(trans, device);
1474 if (ret) {
1475 unlock_chunks(root);
1476 btrfs_end_transaction(trans, root);
1477 goto done;
1478 }
1479 WARN_ON(diff > old_total);
1480 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1481 unlock_chunks(root);
1482 btrfs_end_transaction(trans, root);
1483
1484 key.objectid = device->devid;
1485 key.offset = (u64)-1;
1486 key.type = BTRFS_DEV_EXTENT_KEY;
1487
1488 while (1) {
1489 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1490 if (ret < 0)
1491 goto done;
1492
1493 ret = btrfs_previous_item(root, path, 0, key.type);
1494 if (ret < 0)
1495 goto done;
1496 if (ret) {
1497 ret = 0;
1498 goto done;
1499 }
1500
1501 l = path->nodes[0];
1502 slot = path->slots[0];
1503 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1504
1505 if (key.objectid != device->devid)
1506 goto done;
1507
1508 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1509 length = btrfs_dev_extent_length(l, dev_extent);
1510
1511 if (key.offset + length <= new_size)
1512 goto done;
1513
1514 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1515 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1516 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1517 btrfs_release_path(root, path);
1518
1519 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1520 chunk_offset);
1521 if (ret)
1522 goto done;
1523 }
1524
1525done:
1526 btrfs_free_path(path);
1527 return ret;
1528}
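/*
 * Editor's note (illustrative; not part of the original source): shrinking
 * from 100GB to 60GB first commits the smaller total_bytes, then
 * repeatedly looks up the last DEV_EXTENT on the device; any extent
 * ending past 60GB is resolved back to its chunk and relocated (which
 * frees the device extent) until every remaining extent fits below the
 * new size.
 */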
1529
1530int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1531 struct btrfs_root *root,
1532 struct btrfs_key *key,
1533 struct btrfs_chunk *chunk, int item_size)
1534{
1535 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1536 struct btrfs_disk_key disk_key;
1537 u32 array_size;
1538 u8 *ptr;
1539
1540 array_size = btrfs_super_sys_array_size(super_copy);
1541 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1542 return -EFBIG;
1543
1544 ptr = super_copy->sys_chunk_array + array_size;
1545 btrfs_cpu_key_to_disk(&disk_key, key);
1546 memcpy(ptr, &disk_key, sizeof(disk_key));
1547 ptr += sizeof(disk_key);
1548 memcpy(ptr, chunk, item_size);
1549 item_size += sizeof(disk_key);
1550 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1551 return 0;
1552}
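/*
 * Editor's note (illustrative; not part of the original source): the key
 * and the chunk item are appended back to back, growing array_size by
 * sizeof(disk_key) + item_size.  On this era's disk format a key is 17
 * bytes, the chunk header 48 bytes and each stripe 32 bytes, so a
 * two-stripe chunk appends 17 + 48 + 2 * 32 = 129 bytes.
 */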
1553
1554static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
1555 int num_stripes, int sub_stripes)
1556{
1557 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1558 return calc_size;
1559 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1560 return calc_size * (num_stripes / sub_stripes);
1561 else
1562 return calc_size * num_stripes;
1563}
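/*
 * Editor's note (illustrative; not part of the original source): this
 * converts the per-device allocation (calc_size) into logical chunk
 * bytes.  With calc_size = 1GB: RAID1 and DUP expose 1GB (two copies),
 * RAID0 over 4 stripes exposes 4GB, and RAID10 with 4 stripes and 2
 * sub_stripes exposes 2GB.
 */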
1564
1565
1566int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1567 struct btrfs_root *extent_root, u64 *start,
1568 u64 *num_bytes, u64 type)
1569{
1570 u64 dev_offset;
1571 struct btrfs_fs_info *info = extent_root->fs_info;
1572 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
1573 struct btrfs_path *path;
1574 struct btrfs_stripe *stripes;
1575 struct btrfs_device *device = NULL;
1576 struct btrfs_chunk *chunk;
1577 struct list_head private_devs;
1578 struct list_head *dev_list;
1579 struct list_head *cur;
1580 struct extent_map_tree *em_tree;
1581 struct map_lookup *map;
1582 struct extent_map *em;
1583 int min_stripe_size = 1 * 1024 * 1024;
1584 u64 physical;
1585 u64 calc_size = 1024 * 1024 * 1024;
1586 u64 max_chunk_size = calc_size;
1587 u64 min_free;
1588 u64 avail;
1589 u64 max_avail = 0;
1590 u64 percent_max;
1591 int num_stripes = 1;
1592 int min_stripes = 1;
1593 int sub_stripes = 0;
1594 int looped = 0;
1595 int ret;
1596 int index;
1597 int stripe_len = 64 * 1024;
1598 struct btrfs_key key;
1599
1600 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1601 (type & BTRFS_BLOCK_GROUP_DUP)) {
1602 WARN_ON(1);
1603 type &= ~BTRFS_BLOCK_GROUP_DUP;
1604 }
1605 dev_list = &extent_root->fs_info->fs_devices->alloc_list;
1606 if (list_empty(dev_list))
1607 return -ENOSPC;
1608
1609 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1610 num_stripes = extent_root->fs_info->fs_devices->open_devices;
1611 min_stripes = 2;
1612 }
1613 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1614 num_stripes = 2;
1615 min_stripes = 2;
1616 }
1617 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1618 num_stripes = min_t(u64, 2,
1619 extent_root->fs_info->fs_devices->open_devices);
1620 if (num_stripes < 2)
1621 return -ENOSPC;
1622 min_stripes = 2;
1623 }
1624 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1625 num_stripes = extent_root->fs_info->fs_devices->open_devices;
1626 if (num_stripes < 4)
1627 return -ENOSPC;
1628 num_stripes &= ~(u32)1;
1629 sub_stripes = 2;
1630 min_stripes = 4;
1631 }
1632
1633 if (type & BTRFS_BLOCK_GROUP_DATA) {
1634 max_chunk_size = 10 * calc_size;
1635 min_stripe_size = 64 * 1024 * 1024;
1636 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1637 max_chunk_size = 4 * calc_size;
1638 min_stripe_size = 32 * 1024 * 1024;
1639 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1640 calc_size = 8 * 1024 * 1024;
1641 max_chunk_size = calc_size * 2;
1642 min_stripe_size = 1 * 1024 * 1024;
1643 }
1644
1645 path = btrfs_alloc_path();
1646 if (!path)
1647 return -ENOMEM;
1648
1649 /* we don't want a chunk larger than 10% of the FS */
1650 percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
1651 max_chunk_size = min(percent_max, max_chunk_size);
1652
1653again:
1654 if (calc_size * num_stripes > max_chunk_size) {
1655 calc_size = max_chunk_size;
1656 do_div(calc_size, num_stripes);
1657 do_div(calc_size, stripe_len);
1658 calc_size *= stripe_len;
1659 }
1660 /* we don't want tiny stripes */
1661 calc_size = max_t(u64, min_stripe_size, calc_size);
1662
1663 do_div(calc_size, stripe_len);
1664 calc_size *= stripe_len;
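/*
 * Editor's note (illustrative; not part of the original source): the
 * do_div() pair rounds calc_size down to a stripe_len multiple, e.g.
 * calc_size = 200000 with stripe_len = 65536 becomes 3 * 65536 = 196608.
 */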
1665
1666 INIT_LIST_HEAD(&private_devs);
1667 cur = dev_list->next;
1668 index = 0;
1669
1670 if (type & BTRFS_BLOCK_GROUP_DUP)
1671 min_free = calc_size * 2;
1672 else
1673 min_free = calc_size;
1674
1675 /*
1676 * we add 1MB because we never use the first 1MB of the device, unless
1677 * we've looped, then we are likely allocating the maximum amount of
1678 * space left already
1679 */
1680 if (!looped)
1681 min_free += 1024 * 1024;
1682
1683 /* build a private list of devices we will allocate from */
1684 while (index < num_stripes) {
1685 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
1686
1687 if (device->total_bytes > device->bytes_used)
1688 avail = device->total_bytes - device->bytes_used;
1689 else
1690 avail = 0;
1691 cur = cur->next;
1692
1693 if (device->in_fs_metadata && avail >= min_free) {
1694 u64 ignored_start = 0;
1695 ret = find_free_dev_extent(trans, device, path,
1696 min_free,
1697 &ignored_start);
1698 if (ret == 0) {
1699 list_move_tail(&device->dev_alloc_list,
1700 &private_devs);
1701 index++;
1702 if (type & BTRFS_BLOCK_GROUP_DUP)
1703 index++;
1704 }
1705 } else if (device->in_fs_metadata && avail > max_avail)
1706 max_avail = avail;
1707 if (cur == dev_list)
1708 break;
1709 }
1710 if (index < num_stripes) {
1711 list_splice(&private_devs, dev_list);
1712 if (index >= min_stripes) {
1713 num_stripes = index;
1714 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1715 num_stripes /= sub_stripes;
1716 num_stripes *= sub_stripes;
1717 }
1718 looped = 1;
1719 goto again;
1720 }
1721 if (!looped && max_avail > 0) {
1722 looped = 1;
1723 calc_size = max_avail;
1724 goto again;
1725 }
1726 btrfs_free_path(path);
1727 return -ENOSPC;
1728 }
1729 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1730 key.type = BTRFS_CHUNK_ITEM_KEY;
1731 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
1732 &key.offset);
1733 if (ret) {
1734 btrfs_free_path(path);
1735 return ret;
1736 }
1737
1738 chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
1739 if (!chunk) {
1740 btrfs_free_path(path);
1741 return -ENOMEM;
1742 }
1743
1744 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
1745 if (!map) {
1746 kfree(chunk);
1747 btrfs_free_path(path);
1748 return -ENOMEM;
1749 }
1750 btrfs_free_path(path);
1751 path = NULL;
1752
1753 stripes = &chunk->stripe;
1754 *num_bytes = chunk_bytes_by_type(type, calc_size,
1755 num_stripes, sub_stripes);
1756
1757 index = 0;
1758 while (index < num_stripes) {
1759 struct btrfs_stripe *stripe;
1760 BUG_ON(list_empty(&private_devs));
1761 cur = private_devs.next;
1762 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
1763
1764 /* loop over this device again if we're doing a dup group */
1765 if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
1766 (index == num_stripes - 1))
1767 list_move_tail(&device->dev_alloc_list, dev_list);
1768
1769 ret = btrfs_alloc_dev_extent(trans, device,
1770 info->chunk_root->root_key.objectid,
1771 BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
1772 calc_size, &dev_offset);
1773 BUG_ON(ret);
1774 device->bytes_used += calc_size;
1775 ret = btrfs_update_device(trans, device);
1776 BUG_ON(ret);
1777
1778 map->stripes[index].dev = device;
1779 map->stripes[index].physical = dev_offset;
1780 stripe = stripes + index;
1781 btrfs_set_stack_stripe_devid(stripe, device->devid);
1782 btrfs_set_stack_stripe_offset(stripe, dev_offset);
1783 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
1784 physical = dev_offset;
1785 index++;
1786 }
1787 BUG_ON(!list_empty(&private_devs));
1788
1789 /* key was set above */
1790 btrfs_set_stack_chunk_length(chunk, *num_bytes);
1791 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
1792 btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
1793 btrfs_set_stack_chunk_type(chunk, type);
1794 btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
1795 btrfs_set_stack_chunk_io_align(chunk, stripe_len);
1796 btrfs_set_stack_chunk_io_width(chunk, stripe_len);
1797 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
1798 btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
1799 map->sector_size = extent_root->sectorsize;
1800 map->stripe_len = stripe_len;
1801 map->io_align = stripe_len;
1802 map->io_width = stripe_len;
1803 map->type = type;
1804 map->num_stripes = num_stripes;
1805 map->sub_stripes = sub_stripes;
1806
1807 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1808 btrfs_chunk_item_size(num_stripes));
1809 BUG_ON(ret);
1810 *start = key.offset;
1811
1812 em = alloc_extent_map(GFP_NOFS);
1813 if (!em)
1814 return -ENOMEM;
1815 em->bdev = (struct block_device *)map;
1816 em->start = key.offset;
1817 em->len = *num_bytes;
1818 em->block_start = 0;
1819
1820 if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1821 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
1822 chunk, btrfs_chunk_item_size(num_stripes));
1823 BUG_ON(ret);
1824 }
1825 kfree(chunk);
1826
1827 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
1828 spin_lock(&em_tree->lock);
1829 ret = add_extent_mapping(em_tree, em);
1830 spin_unlock(&em_tree->lock);
1831 BUG_ON(ret);
1832 free_extent_map(em);
1833 return ret;
1834}
1835
1836void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
1837{
1838 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
1839}
1840
1841void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
1842{
1843 struct extent_map *em;
1844
1845 while (1) {
1846 spin_lock(&tree->map_tree.lock);
1847 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
1848 if (em)
1849 remove_extent_mapping(&tree->map_tree, em);
1850 spin_unlock(&tree->map_tree.lock);
1851 if (!em)
1852 break;
1853 kfree(em->bdev);
1854 /* once for us */
1855 free_extent_map(em);
1856 /* once for the tree */
1857 free_extent_map(em);
1858 }
1859}
1860
1861int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
1862{
1863 struct extent_map *em;
1864 struct map_lookup *map;
1865 struct extent_map_tree *em_tree = &map_tree->map_tree;
1866 int ret;
1867
1868 spin_lock(&em_tree->lock);
1869 em = lookup_extent_mapping(em_tree, logical, len);
1870 spin_unlock(&em_tree->lock);
1871 BUG_ON(!em);
1872
1873 BUG_ON(em->start > logical || em->start + em->len < logical);
1874 map = (struct map_lookup *)em->bdev;
1875 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
1876 ret = map->num_stripes;
1877 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
1878 ret = map->sub_stripes;
1879 else
1880 ret = 1;
1881 free_extent_map(em);
1882 return ret;
1883}
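/*
 * Editor's note (illustrative; not part of the original source): a RAID1
 * chunk over two devices reports 2 copies, DUP reports 2 copies on a
 * single device, RAID10 reports sub_stripes (2), and RAID0/single report
 * 1.
 */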
1884
1885static int find_live_mirror(struct map_lookup *map, int first, int num,
1886 int optimal)
1887{
1888 int i;
1889 if (map->stripes[optimal].dev->bdev)
1890 return optimal;
1891 for (i = first; i < first + num; i++) {
1892 if (map->stripes[i].dev->bdev)
1893 return i;
1894 }
1895 /* we couldn't find one that doesn't fail. Just return something
1896 * and the io error handling code will clean up eventually
1897 */
1898 return optimal;
1899}
1900
1901static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
1902 u64 logical, u64 *length,
1903 struct btrfs_multi_bio **multi_ret,
1904 int mirror_num, struct page *unplug_page)
1905{
1906 struct extent_map *em;
1907 struct map_lookup *map;
1908 struct extent_map_tree *em_tree = &map_tree->map_tree;
1909 u64 offset;
1910 u64 stripe_offset;
1911 u64 stripe_nr;
1912 int stripes_allocated = 8;
1913 int stripes_required = 1;
1914 int stripe_index;
1915 int i;
1916 int num_stripes;
1917 int max_errors = 0;
1918 struct btrfs_multi_bio *multi = NULL;
1919
1920 if (multi_ret && !(rw & (1 << BIO_RW))) {
1921 stripes_allocated = 1;
1922 }
1923again:
1924 if (multi_ret) {
1925 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
1926 GFP_NOFS);
1927 if (!multi)
1928 return -ENOMEM;
1929
1930 atomic_set(&multi->error, 0);
1931 }
1932
1933 spin_lock(&em_tree->lock);
1934 em = lookup_extent_mapping(em_tree, logical, *length);
1935 spin_unlock(&em_tree->lock);
1936
1937 if (!em && unplug_page)
1938 return 0;
1939
1940 if (!em) {
1941 printk("unable to find logical %Lu len %Lu\n", logical, *length);
1942 BUG();
1943 }
1944
1945 BUG_ON(em->start > logical || em->start + em->len < logical);
1946 map = (struct map_lookup *)em->bdev;
1947 offset = logical - em->start;
1948
1949 if (mirror_num > map->num_stripes)
1950 mirror_num = 0;
1951
1952 /* if our multi bio struct is too small, back off and try again */
1953 if (rw & (1 << BIO_RW)) {
1954 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
1955 BTRFS_BLOCK_GROUP_DUP)) {
1956 stripes_required = map->num_stripes;
1957 max_errors = 1;
1958 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1959 stripes_required = map->sub_stripes;
1960 max_errors = 1;
1961 }
1962 }
1963 if (multi_ret && rw == WRITE &&
1964 stripes_allocated < stripes_required) {
1965 stripes_allocated = map->num_stripes;
1966 free_extent_map(em);
1967 kfree(multi);
1968 goto again;
1969 }
1970 stripe_nr = offset;
1971 /*
1972 * stripe_nr counts the total number of stripes we have to stride
1973 * to get to this block
1974 */
1975 do_div(stripe_nr, map->stripe_len);
1976
1977 stripe_offset = stripe_nr * map->stripe_len;
1978 BUG_ON(offset < stripe_offset);
1979
1980 /* stripe_offset is the offset of this block in its stripe */
1981 stripe_offset = offset - stripe_offset;
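/*
 * Editor's note (illustrative; not part of the original source): with
 * stripe_len = 64KB and offset = 200KB, stripe_nr = 3 and
 * stripe_offset = 200KB - 3 * 64KB = 8KB.
 */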
1982
1983 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
1984 BTRFS_BLOCK_GROUP_RAID10 |
1985 BTRFS_BLOCK_GROUP_DUP)) {
1986 /* we limit the length of each bio to what fits in a stripe */
1987 *length = min_t(u64, em->len - offset,
1988 map->stripe_len - stripe_offset);
1989 } else {
1990 *length = em->len - offset;
1991 }
1992
1993 if (!multi_ret && !unplug_page)
1994 goto out;
1995
1996 num_stripes = 1;
1997 stripe_index = 0;
1998 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1999 if (unplug_page || (rw & (1 << BIO_RW)))
2000 num_stripes = map->num_stripes;
2001 else if (mirror_num)
2002 stripe_index = mirror_num - 1;
2003 else {
2004 stripe_index = find_live_mirror(map, 0,
2005 map->num_stripes,
2006 current->pid % map->num_stripes);
2007 }
2008
2009 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2010 if (rw & (1 << BIO_RW))
2011 num_stripes = map->num_stripes;
2012 else if (mirror_num)
2013 stripe_index = mirror_num - 1;
2014
2015 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2016 int factor = map->num_stripes / map->sub_stripes;
2017
2018 stripe_index = do_div(stripe_nr, factor);
2019 stripe_index *= map->sub_stripes;
2020
2021 if (unplug_page || (rw & (1 << BIO_RW)))
2022 num_stripes = map->sub_stripes;
2023 else if (mirror_num)
2024 stripe_index += mirror_num - 1;
2025 else {
2026 stripe_index = find_live_mirror(map, stripe_index,
2027 map->sub_stripes, stripe_index +
2028 current->pid % map->sub_stripes);
2029 }
2030 } else {
2031 /*
2032 * after this do_div call, stripe_nr is the number of stripes
2033 * on this device we have to walk to find the data, and
2034 * stripe_index is the number of our device in the stripe array
2035 */
2036 stripe_index = do_div(stripe_nr, map->num_stripes);
2037 }
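/*
 * Editor's note (illustrative; not part of the original source): in the
 * RAID10 case with num_stripes = 4 and sub_stripes = 2, factor = 2; a
 * stripe_nr of 5 becomes stripe_nr = 2 and
 * stripe_index = (5 % 2) * 2 = 2, selecting the second mirror pair.
 */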
2038 BUG_ON(stripe_index >= map->num_stripes);
2039
2040 for (i = 0; i < num_stripes; i++) {
2041 if (unplug_page) {
2042 struct btrfs_device *device;
2043 struct backing_dev_info *bdi;
2044
2045 device = map->stripes[stripe_index].dev;
2046 if (device->bdev) {
2047 bdi = blk_get_backing_dev_info(device->bdev);
2048 if (bdi->unplug_io_fn) {
2049 bdi->unplug_io_fn(bdi, unplug_page);
2050 }
2051 }
2052 } else {
2053 multi->stripes[i].physical =
2054 map->stripes[stripe_index].physical +
2055 stripe_offset + stripe_nr * map->stripe_len;
2056 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2057 }
2058 stripe_index++;
2059 }
2060 if (multi_ret) {
2061 *multi_ret = multi;
2062 multi->num_stripes = num_stripes;
2063 multi->max_errors = max_errors;
2064 }
2065out:
2066 free_extent_map(em);
2067 return 0;
2068}
2069
2070int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2071 u64 logical, u64 *length,
2072 struct btrfs_multi_bio **multi_ret, int mirror_num)
2073{
2074 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2075 mirror_num, NULL);
2076}
2077
2078int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2079 u64 logical, struct page *page)
2080{
2081 u64 length = PAGE_CACHE_SIZE;
2082 return __btrfs_map_block(map_tree, READ, logical, &length,
2083 NULL, 0, page);
2084}
2085
2086
2087static void end_bio_multi_stripe(struct bio *bio, int err)
2088{
2089 struct btrfs_multi_bio *multi = bio->bi_private;
2090 int is_orig_bio = 0;
2091
2092 if (err)
2093 atomic_inc(&multi->error);
2094
2095 if (bio == multi->orig_bio)
2096 is_orig_bio = 1;
2097
2098 if (atomic_dec_and_test(&multi->stripes_pending)) {
2099 if (!is_orig_bio) {
2100 bio_put(bio);
2101 bio = multi->orig_bio;
2102 }
2103 bio->bi_private = multi->private;
2104 bio->bi_end_io = multi->end_io;
2105 /* only send an error to the higher layers if it is
2106 * beyond the tolerance of the multi-bio
2107 */
2108 if (atomic_read(&multi->error) > multi->max_errors) {
2109 err = -EIO;
2110 } else if (err) {
2111 /*
2112 * this bio is actually up to date, we didn't
2113 * go over the max number of errors
2114 */
2115 set_bit(BIO_UPTODATE, &bio->bi_flags);
2116 err = 0;
2117 }
2118 kfree(multi);
2119
2120 bio_endio(bio, err);
2121 } else if (!is_orig_bio) {
2122 bio_put(bio);
2123 }
2124}
2125
2126struct async_sched {
2127 struct bio *bio;
2128 int rw;
2129 struct btrfs_fs_info *info;
2130 struct btrfs_work work;
2131};
2132
2133/*
2134 * see run_scheduled_bios for a description of why bios are collected for
2135 * async submit.
2136 *
2137 * This will add one bio to the pending list for a device and make sure
2138 * the work struct is scheduled.
2139 */
2140static int noinline schedule_bio(struct btrfs_root *root,
2141 struct btrfs_device *device,
2142 int rw, struct bio *bio)
2143{
2144 int should_queue = 1;
2145
2146 /* don't bother with additional async steps for reads, right now */
2147 if (!(rw & (1 << BIO_RW))) {
2148 bio_get(bio);
2149 submit_bio(rw, bio);
2150 bio_put(bio);
2151 return 0;
2152 }
2153
2154 /*
2155 * nr_async_bios allows us to reliably return congestion to the
2156 * higher layers. Otherwise, the async bio makes it appear we have
2157 * made progress against dirty pages when we've really just put it
2158 * on a queue for later
2159 */
2160 atomic_inc(&root->fs_info->nr_async_bios);
2161 WARN_ON(bio->bi_next);
2162 bio->bi_next = NULL;
2163 bio->bi_rw |= rw;
2164
2165 spin_lock(&device->io_lock);
2166
2167 if (device->pending_bio_tail)
2168 device->pending_bio_tail->bi_next = bio;
2169
2170 device->pending_bio_tail = bio;
2171 if (!device->pending_bios)
2172 device->pending_bios = bio;
2173 if (device->running_pending)
2174 should_queue = 0;
2175
2176 spin_unlock(&device->io_lock);
2177
2178 if (should_queue)
2179 btrfs_queue_worker(&root->fs_info->submit_workers,
2180 &device->work);
2181 return 0;
2182}
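/*
 * Editor's note (illustrative; not part of the original source): writes
 * are appended to a per-device singly linked list threaded through
 * bio->bi_next (pending_bios is the head, pending_bio_tail the tail) and
 * drained later by run_scheduled_bios(); the work item is only queued
 * when the device's worker isn't already running.
 */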
2183
2184int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2185 int mirror_num, int async_submit)
2186{
2187 struct btrfs_mapping_tree *map_tree;
2188 struct btrfs_device *dev;
2189 struct bio *first_bio = bio;
2190 u64 logical = (u64)bio->bi_sector << 9;
2191 u64 length = 0;
2192 u64 map_length;
2193 struct btrfs_multi_bio *multi = NULL;
2194 int ret;
2195 int dev_nr = 0;
2196 int total_devs = 1;
2197
2198 length = bio->bi_size;
2199 map_tree = &root->fs_info->mapping_tree;
2200 map_length = length;
2201
2202 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2203 mirror_num);
2204 BUG_ON(ret);
2205
2206 total_devs = multi->num_stripes;
2207 if (map_length < length) {
2208 printk("mapping failed logical %Lu bio len %Lu "
2209 "len %Lu\n", logical, length, map_length);
2210 BUG();
2211 }
2212 multi->end_io = first_bio->bi_end_io;
2213 multi->private = first_bio->bi_private;
2214 multi->orig_bio = first_bio;
2215 atomic_set(&multi->stripes_pending, multi->num_stripes);
2216
2217 while (dev_nr < total_devs) {
2218 if (total_devs > 1) {
2219 if (dev_nr < total_devs - 1) {
2220 bio = bio_clone(first_bio, GFP_NOFS);
2221 BUG_ON(!bio);
2222 } else {
2223 bio = first_bio;
2224 }
2225 bio->bi_private = multi;
2226 bio->bi_end_io = end_bio_multi_stripe;
2227 }
2228 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2229 dev = multi->stripes[dev_nr].dev;
2230 if (dev && dev->bdev) {
2231 bio->bi_bdev = dev->bdev;
2232 if (async_submit)
2233 schedule_bio(root, dev, rw, bio);
2234 else
2235 submit_bio(rw, bio);
2236 } else {
2237 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2238 bio->bi_sector = logical >> 9;
2239 bio_endio(bio, -EIO);
2240 }
2241 dev_nr++;
2242 }
2243 if (total_devs == 1)
2244 kfree(multi);
2245 return 0;
2246}
2247
2248struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2249 u8 *uuid)
2250{
2251 struct list_head *head = &root->fs_info->fs_devices->devices;
2252
2253 return __find_device(head, devid, uuid);
2254}
2255
2256static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2257 u64 devid, u8 *dev_uuid)
2258{
2259 struct btrfs_device *device;
2260 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2261
2262 device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device)
return NULL;
2263 list_add(&device->dev_list,
2264 &fs_devices->devices);
2265 list_add(&device->dev_alloc_list,
2266 &fs_devices->alloc_list);
2267 device->barriers = 1;
2268 device->dev_root = root->fs_info->dev_root;
2269 device->devid = devid;
2270 device->work.func = pending_bios_fn;
2271 fs_devices->num_devices++;
2272 spin_lock_init(&device->io_lock);
2273 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2274 return device;
2275}
2276
2277
2278static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2279 struct extent_buffer *leaf,
2280 struct btrfs_chunk *chunk)
2281{
2282 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2283 struct map_lookup *map;
2284 struct extent_map *em;
2285 u64 logical;
2286 u64 length;
2287 u64 devid;
2288 u8 uuid[BTRFS_UUID_SIZE];
2289 int num_stripes;
2290 int ret;
2291 int i;
2292
2293 logical = key->offset;
2294 length = btrfs_chunk_length(leaf, chunk);
2295
2296 spin_lock(&map_tree->map_tree.lock);
2297 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2298 spin_unlock(&map_tree->map_tree.lock);
2299
2300 /* already mapped? */
2301 if (em && em->start <= logical && em->start + em->len > logical) {
2302 free_extent_map(em);
2303 return 0;
2304 } else if (em) {
2305 free_extent_map(em);
2306 }
2307
2312 em = alloc_extent_map(GFP_NOFS);
2313 if (!em)
2314 return -ENOMEM;
2315 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2316 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2317 if (!map) {
2318 free_extent_map(em);
2319 return -ENOMEM;
2320 }
2321
2322 em->bdev = (struct block_device *)map;
2323 em->start = logical;
2324 em->len = length;
2325 em->block_start = 0;
2326
2327 map->num_stripes = num_stripes;
2328 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2329 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2330 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2331 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2332 map->type = btrfs_chunk_type(leaf, chunk);
2333 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2334 for (i = 0; i < num_stripes; i++) {
2335 map->stripes[i].physical =
2336 btrfs_stripe_offset_nr(leaf, chunk, i);
2337 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2338 read_extent_buffer(leaf, uuid, (unsigned long)
2339 btrfs_stripe_dev_uuid_nr(chunk, i),
2340 BTRFS_UUID_SIZE);
2341 map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
2342
2343 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2344 kfree(map);
2345 free_extent_map(em);
2346 return -EIO;
2347 }
2348 if (!map->stripes[i].dev) {
2349 map->stripes[i].dev =
2350 add_missing_dev(root, devid, uuid);
2351 if (!map->stripes[i].dev) {
2352 kfree(map);
2353 free_extent_map(em);
2354 return -EIO;
2355 }
2356 }
2357 map->stripes[i].dev->in_fs_metadata = 1;
2358 }
2359
2360 spin_lock(&map_tree->map_tree.lock);
2361 ret = add_extent_mapping(&map_tree->map_tree, em);
2362 spin_unlock(&map_tree->map_tree.lock);
2363 BUG_ON(ret);
2364 free_extent_map(em);
2365
2366 return 0;
2367}
2368
2369static int fill_device_from_item(struct extent_buffer *leaf,
2370 struct btrfs_dev_item *dev_item,
2371 struct btrfs_device *device)
2372{
2373 unsigned long ptr;
2374
2375 device->devid = btrfs_device_id(leaf, dev_item);
2376 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2377 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2378 device->type = btrfs_device_type(leaf, dev_item);
2379 device->io_align = btrfs_device_io_align(leaf, dev_item);
2380 device->io_width = btrfs_device_io_width(leaf, dev_item);
2381 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2382
2383 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2384 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2385
2386 return 0;
2387}
2388
2389static int read_one_dev(struct btrfs_root *root,
2390 struct extent_buffer *leaf,
2391 struct btrfs_dev_item *dev_item)
2392{
2393 struct btrfs_device *device;
2394 u64 devid;
2395 int ret;
2396 u8 dev_uuid[BTRFS_UUID_SIZE];
2397
2398 devid = btrfs_device_id(leaf, dev_item);
2399 read_extent_buffer(leaf, dev_uuid,
2400 (unsigned long)btrfs_device_uuid(dev_item),
2401 BTRFS_UUID_SIZE);
2402 device = btrfs_find_device(root, devid, dev_uuid);
2403 if (!device) {
2404 printk("warning devid %Lu missing\n", devid);
2405 device = add_missing_dev(root, devid, dev_uuid);
2406 if (!device)
2407 return -ENOMEM;
2408 }
2409
2410 fill_device_from_item(leaf, dev_item, device);
2411 device->dev_root = root->fs_info->dev_root;
2412 device->in_fs_metadata = 1;
2413 ret = 0;
2414#if 0
2415 ret = btrfs_open_device(device);
2416 if (ret) {
2417 kfree(device);
2418 }
2419#endif
2420 return ret;
2421}
2422
2423int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
2424{
2425 struct btrfs_dev_item *dev_item;
2426
2427 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
2428 dev_item);
2429 return read_one_dev(root, buf, dev_item);
2430}
2431
2432int btrfs_read_sys_array(struct btrfs_root *root)
2433{
2434 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2435 struct extent_buffer *sb;
2436 struct btrfs_disk_key *disk_key;
2437 struct btrfs_chunk *chunk;
2438 u8 *ptr;
2439 unsigned long sb_ptr;
2440 int ret = 0;
2441 u32 num_stripes;
2442 u32 array_size;
2443 u32 len = 0;
2444 u32 cur;
2445 struct btrfs_key key;
2446
2447 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
2448 BTRFS_SUPER_INFO_SIZE);
2449 if (!sb)
2450 return -ENOMEM;
2451 btrfs_set_buffer_uptodate(sb);
2452 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
2453 array_size = btrfs_super_sys_array_size(super_copy);
2454
2455 ptr = super_copy->sys_chunk_array;
2456 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
2457 cur = 0;
2458
2459 while (cur < array_size) {
2460 disk_key = (struct btrfs_disk_key *)ptr;
2461 btrfs_disk_key_to_cpu(&key, disk_key);
2462
2463 len = sizeof(*disk_key);
ptr += len;
2464 sb_ptr += len;
2465 cur += len;
2466
2467 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2468 chunk = (struct btrfs_chunk *)sb_ptr;
2469 ret = read_one_chunk(root, &key, sb, chunk);
2470 if (ret)
2471 break;
2472 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
2473 len = btrfs_chunk_item_size(num_stripes);
2474 } else {
2475 ret = -EIO;
2476 break;
2477 }
2478 ptr += len;
2479 sb_ptr += len;
2480 cur += len;
2481 }
2482 free_extent_buffer(sb);
2483 return ret;
2484}
2485
2486int btrfs_read_chunk_tree(struct btrfs_root *root)
2487{
2488 struct btrfs_path *path;
2489 struct extent_buffer *leaf;
2490 struct btrfs_key key;
2491 struct btrfs_key found_key;
2492 int ret;
2493 int slot;
2494
2495 root = root->fs_info->chunk_root;
2496
2497 path = btrfs_alloc_path();
2498 if (!path)
2499 return -ENOMEM;
2500
2501 /* first we search for all of the device items, and then we
2502 * read in all of the chunk items. This way we can create chunk
2503 * mappings that reference all of the devices that are found
2504 */
2505 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2506 key.offset = 0;
2507 key.type = 0;
2508again:
2509 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2510 while (1) {
2511 leaf = path->nodes[0];
2512 slot = path->slots[0];
2513 if (slot >= btrfs_header_nritems(leaf)) {
2514 ret = btrfs_next_leaf(root, path);
2515 if (ret == 0)
2516 continue;
2517 if (ret < 0)
2518 goto error;
2519 break;
2520 }
2521 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2522 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
2523 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
2524 break;
2525 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
2526 struct btrfs_dev_item *dev_item;
2527 dev_item = btrfs_item_ptr(leaf, slot,
2528 struct btrfs_dev_item);
2529 ret = read_one_dev(root, leaf, dev_item);
2530 BUG_ON(ret);
2531 }
2532 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
2533 struct btrfs_chunk *chunk;
2534 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2535 ret = read_one_chunk(root, &found_key, leaf, chunk);
2536 }
2537 path->slots[0]++;
2538 }
2539 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
2540 key.objectid = 0;
2541 btrfs_release_path(root, path);
2542 goto again;
2543 }
2544
2545 btrfs_free_path(path);
2546 ret = 0;
2547error:
2548 return ret;
2549}
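/*
 * A minimal sketch of the two search keys used by btrfs_read_chunk_tree()
 * above; the helper name is hypothetical.  Pass one lands on the device
 * items, then resetting the objectid to zero walks the tree again from the
 * start, so the chunk items are only read once every device is registered.
 */
static void chunk_tree_pass_keys(struct btrfs_key *pass1,
				 struct btrfs_key *pass2)
{
	pass1->objectid = BTRFS_DEV_ITEMS_OBJECTID;	/* pass 1: devices */
	pass1->type = 0;
	pass1->offset = 0;

	*pass2 = *pass1;
	pass2->objectid = 0;		/* pass 2: rescan, chunk items too */
}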
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..c50e50580b51
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,150 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_root *dev_root;
30 struct buffer_head *pending_io;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int in_fs_metadata;
38
39 spinlock_t io_lock;
40
41 struct block_device *bdev;
42
43 char *name;
44
45 /* the internal btrfs device id */
46 u64 devid;
47
48 /* size of the device */
49 u64 total_bytes;
50
51 /* bytes used */
52 u64 bytes_used;
53
54 /* optimal io alignment for this device */
55 u32 io_align;
56
57 /* optimal io width for this device */
58 u32 io_width;
59
60 /* minimal io size for this device */
61 u32 sector_size;
62
63 /* type and info about this device */
64 u64 type;
65
66 /* physical drive uuid (or lvm uuid) */
67 u8 uuid[BTRFS_UUID_SIZE];
68
69 struct btrfs_work work;
70};
71
72struct btrfs_fs_devices {
73 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
74
75 /* the device with this id has the most recent copy of the super */
76 u64 latest_devid;
77 u64 latest_trans;
78 u64 num_devices;
79 u64 open_devices;
80 struct block_device *latest_bdev;
81 /* all of the devices in the FS */
82 struct list_head devices;
83
84 /* devices not currently being allocated */
85 struct list_head alloc_list;
86 struct list_head list;
87 int mounted;
88};
89
90struct btrfs_bio_stripe {
91 struct btrfs_device *dev;
92 u64 physical;
93};
94
95struct btrfs_multi_bio {
96 atomic_t stripes_pending;
97 bio_end_io_t *end_io;
98 struct bio *orig_bio;
99 void *private;
100 atomic_t error;
101 int max_errors;
102 int num_stripes;
103 struct btrfs_bio_stripe stripes[];
104};
105
106#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
107 (sizeof(struct btrfs_bio_stripe) * (n)))
108
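/*
 * stripes[] is a flexible array member, so a multi-bio must be sized with
 * btrfs_multi_bio_size() when it is allocated.  A minimal sketch, assuming
 * GFP_NOFS allocation context; the helper name is hypothetical:
 */
static inline struct btrfs_multi_bio *alloc_multi_bio(int num_stripes)
{
	struct btrfs_multi_bio *multi;

	multi = kzalloc(btrfs_multi_bio_size(num_stripes), GFP_NOFS);
	if (multi)
		multi->num_stripes = num_stripes;
	return multi;
}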
109int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
110 struct btrfs_device *device,
111 u64 chunk_tree, u64 chunk_objectid,
112 u64 chunk_offset,
113 u64 num_bytes, u64 *start);
114int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
115 u64 logical, u64 *length,
116 struct btrfs_multi_bio **multi_ret, int mirror_num);
117int btrfs_read_sys_array(struct btrfs_root *root);
118int btrfs_read_chunk_tree(struct btrfs_root *root);
119int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
120 struct btrfs_root *extent_root, u64 *start,
121 u64 *num_bytes, u64 type);
122void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
123void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
124int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
125 int mirror_num, int async_submit);
126int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
127int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
128 int flags, void *holder);
129int btrfs_scan_one_device(const char *path, int flags, void *holder,
130 struct btrfs_fs_devices **fs_devices_ret);
131int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
132int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
133int btrfs_add_device(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root,
135 struct btrfs_device *device);
136int btrfs_rm_device(struct btrfs_root *root, char *device_path);
137int btrfs_cleanup_fs_uuids(void);
138int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
139int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
140 u64 logical, struct page *page);
141int btrfs_grow_device(struct btrfs_trans_handle *trans,
142 struct btrfs_device *device, u64 new_size);
143struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
144 u8 *uuid);
145int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
146int btrfs_init_new_device(struct btrfs_root *root, char *path);
147int btrfs_balance(struct btrfs_root *dev_root);
148void btrfs_unlock_volumes(void);
149void btrfs_lock_volumes(void);
150#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..adb4b32a9d51
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,321 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = di ? PTR_ERR(di) : -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
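/*
 * A hypothetical caller showing the size == 0 convention above: the first
 * call only reports the attribute's length, the second copies the value out.
 */
static ssize_t demo_get_xattr(struct inode *inode, const char *name,
			      char **value_ret)
{
	char *value;
	ssize_t len;

	len = __btrfs_getxattr(inode, name, NULL, 0);
	if (len < 0)
		return len;

	value = kmalloc(len, GFP_KERNEL);
	if (!value)
		return -ENOMEM;

	len = __btrfs_getxattr(inode, name, value, len);
	if (len < 0) {
		kfree(value);
		return len;
	}
	*value_ret = value;
	return len;
}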
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first let's see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, let's remove the old copy */
101 if (di) {
102 /* error out if the caller asked for create-only */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * we want all of the xattr items associated with this inode.
163 * NOTE: key.offset is set to 0 so that we start with the first
164 * xattr we find and walk forward from there
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems - 1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 break;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
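/*
 * The listing above packs NUL-terminated names back to back in the caller's
 * buffer ("user.a\0user.b\0...") and returns the total size; with size == 0
 * it only reports the space required.  A hypothetical walker over such a
 * buffer:
 */
static void demo_walk_xattr_names(const char *buffer, ssize_t total)
{
	while (total > 0) {
		size_t len = strlen(buffer) + 1;	/* include the NUL */

		printk(KERN_DEBUG "xattr name: %s\n", buffer);
		buffer += len;
		total -= len;
	}
}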
247/*
248 * List of handlers for synthetic system.* attributes. All real on-disk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This is applied after the check for the synthetic attributes in the
263 * system.* namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
268 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
269 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
270 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
271}
272
273ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
274 void *buffer, size_t size)
275{
276 /*
277 * If this is a request for a synthetic attribute in the system.*
278 * namespace use the generic infrastructure to resolve a handler
279 * for it via sb->s_xattr.
280 */
281 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
282 return generic_getxattr(dentry, name, buffer, size);
283
284 if (!btrfs_is_valid_xattr(name))
285 return -EOPNOTSUPP;
286 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
287}
288
289int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
290 size_t size, int flags)
291{
292 /*
293 * If this is a request for a synthetic attribute in the system.*
294 * namespace use the generic infrastructure to resolve a handler
295 * for it via sb->s_xattr.
296 */
297 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
298 return generic_setxattr(dentry, name, value, size, flags);
299
300 if (!btrfs_is_valid_xattr(name))
301 return -EOPNOTSUPP;
302
303 if (size == 0)
304 value = ""; /* empty EA, do not remove */
305 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
306}
307
308int btrfs_removexattr(struct dentry *dentry, const char *name)
309{
310 /*
311 * If this is a request for a synthetic attribute in the system.*
312 * namespace use the generic infrastructure to resolve a handler
313 * for it via sb->s_xattr.
314 */
315 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
316 return generic_removexattr(dentry, name);
317
318 if (!btrfs_is_valid_xattr(name))
319 return -EOPNOTSUPP;
320 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
321}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */